dbfp_pub/libs/fingerprint_index.py

246 lines
6.9 KiB
Python

#
#
# [ md5_all, md5_list, file_name ]
#
#
import re
import os
import sys
import logging
import sqlite3 as sql
from libs.exceptions import FingerprintIndexWrite
from libs.exceptions import FingerprintIndexOpen
from libs.fingerprint import FingerprintDB
# prefixed with "_" so that it will be listed first and visible
INDEX_FILENAME = '_index_dbfp.db'
class FingerprintIndex:
"""
Class handling an index of fingerprints for effeciently locating a fingerprint
"""
#
def __init__(self):
self.db_conn = None
self.cur = None
return
#
def openIndex(self, fp_dir):
if os.path.isdir(fp_dir):
fq_fpidx = fp_dir + os.path.sep + INDEX_FILENAME
else:
fq_fpidx = fp_dir
try:
if (os.path.isfile(fq_fpidx)):
self.db_conn = sql.connect(fq_fpidx)
logging.info("DB Open SUCCESSFUL")
self.cur = self.db_conn.cursor()
else:
logging.info("No index file found, creating index now...")
self.__createIndex(fp_dir)
try:
self.cur = self.db_conn.cursor()
self.__populateIndex(fp_dir)
except Exception as ex:
raise FingerprintIndexWrite("Error populating index file\n{}".format(ex))
logging.info("Successfully populated the index")
self.db_conn = sql.connect(fq_fpidx)
except Exception as ex:
if self.db_conn:
self.db_conn.close()
self.db_conn = None
logging.error(ex)
raise FingerprintIndexOpen("Error opening/creating an index file\n")
#
def findFP(self, md5_db, md5_tables):
rows = self.__qDatabaseMD5(md5_db)
if len(rows) > 0:
return rows
for md5_table in md5_tables:
retval = {}
rows = self.__qTableMD5(md5_table)
for row in rows:
fp_list = row[0]
fps = fp_list.split(',')
for fp in fps:
retval[fp] = 1
#print "***** __qTableMD5 *****\n{}\n".format(retval.keys())
return retval.keys()
#
def findDB(self, md5_db):
rows = self.__qDatabaseMD5(md5_db)
return rows
#
def __qDatabaseMD5(self, md5_db):
try:
rows = self.cur.execute('''
SELECT md5_list, fp_list, fp_count
FROM md5_all
WHERE md5_db=?
''', [md5_db])
results = []
for row in rows:
results.append((row[0], row[1], row[2]))
return results
except Exception as ex:
logging.error(ex)
#
def __qTableMD5(self, md5_table):
try:
rows = self.cur.execute('''
SELECT fp_list, fp_count
FROM md5_tables
WHERE md5_table=?
''', [md5_table])
return rows
except Exception as ex:
logging.error(ex)
#
def __createIndex(self, fp_dir):
fq_fpidx = fp_dir + os.path.sep + INDEX_FILENAME
try:
self.db_conn = sql.connect(fq_fpidx)
self.db_conn.execute('''
CREATE TABLE md5_all (
md5_db TEXT PRIMARY KEY,
md5_list TEXT,
fp_list TEXT,
fp_count INTEGER);
''')
self.db_conn.execute('''
CREATE TABLE md5_tables (
md5_table TEXT PRIMARY KEY,
fp_list TEXT,
fp_count INTEGER);
''')
logging.info("Successfully created index table")
except Exception as ex:
raise FingerprintIndexWrite("Error creating index file\n{}".format(ex))
#
def __populateIndex(self, fp_dir):
""" read each file, pull md5, add row to database """
finCount = 0
failCount = 0
naCount = 0
try:
db = FingerprintDB()
files = os.listdir(fp_dir)
for file in files:
try:
# only parese files with .json eextension
if not re.search(r'.*\.json', file):
naCount = naCount+1
pass
#print file
fq_file = fp_dir + os.path.sep + file
db.importJson(fq_file)
self.__insertMod_md5_all(db.db_hash, db.table_hashes.values(), file)
self.__insertMod_md5_tables(db.table_hashes.values(), file)
finCount = finCount+1
if ((finCount % 5) == 0):
self.db_conn.commit()
except Exception as ex:
logging.error(ex)
failCount = failCount+1
except Exception as ex:
logging.error(ex)
finally:
self.db_conn.commit()
logging.info("Completed populating the index. Completed: {} Failed: {} NA: {}".format(str(finCount), str(failCount), str(naCount)))
#
def __insertMod_md5_all(self, md5_db, md5_list, filename):
try:
# logging.info("INSERT INTO md5_index VALUES(?, ?, ?): {}; {}; {}".format(md5_all, str(md5_list), filename))
self.db_conn.execute(
'''
INSERT INTO md5_all VALUES(?, ?, ?, ?)
''', [md5_db, ','.join(md5_list), filename, 1])
except sql.IntegrityError:
try:
(fp_list, fp_count) = self.__selectFileList(md5_db)
fp_list += ","+filename
fp_count += 1
# logging.info("fp_list=={}".format(fp_list))
self.db_conn.execute(
'''
UPDATE md5_all SET fp_list=?, fp_count=? WHERE md5_db=?
''', [fp_list, fp_count, md5_db])
except Exception as ex:
raise FingerprintIndexWrite("Error updating a row\n{}".format(ex))
except Exception as ex:
raise FingerprintIndexWrite("Error inserting a row\n{}".format(ex))
#
def __insertMod_md5_tables(self, md5_db, filename):
# insert the md5 of the table schemas
for md5_table in md5_db:
try:
self.db_conn.execute(
'''
INSERT INTO md5_tables VALUES(?, ?, ?)
''', [md5_table, filename, 1])
except sql.IntegrityError:
try:
(fp_list, fp_count) = self.__selectFileList222(md5_table)
fp_list += ","+filename
fp_count += 1
self.db_conn.execute(
'''
UPDATE md5_tables SET fp_list=?, fp_count=? WHERE md5_table=?
''', [fp_list, fp_count, md5_table])
except Exception as ex:
raise FingerprintIndexWrite("Error updating a row\n{}".format(ex))
except Exception as ex:
raise FingerprintIndexWrite("Error inserting a row\n{}".format(ex))
#
def __selectFileList(self, md5_db):
try:
rows = self.cur.execute(
'''
SELECT fp_list, fp_count FROM md5_all WHERE md5_db=?
''', [md5_db])
for row in rows:
return (row[0], row[1])
except Exception as ex:
raise FingerprintIndexWrite("Error selecting fp_list\n{}".format(ex))
#
def __selectFileList222(self, md5_table):
try:
rows = self.cur.execute(
'''
SELECT fp_list, fp_count FROM md5_tables WHERE md5_table=?
''', [md5_table])
for row in rows:
return (row[0], row[1])
except Exception as ex:
raise FingerprintIndexWrite("Error selecting fp_list\n{}".format(ex))
#
def __checkIntegrity(self):
""" Sanity check the number of files against the index rows """
pass
#
def dirCompare(self, folder):
pass
def compareFingerprint(self, fp1, fp2):
pass