dbfp_pub/libs/fingerprint_index.py

184 lines
5.3 KiB
Python

#
#
# [ md5_all, md5_list, file_name ]
#
#
import os
import sys
import logging
import sqlite3 as sql
from libs.exceptions import FingerprintIndexWrite
from libs.exceptions import FingerprintIndexOpen
from libs.fingerprint import FingerprintDB
# Name of the SQLite index file that is stored inside the fingerprint directory.
# Prefixed with "_" so that it will be listed first and visible.
INDEX_FILENAME = '_index_dpfp.db'
class FingerprintIndex:
    """
    Class handling an index of fingerprints for efficiently locating a fingerprint.

    The index is an SQLite database file (INDEX_FILENAME) stored inside the
    fingerprint directory. It contains two tables:

    md5_all    -- one row per database hash (md5_db), with the comma-joined
                  table hashes (md5_list), the comma-joined names of the
                  fingerprint files sharing that hash (fp_list) and their
                  number (fp_count).
    md5_tables -- one row per table hash (md5_table), with the comma-joined
                  names of the fingerprint files containing it (fp_list) and
                  their number (fp_count).
    """

    # number of successfully indexed files between intermediate commits
    _COMMIT_INTERVAL = 5

    #
    def __init__(self):
        self.db_conn = None     # open sqlite3 connection, or None when closed
        self.cur = None         # cursor used for the SELECT lookups while populating

    #
    def __close(self):
        """ Close the connection (if any) and drop the now unusable cursor """
        if self.db_conn:
            self.db_conn.close()
        self.db_conn = None
        self.cur = None

    #
    def openIndex(self, fp_dir):
        """
        Open the index file located in fp_dir, creating and populating it
        first when it does not exist yet.

        NOTE(review): the connection is always closed again before this
        method returns, so it effectively only verifies that the index can be
        opened (or builds it). Confirm whether callers expect an open handle.

        Raises FingerprintIndexOpen if the index cannot be opened or created.
        """
        fq_fpidx = os.path.join(fp_dir, INDEX_FILENAME)
        try:
            if os.path.isfile(fq_fpidx):
                self.db_conn = sql.connect(fq_fpidx)
                logging.info("DB Open SUCCESSFUL")
            else:
                logging.info("No index file found, creating index now...")
                self.createIndex(fp_dir)
        except Exception as e:
            # "Exception" (not a bare except) so Ctrl-C / SystemExit still propagate
            raise FingerprintIndexOpen("Error opening/creating an index file\n{}".format(e))
        finally:
            self.__close()

    #
    def createIndex(self, fp_dir):
        """
        Create the index file in fp_dir, create both index tables and fill
        them from the files found in fp_dir. The connection is closed again
        before returning.

        Raises FingerprintIndexWrite if the file or the tables cannot be
        created (e.g. the tables already exist).
        """
        fq_fpidx = os.path.join(fp_dir, INDEX_FILENAME)
        try:
            self.db_conn = sql.connect(fq_fpidx)
            self.db_conn.execute('''
                CREATE TABLE md5_all (
                    md5_db TEXT PRIMARY KEY,
                    md5_list TEXT,
                    fp_list TEXT,
                    fp_count INTEGER);
                ''')
            self.db_conn.execute('''
                CREATE TABLE md5_tables (
                    md5_table TEXT PRIMARY KEY,
                    fp_list TEXT,
                    fp_count INTEGER);
                ''')
            logging.info("Successfully created index table")
            self.__populateIndex(fp_dir)
            logging.info("Successfully populated the index")
        except Exception as e:
            raise FingerprintIndexWrite("Error creating an index file\n{}".format(e))
        finally:
            self.__close()

    #
    def __populateIndex(self, fp_dir):
        """
        Read each file in fp_dir, pull its md5 hashes and add/update the
        corresponding rows in the database.

        This is best effort: a file that cannot be imported or indexed is
        logged and counted as failed, it does not abort the run.
        """
        failCount = 0
        finCount = 0
        try:
            self.cur = self.db_conn.cursor()
            db = FingerprintDB()
            for filename in os.listdir(fp_dir):
                # the index database (and any SQLite side file such as
                # "<index>-journal") lives in the same directory but is not
                # a fingerprint, so it must not be imported
                if filename.startswith(INDEX_FILENAME):
                    continue
                try:
                    fq_file = os.path.join(fp_dir, filename)
                    db.importJson(fq_file)
                    self.__insertMod_md5_all(db.db_hash, db.table_hashes.values(), filename)
                    self.__insertMod_md5_tables(db.table_hashes.values(), filename)
                    finCount = finCount + 1
                    # commit periodically instead of once per file
                    if (finCount % self._COMMIT_INTERVAL) == 0:
                        self.db_conn.commit()
                except Exception as e:
                    logging.error(e)
                    failCount = failCount + 1
        except Exception as e:
            # e.g. the directory cannot be listed; keep the best-effort
            # behaviour but do not hide the reason
            logging.error(e)
        finally:
            # flush whatever has been inserted since the last periodic commit
            self.db_conn.commit()
            logging.info("Completed populating the index. Completed: {} Failed: {} ".format(str(finCount), str(failCount)))

    #
    def __insertMod_md5_all(self, md5_db, md5_list, filename):
        """
        Insert a row for md5_db into md5_all, or, when that hash is already
        present, append filename to its fp_list and increment fp_count.

        md5_db   -- hash used as primary key
        md5_list -- iterable of strings, stored joined with ","
        filename -- name of the fingerprint file the hashes came from

        Raises FingerprintIndexWrite if the row can neither be inserted nor
        updated.
        """
        try:
            self.db_conn.execute(
                '''
                INSERT INTO md5_all VALUES(?, ?, ?, ?)
                ''', [md5_db, ','.join(md5_list), filename, 1])
        except sql.IntegrityError:
            # primary key already exists: extend the existing row instead
            try:
                (fp_list, fp_count) = self.__selectFileList(md5_db)
                fp_list += "," + filename
                fp_count += 1
                self.db_conn.execute(
                    '''
                    UPDATE md5_all SET fp_list=?, fp_count=? WHERE md5_db=?
                    ''', [fp_list, fp_count, md5_db])
            except Exception as e:
                raise FingerprintIndexWrite("Error updating a row\n{}".format(e))
        except Exception as e:
            raise FingerprintIndexWrite("Error inserting a row\n{}".format(e))

    #
    def __insertMod_md5_tables(self, md5_db, filename):
        """
        Insert the md5 of each table schema into md5_tables, or, when a hash
        is already present, append filename to its fp_list and increment
        fp_count.

        md5_db   -- iterable of table hashes (one row per element)
        filename -- name of the fingerprint file the hashes came from

        Raises FingerprintIndexWrite if a row can neither be inserted nor
        updated.
        """
        for md5_table in md5_db:
            try:
                self.db_conn.execute(
                    '''
                    INSERT INTO md5_tables VALUES(?, ?, ?)
                    ''', [md5_table, filename, 1])
            except sql.IntegrityError:
                # primary key already exists: extend the existing row instead
                try:
                    (fp_list, fp_count) = self.__selectFileList222(md5_table)
                    fp_list += "," + filename
                    fp_count += 1
                    self.db_conn.execute(
                        '''
                        UPDATE md5_tables SET fp_list=?, fp_count=? WHERE md5_table=?
                        ''', [fp_list, fp_count, md5_table])
                except Exception as e:
                    raise FingerprintIndexWrite("Error updating a row\n{}".format(e))
            except Exception as e:
                raise FingerprintIndexWrite("Error inserting a row\n{}".format(e))

    #
    def __selectFileList(self, md5_db):
        """
        Return (fp_list, fp_count) of the md5_all row keyed by md5_db, or
        None when there is no such row.

        Raises FingerprintIndexWrite if the query fails.
        """
        try:
            row = self.cur.execute(
                '''
                SELECT fp_list, fp_count FROM md5_all WHERE md5_db=?
                ''', [md5_db]).fetchone()
        except Exception as e:
            raise FingerprintIndexWrite("Error selecting fp_list\n{}".format(e))
        if row is None:
            return None
        return (row[0], row[1])

    #
    def __selectFileList222(self, md5_table):
        """
        Return (fp_list, fp_count) of the md5_tables row keyed by md5_table,
        or None when there is no such row.

        Raises FingerprintIndexWrite if the query fails.
        """
        try:
            row = self.cur.execute(
                '''
                SELECT fp_list, fp_count FROM md5_tables WHERE md5_table=?
                ''', [md5_table]).fetchone()
        except Exception as e:
            raise FingerprintIndexWrite("Error selecting fp_list\n{}".format(e))
        if row is None:
            return None
        return (row[0], row[1])

    #
    def __checkIntegrity(self):
        """ Sanity check the number of files against the index rows (not implemented yet) """
        pass

    #
    def dirCompare(self, folder):
        """ Not implemented yet; currently does nothing """
        pass

    #
    def compareFingerprint(self, fp1, fp2):
        """ Not implemented yet; currently does nothing """
        pass