#! /usr/bin/env python
"""Scan a directory tree and print one line of metadata per file.

Each output line has the form::

    <md5> <quoted relative name> size=<n> mtime=<Y-m-d:H:M:S> [e_xxx=... ]

The same format can be read back with readLine (), which is used to skip
files that are already known (--cont) or whose checksum is already known
(--inc).  The script can also convert the output of "ls -lR" (--list) and
contains helpers for listing zip and tar archives.

The code avoids version specific syntax so that it runs under Python 2
as well as Python 3.
"""

import sys
import os
import time
from stat import *
import hashlib
import urllib
import tarfile
import zipfile
import logging
import optparse
# import ConfigParser

try:  # Python 2
    from urllib import quote as _url_quote, unquote as _url_unquote
except ImportError:  # Python 3
    from urllib.parse import quote as _url_quote, unquote as _url_unquote

# --------------------------------------------------------------------------

NO_SUM = "-" * 32    # printed when the checksum was intentionally not computed
BAD_SUM = "!" * 32   # printed when the file could not be read

LARGE_FILE_SIZE = 30000000   # files bigger than this are announced in the log
MAX_READ_ATTEMPTS = 3        # number of attempts made by calc_sum () with --retry

# --------------------------------------------------------------------------


class Options:
    """Command line options; instances are filled in by optparse."""

    def __init__(self):
        self.quick = False     # True => skip md5 checksums
        self.inc = False       # True => skip md5 checksums for known files
        self.exif = False      # True => read exif information
        self.dir_list = False  # True => read lslR file
        self.cont = ""         # non-empty => read this file and append into it new data
        self.retry = False     # True => allow retries when reading files for checksums
        self.path = ""         # additional directory with Python modules


# Module level defaults, so that the helper functions below can be used
# (and tested) without running main ().  read_options () and init_log ()
# replace them.
opt = Options()
arg = []
log = logging.getLogger(__name__)

# --------------------------------------------------------------------------


def _emit(*fields):
    """Write the fields separated by single spaces as one line to sys.stdout.

    sys.stdout is looked up on every call because main () may redirect it
    into the --cont file.
    """
    sys.stdout.write(" ".join(fields) + "\n")

# --------------------------------------------------------------------------


def init_imports():
    """Extend sys.path with opt.path and import optional modules.

    pyexiv2 is imported (into the module globals) only when exif data
    were requested, so that the script works without it otherwise.
    """
    if opt.path != "":
        sys.path.append(opt.path)

    if opt.exif:
        global pyexiv2
        import pyexiv2

# --------------------------------------------------------------------------


def init_log(log_file="log.txt"):
    """Configure logging and set the module level logger "log".

    Messages of all levels are appended to log_file, messages of level
    INFO and above are also written to the console (sys.stderr).
    """
    # set up logging to file
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s %(name)s %(levelname)-8s %(message)s",
                        datefmt="%Y-%m-%d %H:%M",
                        filename=log_file,
                        filemode="a")
    # "%(filename)s %(module)s %(funcName)s %(lineno)s %(msecs)s"

    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    console.setFormatter(logging.Formatter("%(name)s: %(levelname)s: %(message)s"))
    # add the handler to the root logger
    logging.getLogger("").addHandler(console)

    global log
    log = logging.getLogger(os.path.basename(sys.argv[0]))

# --------------------------------------------------------------------------


def read_options(args=None):
    """Parse the command line into the globals "opt" and "arg".

    args -- list of arguments to parse; None (default) means sys.argv[1:]

    One or two positional arguments are required (top directory and an
    optional relative start directory); otherwise parser.error () prints
    a message and exits.  Returns the tuple (opt, arg) for convenience.
    """
    parser = optparse.OptionParser()
    parser.add_option("-q", "--quick", dest="quick", action="store_true",
                      help="without md5")
    parser.add_option("-n", "--no-md5", dest="quick", action="store_true",
                      help="without md5 (same as --quick)")
    parser.add_option("-i", "--inc", dest="inc", action="store_true",
                      help="skip md5 calculation for known files")
    parser.add_option("-e", "--exif", dest="exif", action="store_true",
                      help="add exif data")
    parser.add_option("-l", "--list", dest="dir_list", action="store_true",
                      help="read file with ls -lR output")
    parser.add_option("-c", "--cont", dest="cont",
                      help="read this file and append into it new data", default="")
    # boolean flag: its default is False (taken from Options), not ""
    parser.add_option("-r", "--retry", dest="retry", action="store_true",
                      help="allow retries", default=False)
    parser.add_option("-p", "--path", dest="path",
                      help="add into PYTHONPATH", default="")

    global opt
    global arg

    # the values are stored directly into an Options instance, so the
    # defaults come from Options.__init__
    opt = Options()
    (opt, arg) = parser.parse_args(args, values=opt)

    if len(arg) > 2:
        parser.error("too many arguments")
    if len(arg) == 0:
        parser.error("missing argument")

    return (opt, arg)

# --------------------------------------------------------------------------


def conv_time(t):
    """Format the time stamp t (seconds since the epoch) as UTC Y-m-d:H:M:S."""
    return time.strftime("%Y-%m-%d:%H:%M:%S", time.gmtime(t))


def conv_str(s):
    """Percent-quote s so that it contains no white space; ':' and '/' are kept."""
    if not isinstance(s, (str, bytes)):
        # only reachable under Python 2 with a unicode string (e.g. a zip
        # member name); quote () there needs encoded bytes
        s = s.encode("utf-8")
    return _url_quote(s, ":/")


def calculate_sum(file_name):
    """Return the md5 hex digest of the file content, read in 32 KiB chunks.

    Errors raised by open () / read () are propagated; the file is closed
    in any case.
    """
    digest = hashlib.md5()
    with open(file_name, "rb") as f:
        while True:
            chunk = f.read(32 * 1024)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def calc_sum(file_name):
    """Return the md5 hex digest of a file, honouring the --retry option.

    Without --retry read errors are propagated.  With --retry the read is
    attempted up to MAX_READ_ATTEMPTS times, every failure is logged and
    BAD_SUM is returned when all attempts failed.
    """
    if not opt.retry:
        return calculate_sum(file_name)

    for cnt in range(1, MAX_READ_ATTEMPTS + 1):
        try:
            return calculate_sum(file_name)
        except EnvironmentError:  # IOError / OSError
            log.error("error reading %s (cnt=%d)", conv_str(file_name), cnt)
    return BAD_SUM


def scan_info(rel_file_name, size):
    """Log a message before a large file (more than LARGE_FILE_SIZE bytes) is read."""
    if size > LARGE_FILE_SIZE:
        log.info("scanning file %s (size=%s)", conv_str(rel_file_name), size)


def scan_file(top_dir, rel_dir, loc_file_name):
    """Print the output line for one file.

    top_dir       -- root of the scanned tree
    rel_dir       -- directory of the file, relative to top_dir
    loc_file_name -- file name without directory

    Nothing is printed when the file is already listed in known_files
    (--cont), or when --retry is set and os.stat () fails.
    """
    rel_file_name = os.path.join(rel_dir, loc_file_name)
    file_name = os.path.join(top_dir, rel_file_name)

    if opt.cont != "" and rel_file_name in known_files:
        return  # already known file, nothing to do

    try:
        info = os.stat(file_name)
    except EnvironmentError:
        if not opt.retry:
            raise
        log.error("error reading file information %s", conv_str(file_name))
        return

    size_num = info[ST_SIZE]
    size = "size=" + str(size_num)
    m_time = "mtime=" + conv_time(info[ST_MTIME])

    if opt.quick:
        md5 = NO_SUM
    elif S_ISREG(info[ST_MODE]):
        # the cache of known checksums is keyed by lower-case base name and size
        key = (loc_file_name.lower(), size_num)
        if opt.inc and key in known_cache and key not in conflict_cache:
            md5 = NO_SUM  # md5 is already known
        else:
            scan_info(rel_file_name, size_num)
            md5 = calc_sum(file_name)
    else:
        md5 = NO_SUM
        log.warning("strange file %s", conv_str(rel_file_name))

    exif_info = ""
    if opt.exif:
        exif_info = scan_exif(file_name)

    _emit(md5, conv_str(rel_file_name), size, m_time, exif_info)


def scan_dir(top_dir, rel_dir):
    """Recursively scan the directory top_dir/rel_dir in sorted name order.

    NOTE(review): os.path.isdir () follows symbolic links, so links to
    directories are descended into; a cyclic link would recurse until the
    path becomes invalid.
    """
    log.info("scanning %s", conv_str(rel_dir))
    dir_name = os.path.join(top_dir, rel_dir)

    for loc_file_name in sorted(os.listdir(dir_name)):
        rel_file_name = os.path.join(rel_dir, loc_file_name)
        if os.path.isdir(os.path.join(top_dir, rel_file_name)):
            scan_dir(top_dir, rel_file_name)
        else:
            scan_file(top_dir, rel_dir, loc_file_name)

# --------------------------------------------------------------------------


def scan_zip(zip_file_name):
    """Print md5, quoted name, size and mtime for every member of a zip archive.

    Every member is read into memory as a whole to compute its checksum.
    """
    zip_file = zipfile.ZipFile(zip_file_name, mode="r")
    try:
        for info in zip_file.infolist():
            name = info.filename
            size = "size=" + str(info.file_size)
            # same key ("mtime=") as used by scan_file and expected by readLine
            m_time = "mtime=" + ("%4i-%02i-%02i:%02i:%02i:%02i" % tuple(info.date_time))
            md5 = hashlib.md5(zip_file.read(name)).hexdigest()
            _emit(md5, conv_str(name), size, m_time)
    finally:
        zip_file.close()

# --------------------------------------------------------------------------


def scan_tar(tar_file_name):
    """Print md5, quoted name, size, mode, uid, gid and mtime for every
    regular file stored in a tar archive."""
    tar_file = tarfile.open(tar_file_name, mode="r")
    try:
        for info in tar_file.getmembers():
            if not info.isfile():
                continue

            size = "size=" + str(info.size)
            mode = "mode=" + ("%o" % info.mode)
            uid = "uid=" + str(info.uid)
            gid = "gid=" + str(info.gid)
            m_time = "mtime=" + conv_time(info.mtime)

            # placeholder keeps the positional fields aligned when the
            # member content is not available
            md5 = NO_SUM
            f = tar_file.extractfile(info)
            if f:
                try:
                    digest = hashlib.md5()
                    while True:
                        chunk = f.read(16 * 1024)
                        if not chunk:
                            break
                        digest.update(chunk)
                    md5 = digest.hexdigest()
                finally:
                    f.close()

            _emit(md5, conv_str(info.name), size, mode, uid, gid, m_time)
    finally:
        tar_file.close()

# --------------------------------------------------------------------------


def exif_key(name, img, key):
    """Return 'name=<quoted value> ' for the exif key, or '' when img lacks it."""
    txt = ""
    if key in img.exifKeys():
        value = img.interpretedExifValue(key)
        txt = name + "=" + conv_str(value) + " "
    return txt


def scan_exif(file_name):
    """Return the exif fields of a file as 'key=value ' pairs.

    This is best effort: when the file cannot be read as an image (or
    pyexiv2 was not imported) the fields collected so far are returned,
    which usually means an empty string.
    """
    txt = ""
    try:
        img = pyexiv2.Image(file_name)
        img.readMetadata()
        txt += exif_key("e_time", img, 'Exif.Image.DateTime')
        txt += exif_key("e_width", img, 'Exif.CanonPi.ImageWidth')
        txt += exif_key("e_height", img, 'Exif.CanonPi.ImageHeight')
        txt += exif_key("e_model", img, 'Exif.Image.Model')
        txt += exif_key("e_number", img, 'Exif.Canon.ImageNumber')
    except Exception:
        # not bare "except": Ctrl-C and sys.exit () must not be swallowed
        log.debug("no exif data for %s", conv_str(file_name))
    return txt

# --------------------------------------------------------------------------


def scan_lslR(file_name):
    """Convert a file with "ls -R" / "ls -lR" output into name=... lines.

    Lines with a single field are taken as bare file names.  Long lines
    are expected to have 7 fields in front of the name (mode, link count,
    uid, gid, size, date, time); lines with 2 to 7 fields are skipped.
    A trailing '/' or '*' is removed from the name.
    """
    with open(file_name, "r") as listing:
        directory = ""
        for line in listing:
            line = line.strip()
            # formerly an unconditional debug print into the data output
            log.debug("line %s", line)

            if line == "":
                continue

            if line.endswith(":"):
                # header of a directory section
                directory = line[0:-1]
                if directory == ".":
                    directory = ""
                elif directory.startswith("./"):
                    directory = directory[2:]
                continue

            if line.startswith("total "):
                continue

            # at most 7 splits: the 8th item is the whole name, with the
            # white space inside the name preserved
            items = line.split(None, 7)
            item_count = len(items)
            if item_count != 1 and item_count != 8:
                continue

            name = items[-1]
            if name.endswith('/') or name.endswith('*'):
                name = name[0:-1]
            if name == "":
                continue

            name = "name=" + conv_str(os.path.join(directory, name))

            if item_count == 1:
                _emit(name)
            else:
                mode = "mode=" + items[0]
                # items [1] ... link count
                uid = "uid=" + items[2]
                gid = "gid=" + items[3]
                size = "size=" + items[4]
                m_time = "mtime=" + items[5] + ':' + items[6]
                _emit(name, size, mode, uid, gid, m_time)

# --------------------------------------------------------------------------


def decode_str(s):
    """Inverse of conv_str (): replace %xx escapes by the original characters."""
    return _url_unquote(s)


def decode_num(s):
    """Convert a decimal string to int; the empty string gives 0."""
    if len(s) == 0:
        return 0
    return int(s)


def decode_time(s):
    """Time stamps are kept in their textual form."""
    return s


def _identity(s):
    """Decoder for fields stored without conversion."""
    return s


def _decode_quoted_time(s):
    """Decoder for time stamps that were percent-quoted (e_time)."""
    return decode_time(decode_str(s))


Normal = 0
Old = 1
New = 2
Modified = 3
Equal = 4
Duplicated = 5


class FileInfo:
    """Data of one line of a scan file."""

    def __init__(self):
        self.kind = Normal
        self.name = ""
        self.size = 0
        self.mtime = ""
        self.md5 = ""
        self.mode = Normal  # replaced by the text after "mode=" when present

        # set by readLine () only when the line contains them
        self.uid = 0
        self.gid = 0
        self.atime = ""
        self.ctime = ""

        self.e_model = ""
        self.e_number = ""
        self.e_width = 0
        self.e_height = 0
        self.e_time = ""


# "key=value" items recognised by readLine: key -> (attribute, decoder)
_KEYED_FIELDS = {
    "md5": ("md5", _identity),
    "name": ("name", decode_str),
    "size": ("size", decode_num),
    "mode": ("mode", _identity),
    "uid": ("uid", decode_num),
    "gid": ("gid", decode_num),
    "atime": ("atime", decode_time),
    "ctime": ("ctime", decode_time),
    "mtime": ("mtime", decode_time),
    "e_model": ("e_model", decode_str),
    "e_number": ("e_number", decode_str),
    "e_width": ("e_width", decode_num),
    "e_height": ("e_height", decode_num),
    "e_time": ("e_time", _decode_quoted_time),
}

# meaning of the items without a recognised key, in order of appearance
_POSITIONAL_FIELDS = (
    ("md5", _identity),
    ("name", decode_str),
    ("size", decode_num),
    ("mtime", decode_time),
)


def readLine(line):
    "Read one line from scan-file, return FileInfo"
    result = FileInfo()
    position = 0  # number of positional items consumed so far

    for item in line.split():
        key, sep, value = item.partition("=")
        if sep and key in _KEYED_FIELDS:
            attr, decode = _KEYED_FIELDS[key]
            setattr(result, attr, decode(value))
        elif position < len(_POSITIONAL_FIELDS):
            attr, decode = _POSITIONAL_FIELDS[position]
            setattr(result, attr, decode(item))
            position = position + 1
        # further unkeyed items are ignored

    return result


def _has_real_sum(data):
    """True when data has a name and a checksum that is not a placeholder."""
    return data.name != "" and data.md5 != "" and data.md5[0] not in "-!"

# --------------------------------------------------------------------------

known_cache = {}      # (lower-case base name, size) -> FileInfo
conflict_cache = {}   # keys seen more than once; not trusted by scan_file


def readIncFile(fileName):
    """Add the entries of a scan file to known_cache.

    Only lines with a name and a real checksum are used.  A key that is
    already present is recorded in conflict_cache instead.
    """
    with open(fileName, "r") as scan_file_obj:
        for line in scan_file_obj:
            data = readLine(line)
            if not _has_real_sum(data):
                continue
            key = (os.path.basename(data.name).lower(), data.size)
            if key not in known_cache:
                known_cache[key] = data
            else:
                conflict_cache[key] = True


# scan files read by readInc () when no explicit list is given
DEFAULT_INC_FILES = (
    "data9/pentium4-diskf-2010-06-18.txt",
    "data9/pentium4-diskg-2010-06-18.txt",
    # "data9/wd-foto-2010-06-27.txt",
    # "data9/wd-fotoarchiv-2010-06-27.txt",
    # "data9/wd-video-2010-06-27.txt",
    # "data9/wd-doplnky-2010-06-27.txt",
    # "data11/wd-foto-2011-04-17.txt",
    # "data11/wd-fotoarchiv-2011-04-17.txt",
    # "data11/wd-video-2011-04-17.txt",
    # "data11/wd-doplnky-2011-04-17.txt",
    # "data11/wd-new-2011-04-17.txt",
)


def readInc(file_names=None):
    """Load the scan files with known checksums (used by --inc).

    file_names -- iterable of scan file names; None (default) means
                  DEFAULT_INC_FILES
    """
    if file_names is None:
        file_names = DEFAULT_INC_FILES
    for file_name in file_names:
        readIncFile(file_name)

# --------------------------------------------------------------------------

known_files = {}   # decoded file name -> FileInfo


def readContFile(fileName):
    """Add the entries of a scan file to known_files (used by --cont).

    Only lines with a name and a real checksum are used, so that files
    without a checksum are scanned again.
    """
    with open(fileName, "r") as scan_file_obj:
        for line in scan_file_obj:
            data = readLine(line)
            if _has_real_sum(data):
                known_files[data.name] = data

# --------------------------------------------------------------------------


def main():
    """Entry point: parse options, load known data and run the scan."""
    init_log()
    read_options()
    init_imports()

    if opt.inc:
        readInc()

    out = None
    if opt.cont != "":
        readContFile(opt.cont)
        out = open(opt.cont, "a")  # append output to this file

    saved_stdout = sys.stdout
    try:
        if out is not None:
            sys.stdout = out

        top_dir = os.path.expanduser(arg[0])
        top_rel = ""
        if len(arg) > 1:
            top_rel = arg[1]

        if opt.dir_list:
            scan_lslR(top_dir)
        else:
            scan_dir(top_dir, top_rel)
    finally:
        # restore stdout and make sure the appended data are written out
        sys.stdout = saved_stdout
        if out is not None:
            out.close()


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------

# scan_zip ("/mnt/store/Transfer/disk_n/Work.zip")
# scan_tar ("/mnt/store/product/packages/gtk-2.8.16-mandrake-10.2.tgz")

# --------------------------------------------------------------------------