#!/usr/bin/env python
#
# ~/fsdb/fsdb ---
#

import argparse
import hashlib
import itertools
import math
import os
import os.path
import pdb
import pprint
import re
import subprocess
import sys

import sqlalchemy as sa
import sqlalchemy.ext.declarative as sa_e_d
import sqlalchemy.orm as sa_orm
import sqlalchemy.sql.expression as sa_s_e
import sqlalchemy.sql.expression
import sqlalchemy.ext.compiler

##########

# Module-wide flags.  main() overwrites DEBUG / VERBOSE when the matching
# command-line option is truthy; otherwise they stay None.
DEBUG=None
VERBOSE=None
# Revision string; the "$Revision: ... $" form looks like an RCS/CVS keyword.
VERSION='$Revision: 1.14 $'

#####

# Declarative base class that the ORM model(s) in this file derive from.
SA_Base=sa_e_d.declarative_base()
# Engine, session factory and live session.  All start unset; db_open()
# builds the engine, a sessionmaker bound to it, and a session.
SA_engine=None
SA_Session=None
SA_session=None

#####

class FsItem(SA_Base):
  """ORM row of the 'fsitem' table: one recorded file.

  Columns are the integer primary key, the file's path, its size and its
  md5 digest string.
  """
  __tablename__='fsitem'

  # integer primary key
  item_id=sa.Column(sa.Integer,primary_key=True)
  # path of the file
  path=sa.Column(sa.String)
  # size of the file
  file_size=sa.Column(sa.Integer)
  # md5 digest string of the file's content
  md5=sa.Column(sa.String)

  def __str__(self):
    """Render as "<ClassName:item_id:md5:path>"."""
    fields=(self.__class__.__name__,self.item_id,self.md5,self.path)
    return "<%s:%s:%s:%s>"%fields

  def md5_equals_cnt(self):
    """Return how many fsitem rows carry the same md5 as this one."""
    same_md5=SA_session.query(FsItem).filter(FsItem.md5==self.md5)
    return same_md5.count()

  def exists(self):
    """Return whether self.path currently exists on the filesystem."""
    present=os.path.exists(self.path)
    return present

# Secondary indexes on the fsitem table: one on path alone, and a composite
# one on (md5, path).  NOTE(review): presumably these attach to the table's
# metadata and get created together with it by db_init() -- confirm.
sa.Index('fsitem__path',FsItem.path)
sa.Index('fsitem__md5_path',FsItem.md5,FsItem.path)

#####

def db_get_path(path):
  """Pick the database file path.

  Precedence: the given path if non-empty, else the FSDB_DB environment
  variable if non-empty, else ~/.fsdb.db with the home directory expanded.
  """
  # each fallback is evaluated only when everything before it was empty
  return (path
          or os.environ.get("FSDB_DB")
          or os.path.expanduser("~/.fsdb.db"))

def db_open(db_path,echo=False):
  """Open the sqlite database at db_path, creating the schema if needed.

  db_path -- filesystem path of the sqlite file.
  echo    -- handed to create_engine() as its echo argument.

  Sets the module globals SA_engine, SA_Session (the session factory) and
  SA_session (a session made from it).  When db_path was not an existing
  file on entry, db_init() is called to create the schema.
  """
  # The original declaration named SA_session twice and left SA_Session
  # out, so the factory was bound to a local and the global stayed None.
  global SA_engine,SA_Session,SA_session
  #
  # sampled up front, before anything below gets a chance to create the file
  file_exists=os.path.isfile(db_path)
  #
  url="sqlite:///"+db_path
  #
  SA_engine=sa.create_engine(
    url,
    echo=echo,
    convert_unicode=True,
  )
  SA_Session=sa_orm.sessionmaker(bind=SA_engine)
  SA_session=SA_Session()
  #
  if not file_exists:
    db_init()

def db_init():
  """Create the schema declared on SA_Base's metadata, using SA_engine."""
  metadata=SA_Base.metadata
  metadata.create_all(SA_engine)

#####

def md5_of_path(path):
  """Return the md5 hex digest of the content of the file at path.

  The file is read as raw bytes in 8192-byte chunks.  A progress line
  "# md5: <digest> <path>" is printed on stdout.

  Raises IOError/OSError when the file cannot be opened or read.
  """
  h=hashlib.md5()
  # "rb": hash the bytes exactly as stored (text mode may translate or
  # decode them).  The with-block closes the handle, which the previous
  # version never did.
  with open(path,"rb") as fh:
    # iter(callable, sentinel) stops at the empty read that marks EOF
    for data in iter(lambda: fh.read(8192),b""):
      h.update(data)
  rv=h.hexdigest()
  # single-argument form: same output as the print statement, and it also
  # parses under Python 3
  print("# md5: %s %s"%(rv,path))
  return rv

def db_add_file(path,force=False):
  """Record, or refresh, the fsitem row for one regular file.

  path  -- file to record; it is made absolute before being stored.
  force -- when true, every existing row for the path is replaced by a
           freshly hashed one, even if the size did not change.

  Without force, the oldest existing row (lowest item_id) is reused: if its
  recorded size equals the current size nothing is changed, otherwise its
  size and md5 are updated.  Any further rows for the same path are deleted.

  Returns False when path is not a regular file, True otherwise.
  Errors from reading the file or from the database propagate.
  """
  path=os.path.abspath(path)
  #
  if not os.path.isfile(path):
    return False
  #
  file_size=os.path.getsize(path)
  # existing rows for this path, oldest first
  q=SA_session.query(FsItem)
  q=q.filter(FsItem.path==path)
  q=q.order_by(FsItem.item_id)
  fsitems=q.all()
  #
  fsitem=None
  if not force and fsitems:
    fsitem=fsitems.pop(0)
    if fsitem.file_size==file_size:
      # no change in size, dont update the md5
      return True
  # Hash before touching the session: if the file cannot be read, the
  # exception leaves no pending deletes and no half-filled new row behind
  # for a later commit to pick up.
  md5=md5_of_path(path)
  # whatever is still in fsitems is a redundant row for this path
  for stale in fsitems:
    SA_session.delete(stale)
  #
  if fsitem is None:
    fsitem=FsItem()
    SA_session.add(fsitem)
  #
  fsitem.path=path
  fsitem.file_size=file_size
  fsitem.md5=md5
  #
  SA_session.commit()
  #
  return True

#####

def find_dups_query():
  """Return a query for the FsItem rows whose md5 occurs more than once.

  The statement joins fsitem against a per-md5 row count and keeps the rows
  whose count is greater than one.
  """
  # Wrap the SQL in text(): from_statement() wants an explicit textual
  # construct; newer SQLAlchemy releases no longer accept a plain string.
  stmt=sa.text(
    """select fsi1.*
    from fsitem fsi1
    inner join (
    select fsi2.md5, count(*) as itemcnt
    from fsitem fsi2
    group by fsi2.md5
    ) as cnt on cnt.md5 = fsi1.md5
    where 1 < cnt.itemcnt
""")
  return SA_session.query(FsItem).from_statement(stmt)

#####

def _like_escape(s,esc):
  """Return s with esc, '%' and '_' each prefixed by esc, for use in LIKE."""
  # esc itself must go first, or the escapes added for % and _ get doubled
  for ch in (esc,"%","_"):
    s=s.replace(ch,esc+ch)
  return s

def gen_prefix_filter(prefixes=None):
  """Build a filter matching FsItem rows stored below any of prefixes.

  prefixes -- iterable of directory paths, or None/empty for "no filter".

  Each prefix is made absolute and turned into a "<prefix>/%" LIKE pattern;
  the patterns are OR-ed together.  With no prefixes an empty or_() is
  returned.
  """
  # sqlalchemy will optimize out a blank or.
  if not prefixes:
    return sa.or_()
  #
  esc="\\"
  clauses=[]
  for p in sorted(os.path.abspath(p) for p in prefixes):
    # Escape LIKE metacharacters so that "_" or "%" in a directory name only
    # matches itself instead of acting as a wildcard.  The rstrip makes the
    # root directory yield "/%" instead of "//%", which matched nothing.
    pattern=_like_escape(p.rstrip("/"),esc)+"/%"
    clauses.append(FsItem.path.like(pattern,escape=esc))
  #
  return sa.or_(*clauses)

def find_prefix_query(prefixes=None):
  """Return a query of FsItem rows under prefixes, ordered by path, md5.

  prefixes is passed to gen_prefix_filter() unchanged.
  """
  return (SA_session.query(FsItem)
          .filter(gen_prefix_filter(prefixes))
          .order_by(FsItem.path,FsItem.md5))

def find_md5_query(prefixes=None):
  """Return a query of the distinct md5 values under prefixes, sorted.

  Each result row holds a single md5 value.  prefixes is passed to
  gen_prefix_filter() unchanged.
  """
  return (SA_session.query(sa.distinct(FsItem.md5))
          .filter(gen_prefix_filter(prefixes))
          .order_by(FsItem.md5))

def find_md5_groups(prefixes=None):
  """Yield one list of FsItem rows per md5 found under prefixes.

  The md5 values come from find_md5_query(prefixes).  For each of them, all
  rows with that md5 are fetched in path order -- the per-group query is
  filtered on md5 only, not on prefixes -- and the rows whose file no longer
  exists are dropped.  Groups left empty are not yielded.
  """
  for row in find_md5_query(prefixes=prefixes):
    # every result row carries exactly one value, the md5
    (digest,)=row
    same_md5=(SA_session.query(FsItem)
              .filter(FsItem.md5==digest)
              .order_by(FsItem.path))
    #
    present=[fsitem for fsitem in same_md5.all() if fsitem.exists()]
    if present:
      yield present

def print_md5_groups(grp_iter):
  for grp in grp_iter:
    print grp[0].md5
    for item in grp:
      print "   ",item.path

def re_comp(re_lst):
  """Compile each pattern string of re_lst; None gives an empty list."""
  if re_lst is not None:
    return [re.compile(pattern) for pattern in re_lst]
  return []

def process_del_md5_groups(grp_iter,
                           verbose=True,
                           keep_regexs=None,
                           test=False):
  """Print a keep/remove plan for each group of same-md5 items.

  grp_iter    -- iterable of sequences of items having .md5, .path and
                 .exists().
  verbose     -- also print the group's md5 and the kept paths, as "#" lines.
  keep_regexs -- pattern strings; an item whose path matches any of them
                 (re.search) is kept.  None means no patterns.
  test        -- accepted for the caller's sake; unused, since this function
                 only prints and never deletes anything.

  Items whose file no longer exists are ignored.  If no item of a group
  matched a keep pattern, the group's first existing item is kept.  Every
  other existing item is printed as an "rm" line.
  """
  keep_res=[re.compile(s) for s in (keep_regexs or [])]
  #
  for grp in grp_iter:
    if not grp:
      # nothing to report, and grp[0] below would raise IndexError
      continue
    #
    items_keep=[]
    items_del=[]
    for item in grp:
      if not item.exists():
        continue
      if any(k_re.search(item.path) for k_re in keep_res):
        items_keep.append(item)
      else:
        # not matched, and a duplicate
        items_del.append(item)
    # Keep the first item if nothing matched.  This used to be pop(), which
    # kept the *last* one, contradicting the stated intent.
    if not items_keep and items_del:
      items_keep.append(items_del.pop(0))
    #
    # single-argument print: same output as the print statements it
    # replaces, and it also parses under Python 3
    if verbose:
      print("# %s"%(grp[0].md5,))
      for item in items_keep:
        print("#    %s"%(item.path,))
    for item in items_del:
      print("rm   %s"%(item.path,))

#####

def cmd_add(path,force=False,recursive=False):
  """Add a file, or the files below a directory, to the database.

  path      -- a regular file or a directory.
  force     -- passed through to db_add_file().
  recursive -- for a directory, walk it and add every file found; without
               it a directory is accepted but nothing is added.

  Returns db_add_file()'s result for a regular file, True for a directory,
  and False for anything else.  During a recursive walk a file that fails
  with a database ProgrammingError or an I/O error is reported and skipped.
  """
  if os.path.isfile(path):
    return db_add_file(path,force=force)
  if not os.path.isdir(path):
    return False
  #
  if recursive:
    for (top,dirnames,filenames) in os.walk(path):
      # sorting dirnames in place also fixes the order os.walk descends in
      dirnames.sort()
      filenames.sort()
      for f in filenames:
        p=os.path.join(top,f)
        try:
          db_add_file(p,force=force)
        except (sqlalchemy.exc.ProgrammingError,EnvironmentError):
          # EnvironmentError (IOError/OSError): an unreadable or vanished
          # file should not abort the rest of the walk.
          print("### add_file error: %s"%(p,))
          # Discard whatever the failed add left pending; without this a
          # session whose flush failed refuses further work.
          SA_session.rollback()
  return True

def print_fsitems(obj):
  for fsitem in obj:
    print fsitem.md5,fsitem.md5_equals_cnt(),fsitem.path

#####

def main(raw_args):
  """Parse raw_args and run the selected fsdb actions.

  raw_args -- command-line arguments, without the program name.

  Opens the database chosen by db_get_path(--db), (re)creates the schema
  when --db-init is given, runs --add over the positional paths if
  requested, and then runs the first selected report among --print-all,
  --print-dups, --print-existing, --print-md5-groups and
  --print-missing/--show-missing, with the positional paths as prefixes.

  Returns whatever the selected report returns (the reports defined in this
  file return nothing), or None when no report was selected.
  """
  global DEBUG,VERBOSE
  #
  parser=argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
  parser.description="""
DESCRIPTION:

"""
  parser.epilog="""
EXAMPLES:

"""
  #
  g = parser.add_argument_group('general')
  g.add_argument("--debug","-d",
                 action="store_true",
                 help="turn on the debugging flag.")
  g.add_argument("--pdb",
                 action="store_true",
                 help="enter the debugger after reading args.")
  g.add_argument("--verbose","-v",
                 default=True,
                 action="store_true",
                 help="be more verbose.")
  # help text used to repeat "be more verbose." although this turns it off
  g.add_argument("--quiet","-q","--no-verbose",
                 dest="verbose",
                 action="store_false",
                 help="be less verbose.")
  #
  g.add_argument("--not-really","--test",
                 action="store_true",
                 help="Dont actually preform any actions.")
  #
  g.add_argument("--db",
                 help="path to the db.")
  # nargs="?" lets the option be used as a bare flag while still accepting
  # the single value it used to require.
  g.add_argument("--db-init",
                 nargs="?",
                 const=True,
                 help="init the database.")
  g.add_argument("--db-echo",
                 action="store_true",
                 help="echo the db commands.")
  #
  g.add_argument("--add",
                 action="store_true",
                 help="")
  # NOTE(review): --del-missing is parsed but nothing acts on it yet.
  g.add_argument("--del-missing",
                 action="store_true",
                 help="delete rows from the db not in the fs.")
  g.add_argument("--show-missing",
                 action="store_true",
                 help="show items in the db not in the fs.")

  #
  g.add_argument("--force","-f",
                 action="store_true",
                 help="Force some operations.")
  g.add_argument("--recursive","-R",
                 action="store_true",
                 help="")
  #
  g.add_argument("--print-all",
                 action="store_true",
                 help="")
  g.add_argument("--print-dups",
                 action="store_true",
                 help="")
  g.add_argument("--print-existing",
                 action="store_true",
                 help="")
  g.add_argument("--print-md5-groups",
                 action="store_true",
                 help="")
  g.add_argument("--print-missing",
                 action="store_true",
                 help="")
  #
  g.add_argument("--keep-re","-K",
                 action="append",
                 help="")
  #
  g.add_argument("args",nargs="*",
                 help="the remaining args.")
  #
  args=parser.parse_args(raw_args)
  #
  if args.pdb:
    pdb.set_trace()
  if args.debug:
    DEBUG=args.debug
  if args.verbose:
    VERBOSE=args.verbose

  #
  db_path=db_get_path(args.db)
  db_open(db_path,args.db_echo)
  # --db-init used to be parsed and then ignored.
  if args.db_init is not None:
    db_init()

  #
  if args.add:
    for a in args.args:
      cmd_add(a,recursive=args.recursive,force=args.force)

  #
  if args.print_all:
    i=find_prefix_query(prefixes=args.args)
    return print_fsitems(i)

  if args.print_dups:
    i=find_md5_groups(prefixes=args.args)
    return process_del_md5_groups(
      i,
      keep_regexs=args.keep_re,
      verbose=args.verbose,
      test=args.not_really)

  if args.print_existing:
    rows=find_prefix_query(prefixes=args.args)
    # lazy, like the itertools.ifilter it replaces
    i=(item for item in rows if item.exists())
    return print_fsitems(i)

  if args.print_md5_groups:
    i=find_md5_groups(prefixes=args.args)
    return print_md5_groups(i)

  # --show-missing was declared but never handled; its help text describes
  # exactly this report.
  if args.print_missing or args.show_missing:
    rows=find_prefix_query(prefixes=args.args)
    i=(item for item in rows if not item.exists())
    return print_fsitems(i)

#
# Script entry point: run main() on the arguments after the program name and
# hand its return value to sys.exit() as the exit status.
if __name__ == "__main__":
  sys.exit(main(sys.argv[1:]))

# Local Variables:
# mode: python
# End:
