#!/usr/bin/env python
"""
Parses static pages and adds to database
"""
import os
import re
import subprocess

import psycopg

# Captures the text between <title> and </title>; case-insensitive so that
# upper- or mixed-case tags are matched as well.
reotitle = re.compile(r'<title>([^<]*)</title>', re.IGNORECASE)
# Captures whatever sits between a closing '>' and the next '<' on one line,
# i.e. the text between adjacent tags.
reoother = re.compile(r'>(.*?)<')

conn = psycopg.connect("dbname=myproject user=postgres")

# Values are bound by the driver instead of being spliced into the statement,
# so quotes or '$$' inside a page cannot break or alter the SQL.  The mtime is
# turned into a timestamp by multiplying a one-second interval, because a
# placeholder cannot appear inside a quoted interval literal.
_INSERT_SQL = """
    INSERT INTO nwmail_staticpages VALUES (
        nextval('sample_searchs_id_seq'::regclass),
        %s,
        timestamp '1970-01-01' + %s * interval '1 second',
        %s,
        %s,
        %s
    )
"""


def _extract_title(htmltext):
    """Return the contents of the first <title> element, or "" if none."""
    match = reotitle.search(htmltext)
    if match is None:
        return ""
    return match.group(1)


def _extract_text(htmltext):
    """Return the non-empty between-tag fragments, each followed by a space."""
    return "".join(
        fragment + " " for fragment in reoother.findall(htmltext) if fragment
    )


def _build_row(path):
    """Read the HTML file at *path* and return the parameters for one INSERT.

    The tuple is (title, mtime in seconds since the epoch, extracted text,
    path with every "index.html" removed, raw HTML).
    """
    with open(path, 'r') as htmlfile:
        htmltext = htmlfile.read()
    return (
        _extract_title(htmltext),
        os.stat(path).st_mtime,
        _extract_text(htmltext),
        path.replace("index.html", ""),
        htmltext,
    )


def parsehtmlfiles(filelist):
    """Insert one nwmail_staticpages row per HTML file named in *filelist*.

    *filelist* is the path of a text file holding one HTML file path per
    line; blank lines are skipped.

    All rows are inserted in a single transaction on the module-level
    connection.  On success the transaction is committed and the connection
    is closed, so the function can only be used once per process.

    Raises:
        Exception: if reading a page or executing an insert fails.  The
            transaction is rolled back first and the original error text is
            included in the message.  A failure to open *filelist* itself
            propagates unchanged.
    """
    with open(filelist, 'r') as listing:
        cursor = conn.cursor()
        try:
            for line in listing:
                # Strip only the line terminator: a final line without one
                # must not lose its last character.
                path = line.rstrip("\n")
                if not path:
                    continue
                cursor.execute(_INSERT_SQL, _build_row(path))
        except Exception as e:
            conn.rollback()
            raise Exception("parsehtmlfiles Error: %s" % e)
    conn.commit()
    conn.close()


if __name__ == "__main__":
    # Mirror the site into the current directory.  The exit status is not
    # checked, so a non-zero result from wget does not stop the import.
    subprocess.call(["wget", "-r", "http://www.sample.com"])

    basedir = "."
    tmpfilelist = "./tmpfilelist"
    # Argument lists bypass the shell, so the glob needs no escaping and
    # basedir needs no quoting.
    with open(tmpfilelist, "w") as listing:
        subprocess.call(
            ["find", basedir, "-name", "*.html", "-print"], stdout=listing
        )
    try:
        parsehtmlfiles(tmpfilelist)
    finally:
        # Remove the temporary list even when the import fails.
        os.remove(tmpfilelist)