#!/usr/bin/env python
"""
Parses static HTML pages and adds them to
the database.
"""
import os,psycopg,re
# Captures the text that follows an opening <title> tag, up to the next "<".
# NOTE(review): the "<title>" prefix is reconstructed -- the pattern literal was
# split across two lines with nothing before the capture group; confirm it
# against the original script.
reotitle = re.compile(r'<title>([^<]*)', re.IGNORECASE)
# Captures each run of text found between a ">" and the following "<".
# "." does not match newlines, so a run that spans several lines is not captured.
reoother = re.compile(r'>(.*?)<')
# Module-wide database connection; it is opened as soon as this module is
# loaded and is shared by everything below.
_dsn = "dbname=myproject user=postgres"
conn = psycopg.connect(_dsn)
def parsehtmlfiles(filelist):
    """Insert one ``nwmail_staticpages`` row per HTML file named in *filelist*.

    *filelist* is the path of a text file holding one HTML file path per
    line.  For every listed file the row receives, in order: the page title
    (empty when the page has no match for ``reotitle``), the file's
    modification time as a timestamp, the text found between tags (each
    fragment followed by a space), the file path with ``index.html``
    removed, and the raw file contents.

    All rows are inserted through the module-level ``conn``.  On success the
    transaction is committed and ``conn`` is closed, so the function can only
    be run once per connection.  If anything fails while processing the
    listed files, the transaction is rolled back and an ``Exception`` with
    the message ``parsehtmlfiles Error: <cause>`` is raised.  A failure to
    open *filelist* itself propagates unchanged.
    """
    # Values are passed as bound parameters rather than interpolated into the
    # statement, so quotes or "$$" inside a page cannot break or alter the SQL.
    sql = """Insert into nwmail_staticpages
    VALUES( nextval('sample_searchs_id_seq'::regclass),%s,
    timestamp '1970-01-01' + %s * interval '1 second',
    %s,%s,%s);
    """
    with open(filelist, 'r') as listing:
        c = conn.cursor()
        try:
            for line in listing:
                # Strip only the line terminator; slicing off the last
                # character would damage a final line that has no newline.
                path = line.rstrip("\n")
                if not path:
                    continue
                with open(path, 'r') as htmlfile:
                    htmltext = htmlfile.read()
                titlematch = reotitle.search(htmltext)
                htmltitle = titlematch.group(1) if titlematch else ""
                # Every non-empty fragment is followed by a single space.
                texttext = "".join(
                    fragment + " "
                    for fragment in reoother.findall(htmltext)
                    if fragment != ""
                )
                c.execute(sql, (
                    htmltitle,
                    os.stat(path).st_mtime,
                    texttext,
                    path.replace("index.html", ""),
                    htmltext,
                ))
        except Exception as e:
            conn.rollback()
            raise Exception("parsehtmlfiles Error: %s" % e)
    conn.commit()
    conn.close()
def writehtmlfilelist(basedir, listpath):
    """Write the path of every ``*.html`` file under *basedir* to *listpath*.

    The tree is walked recursively and one path is written per line, each
    formed by joining the containing directory with the file name.  Only
    entries that ``os.walk`` reports as files are listed.  Any existing
    *listpath* is overwritten.
    """
    with open(listpath, 'w') as listing:
        for dirpath, _dirnames, filenames in os.walk(basedir):
            for name in filenames:
                if name.endswith(".html"):
                    listing.write(os.path.join(dirpath, name) + "\n")


if __name__ == "__main__":
    # Fetch the site recursively with wget, then load every HTML file found
    # below the current directory into the database.
    os.system("wget -r http://www.sample.com")
    basedir = "."
    listpath = "./tmpfilelist"
    writehtmlfilelist(basedir, listpath)
    try:
        parsehtmlfiles(listpath)
    finally:
        # Remove the temporary list even when the import fails.
        os.remove(listpath)