from BeautifulSoup import BeautifulSoup
import sys
import os
import fnmatch
import re
import logging
from xml.dom import minidom

# Emit INFO-level progress messages through the root logger.
logging.basicConfig(level=logging.INFO)

# Input file names begin with "bd" followed by a three-character code,
# which is captured as ``tagname``.
tagmatch = re.compile(r'^bd(?P<tagname>\w{3})')

# Indicator headings: "First - <text>" and "Second - <text>".
ind1match = re.compile(r'^First\s-\s(?P<firstind>.*)')
ind2match = re.compile(r'^Second\s-\s(?P<secind>.*)')

# Indicator option: "<value> - <text>", where the value is a single word
# character optionally followed by "-<digit>" groups (for example "0-9").
indoptionmatch = re.compile(
	r'^(?P<value>\w(?P<value2>-\d)*)\s-\s(?P<text>.*)'
)

# Subfield line: "$<code> - <text>".
sfstring = re.compile(r'^\$(?P<code>\w)\s-\s(?P<text>.*)')

def processFile(filename):
	"""Parse one HTML page and append its <tag> element to the output tree.

	The three-character tag code is read from the start of the file's base
	name (see ``tagmatch``).  A ``tag`` element carrying that code is added
	to the module-level ``root``, with two ``indicator`` children (numbers
	1 and 2), one ``option`` child per recognised indicator value, and one
	``subfield`` child per recognised subfield line.

	filename -- path of the HTML file to read.  Only the base name is used
	            for the tag code, so a directory prefix is allowed.

	Files whose base name does not match ``tagmatch`` are skipped with a
	warning.  Errors from opening or reading the file propagate.
	"""
	logging.info('Processing %s' % filename)
	# Match on the base name so that paths with a directory part still work.
	fmatch = tagmatch.match(os.path.basename(filename))
	if fmatch is None:
		logging.warning('Skipping %s: name does not start with bd<tag>' % filename)
		return
	tagname = fmatch.group('tagname')

	# Close the input explicitly instead of relying on garbage collection.
	source = open(filename)
	try:
		html = source.read()
	finally:
		source.close()

	tag = xml.createElement('tag')
	tag.setAttribute('code', tagname)
	root.appendChild(tag)

	ind1 = xml.createElement('indicator')
	ind1.setAttribute('number', '1')
	ind2 = xml.createElement('indicator')
	ind2.setAttribute('number', '2')
	tag.appendChild(ind1)
	tag.appendChild(ind2)

	soup = BeautifulSoup( html )

	# Every paragraph with a non-empty string overwrites the attribute, so
	# the last such paragraph provides the tag description.
	for p in soup.findAll('p'):
		try:
			if hasattr(p, 'string') and len(p.string) > 0:
				tag.setAttribute('description', p.string)
		except TypeError:
			# len() rejects a p.string that is not a string (e.g. None).
			logging.info('Not adding desc %s' % (p) )

	# Options are attached to whichever indicator heading was seen last;
	# options seen before any heading go to indicator 1.
	currentInd = 1
	for ind in soup.findAll('div', {'class':'indicatorvalue'}):
		try:
			label = ind.contents[0]
			m1 = ind1match.match(label)
		except (TypeError, IndexError):
			# No children, or the first child is not text.  Skip the element
			# so that no match from an earlier iteration is reused.
			logging.info('Not able to process %s ' % ind)
			continue
		# label is known to be text here, so these cannot raise TypeError.
		m2 = ind2match.match(label)
		m = indoptionmatch.match(label)

		if m1:
			ind1.setAttribute('description', m1.group('firstind'))
			currentInd = 1
		if m2:
			ind2.setAttribute('description', m2.group('secind'))
			currentInd = 2
		if m:
			option = xml.createElement('option')
			option.setAttribute('code', m.group('value'))
			option.setAttribute('name', m.group('text'))
			for desc in ind.findAll('div', {'class':'description'}, recursive=False):
				if desc.string:
					option.setAttribute('description', desc.string.rstrip())
			if currentInd == 1:
				ind1.appendChild(option)
			elif currentInd == 2:
				ind2.appendChild(option)

	for sf in soup.findAll('div', {'class':'subfieldvalue'}):
		try:
			m = sfstring.match( sf.contents[0] )
		except (TypeError, IndexError):
			# Same guard as above: never fall through with a stale match.
			logging.info('Not processing subfield %s' % sf )
			continue
		if not m:
			continue
		sfel = xml.createElement('subfield')
		sfel.setAttribute('code', m.group('code') )
		sfel.setAttribute('name', m.group('text') )
		for desc in sf.findAll('div', {'class':'description'}, recursive=False):
			try:
				sfel.setAttribute('description', desc.contents[0].rstrip())
			except (TypeError, IndexError):
				logging.info('Not adding description to %s ' % sf)
		tag.appendChild(sfel)

# Shared output document.  processFile() appends one <tag> element per input
# file to ``root``.
xml = minidom.Document()
root = xml.createElement('marc21spec')
xml.appendChild(root)


# Command line: <input .html file or directory> <output xml file>
if __name__ == '__main__':
	if len(sys.argv) < 3:
		sys.exit('usage: %s <file.html | directory> <output.xml>' % sys.argv[0])

	inpath = sys.argv[1]
	outfile = sys.argv[2]
	# Resolve the output location now, because the working directory may
	# change below.
	outpath = os.path.abspath(outfile)

	# Decide by what the path is, not by whether "html" occurs in its name.
	if os.path.isdir(inpath):
		indir = inpath
		# Sorted so the element order in the output is reproducible.
		names = sorted(
			name for name in os.listdir(inpath)
			if fnmatch.fnmatch(name, '*.html')
		)
	else:
		indir, single = os.path.split(inpath)
		names = [single]

	# processFile() is handed bare file names from inside the input
	# directory, so the tag-code pattern sees the name without a path prefix.
	if indir:
		os.chdir(indir)
	for filename in names:
		processFile(filename)

	logging.info('Writing xml to %s' % (outfile) )
	# Serialise to UTF-8 bytes so non-ASCII descriptions can be written, and
	# close the file deterministically.
	out = open(outpath, 'wb')
	try:
		out.write( xml.toprettyxml(indent="   ", encoding='utf-8') )
	finally:
		out.close()
