from BeautifulSoup import BeautifulSoup
import sys
import os
import fnmatch
import re
import logging
from xml.dom import minidom

# Send every message (DEBUG and above) to a log file that is overwritten
# on each run.
logging.basicConfig(
	filename='extractControlfields.log',
	filemode='w',
	level=logging.DEBUG,
)

# File name: "bd" followed by three word characters, captured as the tag name.
tagmatch = re.compile(r'^bd(?P<tagname>\w{3})')

# Heading text: three digits, " - ", the material type, then a parenthesised
# word.  The type group is greedy, so it keeps any blank before the "(".
mattypematch = re.compile(r'\d{3}\s-\s(?P<type>.*)\(\w+\)')

# Character position line: one or two digits with optional "-NN" extents,
# " - ", then the name of the position.
charposmatch = re.compile(r'^(?P<position>\d{1,2}(?P<extent>-\d{2})*)\s-\s(?P<name>.*)')

# Character value line: a run of non-blank characters, " - ", then the
# description of that value.
charvalmatch = re.compile(r'^(?P<value>\S+)\s-\s(?P<desc>.*)')

def processFile(filename):
	"""Convert one control-field HTML page into a ``tag`` element.

	The tag code is taken from the base name of *filename* using
	``tagmatch`` (``bd`` plus three characters; ``lea`` is stored as ``000``).
	A ``tag`` element is appended to the module-level ``root``.  When the
	text of the page's ``<h1>`` matches ``mattypematch`` it is stored as the
	``materialtype`` attribute.  Each ``div`` of class ``characterposition``
	whose ``<strong>`` text matches ``charposmatch`` becomes a ``position``
	child, and each of its ``div`` children of class ``charactervalue``
	whose ``<strong>`` text matches ``charvalmatch`` becomes a ``value``
	child of that position.

	Markup or file names that cannot be interpreted are logged and skipped
	rather than aborting the run.

	filename -- path of the HTML file to read.
	"""
	logging.info('Processing %s ' % filename)

	# Match on the base name so a path with a directory part still works.
	fmatch = tagmatch.match(os.path.basename(filename))
	if fmatch is None:
		logging.error('Skipping %s: file name does not look like bdXXX' % filename)
		return
	tagname = fmatch.group('tagname')
	if tagname == 'lea':
		tagname = '000'

	# Close the input file as soon as it has been read.
	with open(filename) as handle:
		html = handle.read()

	soup = BeautifulSoup(html)

	tag = xml.createElement('tag')
	root.appendChild(tag)
	tag.setAttribute('code', tagname)

	# A missing <h1>, or one without a plain string, is unexpected markup:
	# report it instead of crashing on None.
	heading = soup.find('h1')
	mattype = None
	if heading is not None:
		mattype = heading.string
	if mattype is None:
		logging.error('Unable to set materialtype attribute for this tag %s in %s: unexpected html' % (tagname, filename) )
	else:
		mtmatch = mattypematch.match(mattype)
		if mtmatch:
			tag.setAttribute('materialtype', mtmatch.group('type').rstrip())

	for charpos in soup.findAll('div', {'class':'characterposition'}):
		# Only the text extraction can fail on odd markup, so only it is guarded.
		try:
			text = charpos.findNext('strong', recursive=False).contents[0].rstrip()
		except (AttributeError, IndexError, TypeError):
			logging.error('Not able to process %s ' % (charpos) )
			continue

		logging.debug('Processing position %s ' % (text) )
		m = charposmatch.match(text)
		if not m:
			continue

		positionel = xml.createElement('position')
		positionel.setAttribute('position', m.group('position'))
		positionel.setAttribute('name', m.group('name'))
		tag.appendChild(positionel)

		# get description
		for desc in charpos.findAll('div', {'class':'description'}, recursive=False):
			if desc.string:
				positionel.setAttribute('description', desc.string.rstrip())

		# get charactervalues
		for charval in charpos.findAll('div', {'class':'charactervalue'}, recursive=False):
			logging.debug('Attempting to process %s' % charval)
			try:
				text = charval.findNext('strong', recursive=False).contents[0].rstrip()
			except (AttributeError, IndexError, TypeError):
				logging.error('Not able to process %s in %s unexpected html' % (charval, filename))
				continue

			cvmatch = charvalmatch.match( text )
			if not cvmatch:
				logging.error('Not able to process %s in %s: no match' % (text, filename) )
				continue

			logging.debug('Processing %s ' % text )
			valueel = xml.createElement('value')
			valueel.setAttribute('code', cvmatch.group('value'))
			valueel.setAttribute('description', cvmatch.group('desc'))
			positionel.appendChild( valueel )

# Output document shared with processFile, which appends one element per
# input file to `root`.
xml = minidom.Document()
root = xml.createElement('marc21spec')
xml.appendChild(root)

if len(sys.argv) < 3:
	sys.exit('usage: %s <html file or directory> <output xml file>' % sys.argv[0])

inpath = sys.argv[1]
outfile = sys.argv[2]

# Decide between directory and single-file mode by looking at the file
# system; a name test would misclassify e.g. a directory called "html_pages".
if os.path.isdir(inpath):
	workdir = inpath
	# Sorted so the order of elements in the output is reproducible.
	names = sorted(name for name in os.listdir(inpath) if fnmatch.fnmatch(name, '*.html'))
else:
	workdir = os.path.dirname(inpath) or os.curdir
	names = [os.path.basename(inpath)]

# processFile derives the tag code from the file name and opens the name as
# given; running from inside the input directory with bare names satisfies
# both.  The working directory is restored afterwards so that a relative
# output path is resolved against the directory the script was started in.
startdir = os.getcwd()
os.chdir(workdir)
try:
	for filename in names:
		processFile(filename)
finally:
	os.chdir(startdir)

# open() instead of the Python-2-only file(), and closed deterministically.
with open(outfile, 'w') as out:
	out.write( xml.toprettyxml(indent= "   ") )

