Monday, January 29, 2007

Python: HTML tables to Mediawiki converter

Some of the ideas and code for html2wiki are borrowed from the html2csv converter. It reads any file and converts the tables it finds, if any, to wiki format. A syntax-highlighted version of the code can also be viewed in the code snippets.

import re
import sys

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3: same class under a new module name
class html2wiki(HTMLParser.HTMLParser):
    """Convert the HTML tables in a document to MediaWiki table markup.

    Feed HTML text with ``feed()``; the converted markup accumulates in
    ``self.wiki``.  Only ``<table>``, ``<tr>``, ``<td>`` and ``<th>`` tags are
    interpreted, and text outside table cells is ignored.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.wiki = ''  # The Wiki text
        self.wikirow = ''  # The current Wiki row of table being constructed from HTML
        self.inTD = 0  # Used to track if we are inside or outside a cell (<TD>/<TH>).
        self.inTR = 0  # Used to track if we are inside or outside a <TR>...</TR> tag.
        self.re_multiplespaces = re.compile(r'\s+')  # used to collapse runs of whitespace
        self.rowCount = 0  # output row counter (rows actually emitted).
        self.rowspan = ''  # rowspan attribute of the current cell, '' if absent
        self.colspan = ''  # colspan attribute of the current cell, '' if absent
        self.linebreak = '<br>'  # separator appended after every emitted wiki line
        self.data = ''  # text collected for the current cell
        self.prop = ''  # attribute prefix of the most recently emitted cell
        self.cellmark = '| '  # wiki marker of the current cell: '| ' data, '! ' header

    def handle_starttag(self, tag, attrs):
        """Dispatch the table-related opening tags; ignore everything else."""
        if tag == 'table':
            self.start_table()
        elif tag == 'tr':
            self.start_tr()
        elif tag == 'td':
            self.start_td(attrs)
        elif tag == 'th':
            self.start_td(attrs, header=True)

    def handle_endtag(self, tag):
        """Dispatch the table-related closing tags; ignore everything else."""
        if tag == 'table':
            self.end_table()
        elif tag == 'tr':
            self.end_tr()
        elif tag in ('td', 'th'):
            self.end_td()

    def start_table(self):
        """Emit the wiki table opener followed by a first row separator."""
        self.wiki += '{| border=1' + self.linebreak
        self.wiki += '|-' + self.linebreak

    def end_table(self):
        """Flush any row still open, then emit the wiki table terminator."""
        if self.inTR:
            self.end_tr()  # </TABLE> implies </TR>
        self.wiki += '|}' + self.linebreak

    def start_tr(self):
        """Open a new row, closing the previous one if it was left open."""
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        """Close the current row and append it, if non-empty, to the output."""
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        if len(self.wikirow) > 0:
            self.wiki += self.wikirow
            self.wiki += '|-' + self.linebreak
            self.rowCount += 1
        self.wikirow = ''

    def start_td(self, attrs, header=False):
        """Open a new cell and remember its rowspan/colspan attributes.

        attrs is the (name, value) list supplied by HTMLParser; header selects
        the wiki header-cell marker ('!') instead of the data-cell marker ('|').
        """
        if self.inTD:
            self.end_td()  # a new cell implies the end of the previous one
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        self.data = ''
        self.prop = ''
        self.rowspan = ''
        self.colspan = ''
        if header:
            self.cellmark = '! '
        else:
            self.cellmark = '| '
        for key, value in attrs:
            if not value:
                # Valueless attributes arrive as None; there is nothing to emit.
                continue
            if key == 'rowspan':
                self.rowspan = value
            elif key == 'colspan':
                self.colspan = value
        self.inTD = 1

    def end_td(self):
        """Append the finished cell to the current wiki row."""
        if not self.inTD:
            return
        # Build the attribute prefix here rather than while receiving text, so
        # that empty cells keep their rowspan/colspan too.
        self.prop = ''
        if self.rowspan != '':
            self.prop = ' rowspan = ' + self.rowspan
        if self.colspan != '':
            self.prop += ' colspan = ' + self.colspan
        if self.prop:
            self.prop += ' | '
        text = self.data.replace('\t', ' ').replace(self.linebreak, '').replace('\r', '')
        text = self.re_multiplespaces.sub(' ', text)
        self.wikirow += self.cellmark + self.prop + text + self.linebreak
        self.data = ''
        self.inTD = 0

    def handle_data(self, data):
        """Collect text that appears inside a cell."""
        if self.inTD:
            self.data += data

    def handle_entityref(self, name):
        """Keep named entities (e.g. &amp;) inside cells instead of dropping them."""
        if self.inTD:
            self.data += '&' + name + ';'

    def handle_charref(self, name):
        """Keep numeric character references inside cells instead of dropping them."""
        if self.inTD:
            self.data += '&#' + name + ';'

if __name__ == '__main__':
    # Command-line entry point: convert the tables of the HTML file named by
    # the single argument and print the resulting wiki markup to stdout.
    if len(sys.argv) != 2:
        # sys.exit(str) writes the message to stderr and exits with status 1,
        # so a calling script can tell a usage error from converted output.
        sys.exit('Argument - filename required')
    # The with-block closes the file even if reading fails.
    with open(sys.argv[1], "r") as in_file:
        text = in_file.read()
    parser = html2wiki()
    parser.feed(text)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(parser.wiki)
Since I need a web interface for users, I didn't want to write another similar app in PHP, nor did I want to write a CGI script in Python. So I wrote a tiny PHP script that calls the Python script. I have used TinyMCE so that I can just copy and paste HTML tables directly into the edit box and do the conversion easily.

<?php
// Web front end for html2wiki.py: writes the submitted HTML to a temporary
// file, runs the Python converter on it and shows the wiki markup it prints.

// Always defined, so the form below renders cleanly on the first (GET) request.
$input  = '';
$output = '';

if (!empty($_POST['submit']) && isset($_POST['html']) && trim($_POST['html']) !== '') {
    $input = trim($_POST['html']);
    // Undo magic-quotes escaping only where that legacy feature is active;
    // stripping unconditionally would eat genuine backslashes elsewhere.
    if (function_exists('get_magic_quotes_gpc') && @get_magic_quotes_gpc()) {
        $input = stripslashes($input);
    }

    // tempnam() creates a uniquely named file, so two requests arriving in
    // the same second cannot overwrite each other's input.
    $filename = tempnam('uploads', 'h2w');
    if ($filename !== false) {
        if (file_put_contents($filename, $input) !== false) {
            $lines = array();
            // Quote the path so it always reaches the shell as one argument.
            exec('python html2wiki.py ' . escapeshellarg($filename), $lines, $retval);
            $output = implode("\n", $lines);
        }
        unlink($filename);
    }
}
?>
<script language="javascript" type="text/javascript" src="/lib/tinymce/jscripts/tiny_mce/tiny_mce.js"></script>
<script language="javascript" type="text/javascript">
tinyMCE.init({
theme:"simple",
mode : "textareas"
});
</script>

<form name='converter' method='post'>
<input type='submit' value = 'Convert Html2Wiki >>' name='submit'><br>
<table>
<?php /* Escape both values: they contain user-supplied markup that would otherwise be interpreted as part of this page. */ ?>
<tr><td><textarea name='html' cols='50' rows='40'><?php echo htmlspecialchars($input, ENT_QUOTES); ?></textarea></td>
<td><textarea name='wiki' cols='50' rows='40'><?php echo htmlspecialchars($output, ENT_QUOTES); ?></textarea></td>
</tr></table>
</form>


The output is something like shown below.
Image Hosted by ImageShack.us

1 comment:

Anonymous said...

When we talk about data conversion, India has been a great place to get those jobs done at a cheaper price