import HTMLParser, re, sys
class html2wiki(HTMLParser.HTMLParser):
    """Convert HTML tables into MediaWiki table markup.

    Feed HTML through ``feed()``; the generated markup accumulates in
    ``self.wiki``.  Only ``<table>``, ``<tr>`` and ``<td>`` tags are
    interpreted, and text outside a cell is ignored.  Every emitted line is
    terminated with ``self.linebreak`` (the literal ``<br>``), not a newline.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.wiki = ''      # the wiki text produced so far
        self.wikirow = ''   # cells of the table row currently being built
        self.inTD = 0       # non-zero while inside <td>...</td>
        self.inTR = 0       # non-zero while inside <tr>...</tr>
        # Collapses any run of whitespace in a cell into a single space.
        self.re_multiplespaces = re.compile(r'\s+')
        self.rowCount = 0   # number of rows closed so far
        self.rowspan = ''   # rowspan attribute of the current cell, if any
        self.colspan = ''   # colspan attribute of the current cell, if any
        self.linebreak = '<br>'
        self.data = ''      # raw text collected for the current cell
        self.prop = ''      # wiki attribute prefix of the last closed cell

    def handle_starttag(self, tag, attrs):
        """Dispatch the opening tags that matter for tables."""
        if tag == 'table':
            self.start_table()
        elif tag == 'tr':
            self.start_tr()
        elif tag == 'td':
            self.start_td(attrs)

    def handle_endtag(self, tag):
        """Dispatch the closing tags that matter for tables."""
        if tag == 'table':
            self.end_table()
        elif tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_table(self):
        """Emit the table header followed by the first row separator."""
        self.wiki += '{| border=1' + self.linebreak
        self.wiki += '|-' + self.linebreak

    def end_table(self):
        """Flush a still-open row, then emit the table terminator."""
        if self.inTR:
            self.end_tr()  # </TABLE> implies </TR>
        self.wiki += '|}' + self.linebreak

    def start_tr(self):
        """Open a new row, closing the previous one if it is still open."""
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        """Close the current row and append its cells to the output."""
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        if self.wikirow:
            self.wiki += self.wikirow
            self.wiki += '|-' + self.linebreak
            self.wikirow = ''
        self.rowCount += 1

    def start_td(self, attrs):
        """Open a new cell and remember its rowspan/colspan attributes."""
        if self.inTD:
            self.end_td()  # <TD> implies </TD>; otherwise the open cell is lost
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        self.data = ''
        self.prop = ''
        self.rowspan = ''
        self.colspan = ''
        for key, value in attrs:
            # A valueless attribute is reported as None; treat it as absent.
            if key == 'rowspan':
                self.rowspan = value or ''
            elif key == 'colspan':
                self.colspan = value or ''
        self.inTD = 1

    def end_td(self):
        """Close the current cell and append it to the row being built."""
        if not self.inTD:
            return
        # Build the span prefix here rather than while reading text, so that
        # a spanned cell without any text still keeps its attributes.
        self.prop = self._cell_properties()
        text = self.data.replace(self.linebreak, '').replace('\r', '')
        text = self.re_multiplespaces.sub(' ', text).strip()
        self.wikirow += '| ' + self.prop + text + self.linebreak
        self.data = ''
        self.inTD = 0

    def _cell_properties(self):
        """Return the wiki attribute prefix for the current cell ('' if none)."""
        prop = ''
        if self.rowspan:
            prop = ' rowspan = ' + self.rowspan
        if self.colspan:
            prop += ' colspan = ' + self.colspan
        if prop:
            prop += ' | '
        return prop

    def handle_data(self, data):
        """Collect text that appears inside a cell."""
        # Whitespace-only chunks are kept as well: they separate words that
        # sit in adjacent inline tags.  end_td() collapses and trims them.
        if self.inTD:
            self.data += data

    def handle_entityref(self, name):
        """Keep a named reference such as ``&amp;`` found inside a cell."""
        # The base-class handler does nothing, which would drop the reference.
        if self.inTD:
            self.data += '&' + name + ';'

    def handle_charref(self, name):
        """Keep a numeric reference such as ``&#160;`` found inside a cell."""
        if self.inTD:
            self.data += '&#' + name + ';'
if __name__ == '__main__':
    # Command-line entry point: convert the HTML file named by the single
    # argument and print the resulting wiki markup to standard output.
    if len(sys.argv) == 2:
        parser = html2wiki()
        # The with-block closes the file even if reading or parsing raises.
        with open(sys.argv[1], "r") as in_file:
            parser.feed(in_file.read())
        # A single parenthesised argument prints identically on Python 2 and 3.
        print(parser.wiki)
    else:
        print('Argument - filename required')
Since I needed a web interface for users, and I wanted neither to rewrite the converter in PHP nor to write a CGI script in Python, I wrote a tiny PHP script that calls the Python script. It uses TinyMCE, so I can simply copy and paste HTML tables into the edit box and convert them easily.<?
// Handle a submitted form: write the pasted HTML to a temporary file, run
// html2wiki.py on it, and keep the script's output for display in the form.

// Defaults, so the page can echo both values even when nothing was posted.
$input = '';
$output = '';
if (!empty($_POST['submit']) && isset($_POST['html'])) {
    // NOTE(review): stripslashes() presumably compensates for magic quotes;
    // confirm the server still needs it, since it also removes real backslashes.
    $input = stripslashes(trim($_POST['html']));
    if ($input !== '') {
        // tempnam() creates a unique file, so two requests arriving within
        // the same second can no longer overwrite each other's input.
        $filename = tempnam('uploads', 'h2w');
        $fp = ($filename !== false) ? fopen($filename, 'w') : false;
        if ($fp !== false) {
            fwrite($fp, $input);
            fclose($fp);
            // Collect the output in its own array: exec() appends to whatever
            // array it is given by reference.
            $lines = array();
            $ret = exec('python html2wiki.py ' . escapeshellarg($filename), $lines, $retval);
            $output = implode("\n", $lines);
        }
        if ($filename !== false) {
            unlink($filename);
        }
    }
}
?>
<!-- TinyMCE is initialised in "textareas" mode, so both textareas below become rich-text editors and an HTML table can be pasted straight in. -->
<script language="javascript" type="text/javascript" src="/lib/tinymce/jscripts/tiny_mce/tiny_mce.js"></script>
<script language="javascript" type="text/javascript">
tinyMCE.init({
theme:"simple",
mode : "textareas"
});
</script>
<!-- Left textarea: the HTML to convert. Right textarea: the wiki markup produced by the converter. -->
<!-- Both values are HTML-escaped so that a "</textarea>" or an entity in the pasted markup cannot break out of, or be altered inside, the textarea. -->
<form name='converter' method='post'>
<input type='submit' value = 'Convert Html2Wiki >>' name='submit'><br>
<table>
<tr><td><textarea name='html' cols='50' rows='40'><?=htmlspecialchars(isset($input) ? $input : '')?></textarea></td>
<td><textarea name='wiki' cols='50' rows='40'><?=htmlspecialchars(isset($output) ? $output : '')?></textarea></td>
</tr></table>
</form>
The output looks something like what is shown below.