import HTMLParser, re, sys
class html2wiki(HTMLParser.HTMLParser):
    """Convert HTML tables into wiki table markup.

    Feed HTML text through ``feed()``; the converted markup accumulates in
    ``self.wiki``. Only ``<table>``, ``<tr>`` and ``<td>`` tags are
    interpreted. Every other tag is ignored, and text that is not inside a
    ``<td>`` is dropped. Each emitted line is terminated with
    ``self.linebreak`` (``'<br>'`` by default).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.wiki = ''      # The wiki text produced so far
        self.wikirow = ''   # The wiki row currently being built from HTML
        self.inTD = 0       # 1 while inside <TD>...</TD>, else 0
        self.inTR = 0       # 1 while inside <TR>...</TR>, else 0
        # Collapses any run of whitespace into a single space.
        self.re_multiplespaces = re.compile(r'\s+')
        self.rowCount = 0   # Number of rows closed so far
        self.rowspan = ''   # rowspan attribute of the current cell ('' if none)
        self.colspan = ''   # colspan attribute of the current cell ('' if none)
        self.linebreak = '<br>'  # Terminator appended to every output line
        self.data = ''      # Text collected for the current cell
        self.prop = ''      # Rendered attribute prefix of the last closed cell

    def handle_starttag(self, tag, attrs):
        """Dispatch the opening tags this converter understands."""
        if tag == 'table':
            self.start_table()
        elif tag == 'tr':
            self.start_tr()
        elif tag == 'td':
            self.start_td(attrs)

    def handle_endtag(self, tag):
        """Dispatch the closing tags this converter understands."""
        if tag == 'table':
            self.end_table()
        elif tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_table(self):
        """Emit the table header followed by the first row separator."""
        self.wiki += '{| border=1' + self.linebreak
        self.wiki += '|-' + self.linebreak

    def end_table(self):
        """Emit the table footer, flushing a row left open by a missing </TR>."""
        if self.inTR:
            self.end_tr()  # </TABLE> implies </TR>
        self.wiki += '|}' + self.linebreak

    def start_tr(self):
        """Open a new row, closing the previous one if it is still open."""
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        """Close the current row and append its cells to the output."""
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        if self.wikirow:
            self.wiki += self.wikirow
            self.wiki += '|-' + self.linebreak
            self.wikirow = ''
        self.rowCount += 1

    def start_td(self, attrs):
        """Open a new cell and remember its rowspan/colspan attributes.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser; ``value`` is ``None`` for an attribute written without a
        value, which is treated here as "not set".
        """
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        self.data = ''
        self.prop = ''
        self.rowspan = ''
        self.colspan = ''
        for key, value in attrs:
            if key == 'rowspan':
                self.rowspan = value or ''
            elif key == 'colspan':
                self.colspan = value or ''
        self.inTD = 1

    def _cell_prefix(self):
        """Return the ' rowspan = N colspan = M | ' prefix, or '' if no spans."""
        prefix = ''
        if self.rowspan != '':
            prefix = ' rowspan = ' + self.rowspan
        if self.colspan != '':
            prefix += ' colspan = ' + self.colspan
        if prefix:
            prefix += ' | '
        return prefix

    def end_td(self):
        """Close the current cell and append it to the row being built."""
        if self.inTD:
            # Build the prefix here rather than in handle_data so that cells
            # without any text still keep their rowspan/colspan.
            self.prop = self._cell_prefix()
            # NOTE(review): doubling '"' looks like CSV-style escaping and may
            # not be wanted in wiki output; kept to preserve existing output.
            text = (self.data.replace('\t', ' ')
                    .replace(self.linebreak, '')
                    .replace('\r', '')
                    .replace('"', '""'))
            text = self.re_multiplespaces.sub(' ', text)
            self.wikirow += '| ' + self.prop + text + self.linebreak
            self.data = ''
            self.inTD = 0

    def handle_data(self, data):
        """Collect non-blank text that appears inside a cell."""
        if self.inTD and data.strip() != '':
            self.data += data
if __name__ == '__main__':
    # Command-line entry point: convert the HTML file named by the single
    # argument and print the resulting wiki text to standard output.
    parser = html2wiki()
    if len(sys.argv) == 2:
        # The with-block closes the file even if reading or parsing fails.
        with open(sys.argv[1], "r") as in_file:
            parser.feed(in_file.read())
        # A parenthesised single argument prints identically on Python 2.
        print(parser.wiki)
    else:
        print('Argument - filename required')
<?php
// Handle the converter form: write the posted HTML to a temporary file, run
// html2wiki.py on it and collect the script's output in $output.
// Both variables are initialised so the page can echo them on a plain GET.
$input = '';
$output = '';

if (!empty($_POST['submit']) && isset($_POST['html']) && is_string($_POST['html'])
        && trim($_POST['html']) !== '') {
    // NOTE(review): stripslashes() undoes magic_quotes_gpc escaping; on a PHP
    // setup without magic quotes it also strips genuine backslashes - confirm
    // against the target configuration.
    $input = stripslashes(trim($_POST['html']));

    // 24-hour clock plus uniqid() so that requests in the same second (or
    // twelve hours apart) never share a temporary file.
    $filename = 'uploads/' . date('YmdHis') . '-' . uniqid() . '.txt';
    $fp = fopen($filename, 'w');
    if ($fp !== false) {
        fwrite($fp, $input);
        fclose($fp);

        $lines = array();
        // escapeshellarg() keeps the file name a single, literal argument.
        exec('python html2wiki.py ' . escapeshellarg($filename), $lines, $retval);
        $output = implode("\n", $lines);
        unlink($filename);
    }
}
?>
<!-- Load the TinyMCE script, then initialise it with the "simple" theme in "textareas" mode. -->
<script language="javascript" type="text/javascript" src="/lib/tinymce/jscripts/tiny_mce/tiny_mce.js"></script>
<script language="javascript" type="text/javascript">
// NOTE(review): mode "textareas" presumably attaches the editor to every
// <textarea> on the page - confirm against the TinyMCE documentation.
tinyMCE.init({
 theme:"simple",
 mode : "textareas"
});
</script>
<!-- Converter form: the left textarea posts the HTML to convert, the right one shows the result. -->
<!-- Both values are HTML-escaped so that submitted markup (e.g. a closing textarea tag) cannot break out of the textarea. -->
<form name='converter' method='post'>
<input type='submit' value = 'Convert Html2Wiki >>' name='submit'><br>
<table>
<tr><td><textarea name='html' cols='50' rows='40'><?php echo htmlspecialchars(isset($input) ? $input : '', ENT_QUOTES); ?></textarea></td>
<td><textarea name='wiki' cols='50' rows='40'><?php echo htmlspecialchars(isset($output) && is_string($output) ? $output : '', ENT_QUOTES); ?></textarea></td>
</tr></table>
</form>
The output looks something like what is shown below.
 
 
1 comment:
When we talk about data conversion, India has been a great place to get those jobs done at a cheaper price.
Post a Comment