Spaces:
Sleeping
Sleeping
Create norm_html.py
Browse files- norm_html.py +99 -0
norm_html.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import html
|
| 3 |
+
import uuid
|
| 4 |
+
import subprocess
|
| 5 |
+
import unicodedata
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def normalized_html_table(text: str) -> str:
    r"""Return the table content of ``text`` as a canonical, attribute-stripped HTML string.

    Processing order:

    1. ``<p><span data-latex="..." data-type="inline-math|display-math"></span></p>``
       placeholders are rewritten as ``\(...\)`` / ``\[...\]``.
    2. If ``text`` (ignoring spaces) contains ``<table``, the markup is
       simplified with BeautifulSoup (``th`` -> ``td``, ``thead`` and ``span``
       unwrapped, ``math`` replaced by ``$alttext$``), entities are unescaped,
       the text is NFKC-normalized, presentation attributes are removed, and
       the inner table content is re-wrapped in
       ``<html><body><table>...</table></body></html>``.
    3. Bare ``<sup>``, ``<sub>``, ``<span>``, ``<div>``, ``<p>`` tags and
       ``<colgroup>`` sections are removed, and whitespace directly after a
       ``>`` or directly before ``</td>`` is dropped.

    Args:
        text: A string that may contain an HTML table.

    Returns:
        The normalized table HTML, or ``""`` when ``text`` contains no
        ``<table`` marker.
    """
    # Attributes that only affect presentation. They are removed one at a time,
    # in this order, each pattern requiring a leading space and double quotes.
    strip_attr_names = (
        'style', 'height', 'width', 'colwidth', 'colheight',
        'rowwidth', 'rowheight', 'align', 'class',
    )

    def _simplify_table_markup(html_content):
        """Flatten header, math and span markup so only plain table cells remain."""
        soup = BeautifulSoup(html_content, 'html.parser')
        for th in soup.find_all('th'):
            th.name = 'td'
        for thead in soup.find_all('thead'):
            thead.unwrap()  # unwrap() removes the tag but keeps its children
        for math_tag in soup.find_all('math'):
            alttext = math_tag.get('alttext', '')
            # Check the raw attribute *before* wrapping it in dollar signs;
            # checking afterwards is always true and would turn a <math> tag
            # without alttext into a bare "$$".
            if alttext:
                math_tag.replace_with(f'${alttext}$')
        for span in soup.find_all('span'):
            span.unwrap()
        return str(soup)

    def process_table_html(md_i):
        """Return ``(normalized_table, normalized_table_without_spaces)`` for ``md_i``.

        Both elements are empty strings when ``md_i`` has no ``<table`` marker.
        """
        table_res = ''
        table_res_no_space = ''
        # Spaces are ignored so that e.g. "< table" is still detected.
        if '<table' in md_i.replace(' ', ''):
            md_i = _simplify_table_markup(md_i)
            table_res = html.unescape(md_i).replace('\n', '')
            table_res = unicodedata.normalize('NFKC', table_res).strip()
            # Greedy on purpose: captures from the first <table ...> up to the
            # last </table>, so nested tables stay inside the capture.
            pattern = r'<table\b[^>]*>(.*)</table>'
            tables = re.findall(pattern, table_res, re.DOTALL | re.IGNORECASE)
            table_res = ''.join(tables)
            for attr_name in strip_attr_names:
                table_res = re.sub(r'( ' + attr_name + r'=".*?")', '', table_res)
            # A span of 1 is the default, so it carries no information.
            table_res = re.sub(r'( rowspan="1")', '', table_res)
            table_res = re.sub(r'( colspan="1")', '', table_res)
            table_res = re.sub(r'</?tbody>', '', table_res)

            table_res = re.sub(r'\s+', ' ', table_res)
            table_res_no_space = (
                '<html><body><table>' + table_res.replace(' ', '') + '</table></body></html>'
            )
            # Removing every space also glued attributes to their tag names;
            # put back the separator in front of the attributes that are kept.
            table_res_no_space = re.sub(r'colspan="', ' colspan="', table_res_no_space)
            table_res_no_space = re.sub(r'rowspan="', ' rowspan="', table_res_no_space)
            table_res_no_space = re.sub(r'border="', ' border="', table_res_no_space)

            table_res = '<html><body><table>' + table_res + '</table></body></html>'

        return table_res, table_res_no_space

    def clean_table(input_str, flag=True):
        """Remove inline/wrapper tags and ``<colgroup>`` sections when ``flag`` is true."""
        if flag:
            for tag in ('sup', 'sub', 'span', 'div', 'p'):
                input_str = input_str.replace(f'<{tag}>', '').replace(f'</{tag}>', '')
            # Form a span opening tag takes once all spaces have been removed.
            input_str = input_str.replace('<spandata-span-identity="">', '')
            input_str = re.sub(r'<colgroup>.*?</colgroup>', '', input_str)
        return input_str

    def process_formula(input_str):
        r"""Rewrite data-latex span placeholders as ``\(...\)`` / ``\[...\]``."""
        # Inline math formulas.
        inline_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="inline-math"></span></p>'
        input_str = re.sub(inline_pattern, r'\(\1\)', input_str)

        # Block-level (display) math formulas, if any.
        block_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="display-math"></span></p>'
        input_str = re.sub(block_pattern, r'\[\1\]', input_str)
        return input_str

    def process_uline(input_str):
        """Normalize underline tags.

        NOTE(review): both replacements currently map a string to itself, so
        this is an identity function. Possibly the search strings were meant
        to be an escaped form of ``<u>`` / ``</u>`` -- confirm against the
        upstream source before changing them.
        """
        return input_str.replace('<u>', '<u>').replace('</u>', '</u>')

    text = process_formula(text)
    text = process_uline(text)
    norm_text, _ = process_table_html(text)
    norm_text = clean_table(norm_text)
    return norm_text.replace('> ', '>').replace(" </td>", "</td>")
|