import re
import copy
import Polygon
import numpy as np
from bs4 import BeautifulSoup as bs
from .time_counter import format_table
def check_continuous(seq):
    """Assert that ``seq`` increases by exactly one from each item to the next.

    An empty or single-element sequence passes trivially.

    Raises:
        AssertionError: if any element is not its predecessor plus one.
    """
    # Walk the sequence as (previous, current) pairs.
    for previous, current in zip(seq, seq[1:]):
        assert previous + 1 == current
def table_to_latex(table):
    """Return one label string per cell of ``table``, derived from its transcript.

    Args:
        table: Mapping with a ``'layout'`` entry exposing ``.max()``
            (presumably a numpy array -- confirm against callers) and a
            ``'cells'`` list whose items carry a ``'transcript'`` sequence of
            strings.

    Returns:
        A list with one string per entry of ``table['cells']``.

    Raises:
        AssertionError: if the largest id in the layout plus one differs from
            the number of cells.
    """
    def cal_cls_id(transcript):
        # Collapse the transcript tokens into a single string before testing it.
        transcript = ''.join(transcript)
        # NOTE(review): as written, every branch returns the same empty string
        # and the two ``elif`` tests are identical, so the second one can never
        # be taken. This looks like markup-like string literals were lost from
        # this copy of the file -- recover the intended labels from the
        # upstream source before relying on this function.
        if transcript == '':
            return ''
        elif transcript == ' ':
            return ''
        elif transcript == ' ':
            return ''
        else:
            return ''
    # The layout's largest cell id must be the last index of the cell list.
    assert table['layout'].max() + 1 == len(table['cells'])
    latex = [cal_cls_id(cell['transcript']) for cell in table['cells']]
    return latex
def html_to_table(html):
    """Convert a tokenised HTML table annotation into a grid-based table dict.

    NOTE(review): in the reviewed copy the HTML tag string literals had been
    stripped, together with the branches between the opening-cell and
    closing-cell tokens, leaving the function unparseable. The literals and
    the cell/colspan/rowspan branches below are reconstructed from the helpers
    the damaged code left unused (``get_cell_val``, ``parse_span_val``) and
    from ``line_idx`` never being advanced -- confirm against upstream.

    Args:
        html: Mapping with ``html['html']['structure']['tokens']`` (structure
            tokens such as ``'<thead>'``, ``'<tr>'``, ``'<td>'``, ``'<td'``,
            ``' colspan="2"'``, ``'>'``, ``'</td>'``) and
            ``html['html']['cells']``, one dict per cell holding ``'tokens'``
            and optionally ``'bbox'`` as ``[x1, y1, x2, y2]``.

    Returns:
        dict with keys ``layout`` (2-D numpy array whose entries are the index
        of the cell covering that grid slot), ``cells`` (list of dicts with
        ``transcript`` and, when a bbox was given, ``bbox`` and
        ``segmentation``), ``head_rows`` and ``body_rows`` (row index lists).

    Raises:
        AssertionError: if head/body row indices are not contiguous, if some
            row belongs to neither section, or if a grid slot is left empty.
    """
    tokens = html['html']['structure']['tokens']
    layout = [[]]

    def extend_table(x, y):
        # Grow the grid with -1 (empty) slots until column x / row y exist.
        assert (x >= 0) and (y >= 0)
        if x >= len(layout[0]):
            for row in layout:
                row.extend([-1] * (x - len(row) + 1))
        if y >= len(layout):
            for _ in range(y - len(layout) + 1):
                layout.append([-1] * len(layout[0]))

    def set_cell_val(x, y, val):
        assert (x >= 0) and (y >= 0)
        extend_table(x, y)
        layout[y][x] = val

    def get_cell_val(x, y):
        assert (x >= 0) and (y >= 0)
        extend_table(x, y)
        return layout[y][x]

    def parse_span_val(token):
        # The span count is the text between the first and last double quote.
        return int(token[token.index('"') + 1:token.rindex('"')])

    def maskout_left_rows():
        # Drop rows beyond the current one (e.g. created by a rowspan that
        # reaches past the end of a section); always keep at least one row.
        nonlocal layout
        layout = layout[:max(row_idx + 1, 1)]

    row_idx = -1
    col_idx = -1
    line_idx = -1
    inside_head = False
    inside_body = False
    head_rows = []
    body_rows = []
    col_span = 1
    row_span = 1
    for token in tokens:
        if token == '<thead>':
            inside_head = True
            maskout_left_rows()
        elif token == '</thead>':
            inside_head = False
            maskout_left_rows()
        elif token == '<tbody>':
            inside_body = True
            maskout_left_rows()
        elif token == '</tbody>':
            inside_body = False
            maskout_left_rows()
        elif token == '<tr>':
            row_idx += 1
            col_idx = -1
            if inside_head:
                head_rows.append(row_idx)
            if inside_body:
                body_rows.append(row_idx)
        elif token in ['<td>', '<td']:
            # A new cell starts: give it the next id and reset its spans.
            line_idx += 1
            col_idx += 1
            col_span = 1
            row_span = 1
            # Skip slots already claimed by a rowspan from an earlier row.
            while get_cell_val(col_idx, row_idx) != -1:
                col_idx += 1
        elif 'colspan' in token:
            col_span = parse_span_val(token)
        elif 'rowspan' in token:
            row_span = parse_span_val(token)
        elif token == '</td>':
            # The cell is complete: stamp its id over the whole spanned area.
            for cur_row_idx in range(row_idx, row_idx + row_span):
                for cur_col_idx in range(col_idx, col_idx + col_span):
                    set_cell_val(cur_col_idx, cur_row_idx, line_idx)
            col_idx += col_span - 1

    check_continuous(head_rows)
    check_continuous(body_rows)
    # Every grid row must belong to the head or the body section.
    assert len(set(head_rows) | set(body_rows)) == len(layout)
    layout = np.array(layout)
    # No slot may be left unassigned (-1).
    assert np.all(layout >= 0)

    cells_info = []
    for cell in html['html']['cells']:
        cell_info = dict(transcript=cell['tokens'])
        if 'bbox' in cell:
            x1, y1, x2, y2 = cell['bbox']
            cell_info['bbox'] = [x1, y1, x2, y2]
            # Single rectangular contour listed from (x1, y1) around the box.
            cell_info['segmentation'] = [[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]]
        cells_info.append(cell_info)

    table = dict(
        layout=layout,
        cells=cells_info,
        head_rows=head_rows,
        body_rows=body_rows
    )
    return table
def segmentation_to_bbox(segmentation):
    """Return the axis-aligned box ``[x1, y1, x2, y2]`` enclosing all contours.

    Args:
        segmentation: Sequence of contours; each contour is a sequence of
            points indexable as ``pt[0]`` (x) and ``pt[1]`` (y).

    Raises:
        ValueError: if ``segmentation`` or any contour in it is empty.
    """
    def extreme(pick, axis):
        # Per-contour extreme along one axis, then the extreme of those.
        return pick(pick(pt[axis] for pt in contour) for contour in segmentation)

    left = extreme(min, 0)
    top = extreme(min, 1)
    right = extreme(max, 0)
    bottom = extreme(max, 1)
    return [left, top, right, bottom]
def table_to_html(table):
    """Serialise a grid-based table dict into a tokenised HTML annotation.

    NOTE(review): the reviewed copy had its HTML tag string literals stripped
    (several ``append`` calls were left as broken or empty literals and the
    ``'<td'`` append was fused with the rowspan test). The tag literals are
    restored here from the surviving ``' rowspan="%d"'`` / ``' colspan="%d"'``
    / ``'>'`` tokens -- confirm the exact tag names against upstream.

    Args:
        table: Mapping with ``layout`` (2-D numpy array of cell indices),
            ``cells`` (list of dicts with ``transcript`` and optionally
            ``segmentation``) and ``body_rows`` (row indices that belong to
            the table body).

    Returns:
        ``{'html': {'cells': [...], 'structure': {'tokens': [...]}}}`` where
        each cell dict has ``tokens`` and, if a segmentation was present,
        ``bbox``.

    Raises:
        AssertionError: if a cell does not cover a full rectangle of the
            layout, or if cell indices do not first appear in row-major order.
    """
    layout = table['layout']
    body_rows = set(table['body_rows'])

    # For every cell, the half-open [start, stop) row and column ranges it covers.
    cells_span = []
    for cell_idx in range(len(table['cells'])):
        cell_positions = np.argwhere(layout == cell_idx)
        row_span = [np.min(cell_positions[:, 0]), np.max(cell_positions[:, 0]) + 1]
        col_span = [np.min(cell_positions[:, 1]), np.max(cell_positions[:, 1]) + 1]
        # The bounding rectangle must be filled entirely by this cell.
        assert np.all(layout[row_span[0]:row_span[1], col_span[0]:col_span[1]] == cell_idx)
        cells_span.append([row_span, col_span])

    cells = []
    tokens = ['<thead>']
    inside_head = True
    for row_idx in range(layout.shape[0]):
        # The first body row closes the head section and opens the body.
        if row_idx in body_rows and inside_head:
            tokens.append('</thead>')
            tokens.append('<tbody>')
            inside_head = False
        tokens.append('<tr>')
        for col_idx in range(layout.shape[1]):
            cell_idx = layout[row_idx][col_idx]
            assert cell_idx <= len(cells)
            # Emit a cell only at its first (top-left) occurrence in the grid.
            if cell_idx == len(cells):
                row_span, col_span = cells_span[cell_idx]
                n_rows = row_span[1] - row_span[0]
                n_cols = col_span[1] - col_span[0]
                if n_rows == 1 and n_cols == 1:
                    tokens.append('<td>')
                else:
                    # Spanning cells are emitted as split tokens: tag start,
                    # one token per span attribute, then the closing bracket.
                    tokens.append('<td')
                    if n_rows > 1:
                        tokens.append(' rowspan="%d"' % n_rows)
                    if n_cols > 1:
                        tokens.append(' colspan="%d"' % n_cols)
                    tokens.append('>')
                tokens.append('</td>')
                source_cell = table['cells'][cell_idx]
                cell = dict()
                cell['tokens'] = source_cell['transcript']
                if 'segmentation' in source_cell:
                    cell['bbox'] = segmentation_to_bbox(source_cell['segmentation'])
                cells.append(cell)
        tokens.append('</tr>')

    # A table without body rows still gets an (empty) body section.
    if inside_head:
        tokens.append('</thead>')
        tokens.append('<tbody>')
    tokens.append('</tbody>')

    html = dict(
        html=dict(
            cells=cells,
            structure=dict(
                tokens=tokens
            )
        )
    )
    return html
def format_html_for_vis(html):
    """Render a tokenised HTML annotation as a prettified HTML document.

    NOTE(review): in the reviewed copy the template string had lost all of its
    markup including the ``%s`` placeholder (so the ``%`` formatting raised
    ``TypeError``), and the cell regex had lost its tag parts. The regex is
    restored from its surviving ``]*>)`` remainder; the wrapper markup and
    styling below are a reconstruction, not recovered text -- confirm against
    upstream and adjust as needed.

    Args:
        html: Mapping with ``html['html']['structure']['tokens']`` (structure
            tokens) and ``html['html']['cells']`` (dicts with ``'tokens'``).

    Returns:
        The document string produced by BeautifulSoup's ``prettify()``.

    Raises:
        AssertionError: if the number of empty ``<td ...></td>`` pairs in the
            structure differs from the number of cells.
    """
    # '%%' keeps a literal percent sign through the %-formatting below.
    html_string = '''<html>
<head>
<meta charset="UTF-8">
<style>
table, th, td {
  border: 1px solid black;
  font-size: 10px;
}
</style>
</head>
<body>
<table frame="hsides" rules="groups" width="100%%">
%s
</table>
</body>
</html>''' % ''.join(html['html']['structure']['tokens'])
    # Group 1 is the opening tag (with attributes), group 2 the closing tag.
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    cells = [''.join(c['tokens']) for c in html['html']['cells']]
    # Stitch the document back together with each cell's content placed
    # between its opening and closing tag.
    pieces = []
    cursor = 0
    for node, cell in zip(cell_nodes, cells):
        pieces.append(html_string[cursor:node.end(1)])
        pieces.append(cell)
        cursor = node.start(2)
    pieces.append(html_string[cursor:])
    html_string = ''.join(pieces)
    # prettify the html
    soup = bs(html_string)
    html_string = soup.prettify()
    return html_string
def format_html(html):
    """Render a tokenised HTML annotation as a single HTML string.

    NOTE(review): in the reviewed copy the template string had lost all of its
    markup including the ``%s`` placeholder (so the ``%`` formatting raised
    ``TypeError``), and the cell regex had lost its tag parts. The regex is
    restored from its surviving ``]*>)`` remainder; the exact wrapper markup
    is a reconstruction -- confirm against upstream.

    Args:
        html: Mapping with ``html['html']['structure']['tokens']`` (structure
            tokens) and ``html['html']['cells']`` (dicts with ``'tokens'``).

    Returns:
        The HTML string with every cell's joined tokens inserted between its
        opening and closing cell tags.

    Raises:
        AssertionError: if the number of empty ``<td ...></td>`` pairs in the
            structure differs from the number of cells.
    """
    html_string = '''<html><body><table>%s</table></body></html>''' % ''.join(html['html']['structure']['tokens'])
    # Group 1 is the opening tag (with attributes), group 2 the closing tag.
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    cells = [''.join(c['tokens']) for c in html['html']['cells']]
    # Stitch the document back together with each cell's content placed
    # between its opening and closing tag.
    pieces = []
    cursor = 0
    for node, cell in zip(cell_nodes, cells):
        pieces.append(html_string[cursor:node.end(1)])
        pieces.append(cell)
        cursor = node.start(2)
    pieces.append(html_string[cursor:])
    return ''.join(pieces)
def format_table_layout(table):
    """Format the layout grid as text, showing each cell's line indices.

    Every slot of ``table['table']['layout']`` is replaced by the
    comma-joined ``lines_idx`` of the cell it refers to, and the resulting
    grid of strings is passed to ``format_table`` with ``padding=1``.
    """
    grid = table['table']['layout']
    lines_per_cell = [entry['lines_idx'] for entry in table['table']['cells']]
    rendered_rows = [
        [','.join(str(line) for line in lines_per_cell[cell_idx]) for cell_idx in row]
        for row in grid
    ]
    return format_table(rendered_rows, padding=1)
def remove_blank_cell(html):
    """Return ``html`` with every cell that has no content removed.

    NOTE(review): in the reviewed copy the ``while`` condition and the first
    two index computations had been stripped together with the tag literals,
    leaving a syntax error. The ``'<td'`` / ``'>'`` / ``'</td>'`` literals are
    reconstructed from the surviving index arithmetic -- confirm against
    upstream.

    A cell is blank when its closing tag immediately follows the ``>`` of its
    opening tag; cells containing anything, even whitespace, are kept.

    Args:
        html: HTML string to scan.

    Returns:
        The string with blank ``<td ...></td>`` elements cut out.

    Raises:
        ValueError: if an opening cell tag has no ``>`` or no closing tag
            after it.
    """
    open_tag = '<td'
    close_tag = '</td>'
    # Search by position rather than re-slicing the string on every step.
    start_idx = html.find(open_tag)
    while start_idx != -1:
        content_start_idx = html.index('>', start_idx) + 1
        content_end_idx = html.index(close_tag, content_start_idx)
        end_idx = content_end_idx + len(close_tag)
        if content_end_idx == content_start_idx:
            # Blank cell: cut it out and rescan from the same position.
            html = html[:start_idx] + html[end_idx:]
            search_from = start_idx
        else:
            search_from = end_idx
        start_idx = html.find(open_tag, search_from)
    return html