import re
import copy

import Polygon
import numpy as np
from bs4 import BeautifulSoup as bs

from .time_counter import format_table

# NOTE(review): this module was recovered from a copy in which every
# ``<...>`` span had been stripped, which emptied the HTML tag literals and
# removed some code that sat between a '<' and the next '>'.  Literals and
# branches marked NOTE(review) below are reconstructions; confirm them
# against version control before relying on them.


def check_continuous(seq):
    """Assert that ``seq`` is a run of consecutive, increasing integers.

    Empty and single-element sequences pass trivially.

    Raises:
        AssertionError: if an element is not exactly its predecessor + 1.
    """
    for pre_val, val in zip(seq, seq[1:]):
        assert pre_val + 1 == val, \
            'sequence is not continuous: %r is followed by %r' % (pre_val, val)


def table_to_latex(table):
    """Map every cell of ``table`` to a per-cell label string.

    Args:
        table: dict with a ``'layout'`` numpy array of cell indices and a
            ``'cells'`` list whose items carry a ``'transcript'`` token list.

    Returns:
        A list with one label per entry of ``table['cells']``.

    Raises:
        AssertionError: if the largest index in the layout does not match
            the number of cells.
    """
    def cal_cls_id(transcript):
        transcript = ''.join(transcript)
        # NOTE(review): the four returned labels, and probably one of the two
        # identical ' ' comparisons, were tag-like literals that have been
        # stripped.  Nothing in this file determines their values, so they
        # are left exactly as found; restore them from history.
        if transcript == '':
            return ''
        elif transcript == ' ':
            return ''
        elif transcript == ' ':
            return ''
        else:
            return ''

    assert table['layout'].max() + 1 == len(table['cells'])
    latex = [cal_cls_id(cell['transcript']) for cell in table['cells']]
    return latex


def html_to_table(html):
    """Convert a tokenised HTML table annotation into a layout-grid table.

    Args:
        html: dict shaped like ``{'html': {'structure': {'tokens': [...]},
            'cells': [{'tokens': [...], 'bbox': [x1, y1, x2, y2]}, ...]}}``;
            ``'bbox'`` is optional per cell.

    Returns:
        dict with
            ``layout``    - 2-D numpy array; ``layout[row][col]`` is the index
                            of the cell occupying that grid slot,
            ``cells``     - list of dicts with ``transcript`` and, when a bbox
                            was given, ``bbox`` and a rectangular
                            ``segmentation``,
            ``head_rows`` / ``body_rows`` - row indices seen inside the head
                            and body sections respectively.

    Raises:
        AssertionError: if head/body rows are not contiguous, do not cover
            the whole layout, or some grid slot is left unassigned.
    """
    tokens = html['html']['structure']['tokens']

    # Grid of cell indices, grown on demand; -1 marks an unassigned slot.
    layout = [[]]

    def extend_table(x, y):
        """Grow ``layout`` so that column ``x`` and row ``y`` exist."""
        assert (x >= 0) and (y >= 0)
        if x >= len(layout[0]):
            for row in layout:
                row.extend([-1] * (x - len(row) + 1))
        if y >= len(layout):
            for _ in range(y - len(layout) + 1):
                layout.append([-1] * len(layout[0]))

    def set_cell_val(x, y, val):
        assert (x >= 0) and (y >= 0)
        extend_table(x, y)
        layout[y][x] = val

    def get_cell_val(x, y):
        assert (x >= 0) and (y >= 0)
        extend_table(x, y)
        return layout[y][x]

    def parse_span_val(token):
        """Return the integer between the first and last double quote."""
        span_val = int(token[token.index('"') + 1:token.rindex('"')])
        return span_val

    def maskout_left_rows():
        """Drop rows below the current one when a section starts or ends.

        Such rows can only exist because a rowspan reached past the rows
        actually emitted so far.
        """
        nonlocal layout
        layout = layout[:max(row_idx + 1, 1)]

    row_idx = -1
    col_idx = -1
    line_idx = -1  # running cell index, in order of appearance
    inside_head = False
    inside_body = False
    head_rows = list()
    body_rows = list()
    col_span = 1
    row_span = 1

    # NOTE(review): tag literals reconstructed from the section/row state
    # they drive and from the tokens emitted by ``table_to_html``.
    for token in tokens:
        if token == '<thead>':
            inside_head = True
            maskout_left_rows()
        elif token == '</thead>':
            inside_head = False
            maskout_left_rows()
        elif token == '<tbody>':
            inside_body = True
            maskout_left_rows()
        elif token == '</tbody>':
            inside_body = False
            maskout_left_rows()
        elif token == '<tr>':
            row_idx += 1
            col_idx = -1
            if inside_head:
                head_rows.append(row_idx)
            if inside_body:
                body_rows.append(row_idx)
        elif token in ['<td>', '<td']:
            # NOTE(review): the body of this branch and the two span branches
            # below were lost entirely; this reconstruction is inferred from
            # the otherwise unused helpers ``get_cell_val`` and
            # ``parse_span_val`` and from the fill loop under '</td>'.
            line_idx += 1
            col_idx += 1
            col_span = 1
            row_span = 1
            # Skip slots already claimed by a rowspan from an earlier row.
            while get_cell_val(col_idx, row_idx) != -1:
                col_idx += 1
        elif 'colspan' in token:
            col_span = parse_span_val(token)
        elif 'rowspan' in token:
            row_span = parse_span_val(token)
        elif token == '</td>':
            for cur_row_idx in range(row_idx, row_idx + row_span):
                for cur_col_idx in range(col_idx, col_idx + col_span):
                    set_cell_val(cur_col_idx, cur_row_idx, line_idx)
            # Leave col_idx on the last column covered by this cell.
            col_idx += col_span - 1

    check_continuous(head_rows)
    check_continuous(body_rows)
    assert len(set(head_rows) | set(body_rows)) == len(layout)

    layout = np.array(layout)
    assert np.all(layout >= 0)

    cells_info = list()
    for cell in html['html']['cells']:
        cell_info = dict(transcript=cell['tokens'])
        if 'bbox' in cell:
            x1, y1, x2, y2 = cell['bbox']
            cell_info['bbox'] = [x1, y1, x2, y2]
            # Axis-aligned rectangle expressed as a single 4-point contour.
            cell_info['segmentation'] = [[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]]
        cells_info.append(cell_info)

    table = dict(
        layout=layout,
        cells=cells_info,
        head_rows=head_rows,
        body_rows=body_rows
    )
    return table


def segmentation_to_bbox(segmentation):
    """Return the ``[x1, y1, x2, y2]`` box enclosing every contour point.

    Args:
        segmentation: list of contours, each a list of ``[x, y]`` points.

    Raises:
        ValueError: if ``segmentation`` contains no points at all.
    """
    xs = [pt[0] for contour in segmentation for pt in contour]
    ys = [pt[1] for contour in segmentation for pt in contour]
    return [min(xs), min(ys), max(xs), max(ys)]


def table_to_html(table):
    """Convert a layout-grid table back into a tokenised HTML annotation.

    Inverse of ``html_to_table``: cells are emitted in row-major order of
    their first appearance in ``table['layout']``.

    Args:
        table: dict with ``layout`` (2-D numpy array of cell indices),
            ``cells`` (list of dicts with ``transcript`` and optionally
            ``segmentation``), ``head_rows`` and ``body_rows``.

    Returns:
        dict shaped like
        ``{'html': {'cells': [...], 'structure': {'tokens': [...]}}}``.

    Raises:
        AssertionError: if a cell does not occupy a solid rectangle of the
            layout, or cell indices do not first appear in row-major order.
    """
    layout = table['layout']
    head_rows = table['head_rows']
    body_rows = table['body_rows']

    # For every cell: [[row_start, row_end), [col_start, col_end)].
    cells_span = list()
    for cell_idx in range(len(table['cells'])):
        cell_positions = np.argwhere(layout == cell_idx)
        row_span = [np.min(cell_positions[:, 0]), np.max(cell_positions[:, 0]) + 1]
        col_span = [np.min(cell_positions[:, 1]), np.max(cell_positions[:, 1]) + 1]
        assert np.all(layout[row_span[0]:row_span[1], col_span[0]:col_span[1]] == cell_idx)
        cells_span.append([row_span, col_span])

    # NOTE(review): tag literals reconstructed from the open/close order of
    # the appends and the surviving ' rowspan=', ' colspan=' and '>' tokens.
    cells = list()
    tokens = ['<thead>']
    inside_head = True
    for row_idx in range(layout.shape[0]):
        if row_idx in body_rows:
            if inside_head:
                tokens.append('</thead>')
                tokens.append('<tbody>')
                inside_head = False
        tokens.append('<tr>')
        for col_idx in range(table['layout'].shape[1]):
            cell_idx = layout[row_idx][col_idx]
            assert cell_idx <= len(cells)
            # A cell is emitted only at the first grid slot where it shows up.
            if cell_idx == len(cells):
                row_span, col_span = cells_span[cell_idx]
                n_rows = row_span[1] - row_span[0]
                n_cols = col_span[1] - col_span[0]
                if n_rows == 1 and n_cols == 1:
                    tokens.append('<td>')
                else:
                    tokens.append('<td')
                    if n_rows > 1:
                        tokens.append(' rowspan="%d"' % n_rows)
                    if n_cols > 1:
                        tokens.append(' colspan="%d"' % n_cols)
                    tokens.append('>')
                tokens.append('</td>')
                cell = dict()
                cell['tokens'] = table['cells'][cell_idx]['transcript']
                if 'segmentation' in table['cells'][cell_idx]:
                    cell['bbox'] = segmentation_to_bbox(table['cells'][cell_idx]['segmentation'])
                cells.append(cell)
        tokens.append('</tr>')

    # No body row was seen: still close the head and open an empty body.
    if inside_head:
        tokens.append('</thead>')
        tokens.append('<tbody>')
    tokens.append('</tbody>')

    html = dict(
        html=dict(
            cells=cells,
            structure=dict(
                tokens=tokens
            )
        )
    )
    return html


def format_html_for_vis(html):
    """Render a tokenised HTML annotation as one HTML string for viewing.

    The structure tokens are joined into a table and each cell's content
    tokens are spliced in between its opening and closing tag.

    Raises:
        AssertionError: if the number of cell tags in the structure differs
            from ``len(html['html']['cells'])``.
    """
    # NOTE(review): the original wrapper template was stripped and cannot be
    # recovered from this file; this is the PubTabNet-style template.
    # '%%' is a literal percent sign under %-formatting.
    html_string = '''<html>
<head>
<meta charset="UTF-8">
<style>
table, th, td {
  border: 1px solid black;
  font-size: 10px;
}
</style>
</head>
<body>
<table frame="hsides" rules="groups" width="100%%">
%s
</table>
</body>
</html>''' % ''.join(html['html']['structure']['tokens'])

    # Group 1 is the opening tag (with any span attributes), group 2 the
    # closing tag; at this point the cells are still empty.
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'

    cells = [''.join(c['tokens']) for c in html['html']['cells']]
    # Match positions refer to the un-spliced string, so track how much text
    # has been inserted so far.
    offset = 0
    for n, cell in zip(cell_nodes, cells):
        html_string = html_string[:n.end(1) + offset] + cell + html_string[n.start(2) + offset:]
        offset += len(cell)
    return html_string


def format_table_layout(table):
    """Pretty-print a layout grid, showing each cell's line indices.

    Args:
        table: dict whose ``table['table']`` holds ``layout`` (rows of cell
            indices) and ``cells`` (each with a ``lines_idx`` sequence).

    Returns:
        Whatever ``format_table`` returns for the grid of comma-joined
        ``lines_idx`` strings (called with ``padding=1``).
    """
    layout = table['table']['layout']
    cell_lines = [cell['lines_idx'] for cell in table['table']['cells']]
    table_cells_info = [
        [','.join(str(item) for item in cell_lines[cell_idx]) for cell_idx in row]
        for row in layout
    ]
    return format_table(table_cells_info, padding=1)


def remove_blank_cell(html):
    """Remove every table cell that has no content from an HTML string.

    A cell counts as blank when its closing tag immediately follows the end
    of its opening tag.

    Raises:
        ValueError: if an opening cell tag is never terminated or closed.
    """
    # NOTE(review): the '<td' and '</td>' literals and the head of this loop
    # were stripped; reconstructed from the surviving index arithmetic.
    start_idx = 0
    while True:
        start_idx = html.find('<td', start_idx)
        if start_idx < 0:
            break
        content_start_idx = html.index('>', start_idx) + 1
        content_end_idx = html.index('</td>', content_start_idx)
        end_idx = content_end_idx + len('</td>')
        if content_end_idx == content_start_idx:
            # Blank cell: cut it out and rescan from the same position.
            html = html[:start_idx] + html[end_idx:]
        else:
            start_idx = end_idx
    return html