File size: 9,478 Bytes

cb0ad2d

import re
import copy
import Polygon
import numpy as np
from bs4 import BeautifulSoup as bs
from .time_counter import format_table


def check_continuous(seq):
    if len(seq) > 0:
        pre_val = seq[0]
        for val in seq[1:]:
            assert pre_val + 1 == val
            pre_val = val

def table_to_latex(table):
    def cal_cls_id(transcript):
        transcript = ''.join(transcript)
        if transcript == '':
            return '</none>'
        elif transcript == '<b> </b>':
            return '</bold>'
        elif transcript == ' ':
            return '</space>'
        else:
            return '</line>'
    assert table['layout'].max() + 1 == len(table['cells'])
    latex = [cal_cls_id(cell['transcript']) for cell in table['cells']]
    return latex

def html_to_table(html):
    tokens = html['html']['structure']['tokens']

    layout = [[]]

    def extend_table(x, y):
        assert (x >= 0) and (y >= 0)
        nonlocal layout

        if x >= len(layout[0]):
            for row in layout:
                row.extend([-1] * (x - len(row) + 1))
        
        if y >= len(layout):
            for _ in range(y - len(layout) + 1):
                layout.append([-1] * len(layout[0]))

    def set_cell_val(x, y, val):
        assert (x >= 0) and (y >= 0)
        nonlocal layout
        extend_table(x, y)
        layout[y][x] = val

    def get_cell_val(x, y):
        assert (x >= 0) and (y >= 0)
        nonlocal layout
        extend_table(x, y)
        return layout[y][x]

    def parse_span_val(token):
        span_val = int(token[token.index('"') + 1:token.rindex('"')])
        return span_val

    def maskout_left_rows():
        nonlocal row_idx, layout
        layout = layout[:max(row_idx+1, 1)]

    row_idx = -1
    col_idx = -1
    line_idx = -1
    inside_head = False
    inside_body = False
    head_rows = list()
    body_rows = list()
    col_span = 1
    row_span = 1
    for token in tokens:
        if token == '<thead>':
            inside_head = True
            maskout_left_rows()
        elif token == '</thead>':
            inside_head = False
            maskout_left_rows()
        elif token == '<tbody>':
            inside_body = True
            maskout_left_rows()
        elif token == '</tbody>':
            inside_body = False
            maskout_left_rows()
        elif token == '<tr>':
            row_idx += 1
            col_idx = -1
            if inside_head:
                head_rows.append(row_idx)
            if inside_body:
                body_rows.append(row_idx)
        elif token in ['<td>', '<td']:
            line_idx += 1
            col_idx += 1
            row_span = 1
            col_span = 1
            while get_cell_val(col_idx, row_idx) != -1:
                col_idx += 1
        elif 'colspan' in token:
            col_span = parse_span_val(token)
        elif 'rowspan' in token:
            row_span = parse_span_val(token)
        elif token == '</td>':
            for cur_row_idx in range(row_idx, row_idx + row_span):
                for cur_col_idx in range(col_idx, col_idx + col_span):
                    set_cell_val(cur_col_idx, cur_row_idx, line_idx)
            col_idx += col_span - 1

    check_continuous(head_rows)
    check_continuous(body_rows)
    assert len(set(head_rows) | set(body_rows)) == len(layout)
    layout = np.array(layout)
    assert np.all(layout >= 0)

    cells_info = list()
    for cell_idx, cell in enumerate(html['html']['cells']):
        transcript = cell['tokens']
        cell_info = dict(
            transcript=transcript
        )
        if 'bbox' in cell:
            x1, y1, x2, y2 = cell['bbox']
            cell_info['bbox'] = [x1, y1, x2, y2]
            cell_info['segmentation'] = [[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]]
        cells_info.append(cell_info)
    
    table = dict(
        layout=layout,
        cells=cells_info,
        head_rows=head_rows,
        body_rows=body_rows
    )
    return table


def segmentation_to_bbox(segmentation):
    x1 = min([min([pt[0] for pt in contour]) for contour in segmentation])
    y1 = min([min([pt[1] for pt in contour]) for contour in segmentation])
    x2 = max([max([pt[0] for pt in contour]) for contour in segmentation])
    y2 = max([max([pt[1] for pt in contour]) for contour in segmentation])
    return [x1, y1, x2, y2]


def table_to_html(table):
    layout = table['layout']
    head_rows = table['head_rows']
    body_rows = table['body_rows']

    cells_span = list()
    for cell_idx in range(len(table['cells'])):
        cell_positions = np.argwhere(layout == cell_idx)
        row_span = [np.min(cell_positions[:, 0]), np.max(cell_positions[:, 0]) + 1]
        col_span = [np.min(cell_positions[:, 1]), np.max(cell_positions[:, 1]) + 1]
        assert np.all(layout[row_span[0]:row_span[1], col_span[0]:col_span[1]] == cell_idx)
        cells_span.append([row_span, col_span])

    cells = list()
    tokens = ['<thead>']
    inside_head = True
    for row_idx in range(layout.shape[0]):
        if row_idx in body_rows:
            if inside_head:
                tokens.append('</thead>')
                tokens.append('<tbody>')
                inside_head = False
        tokens.append('<tr>')
        for col_idx in range(table['layout'].shape[1]):
            cell_idx = layout[row_idx][col_idx]
            assert cell_idx <= len(cells)
            if cell_idx == len(cells):
                row_span, col_span = cells_span[cell_idx]
                if (row_span[1] - row_span[0]) == 1 and (col_span[1] - col_span[0] == 1):
                    tokens.append('<td>')
                else:
                    tokens.append('<td')
                    if (row_span[1] - row_span[0]) > 1:
                        tokens.append(' rowspan="%d"' % (row_span[1] - row_span[0]))
                    if (col_span[1] - col_span[0]) > 1:
                        tokens.append(' colspan="%d"' % (col_span[1] - col_span[0]))
                    tokens.append('>')
                tokens.append('</td>')
                
                cell = dict()
                cell['tokens'] = table['cells'][cell_idx]['transcript']
                if 'segmentation' in table['cells'][cell_idx]:
                    cell['bbox'] = segmentation_to_bbox(table['cells'][cell_idx]['segmentation'])
                cells.append(cell)
        tokens.append('</tr>')
    if inside_head:
        tokens.append('</thead>')
        tokens.append('<tbody>')
    tokens.append('</tbody>')

    html = dict(
        html=dict(
            cells=cells,
            structure=dict(
                tokens=tokens
            )
        )
    )
    return html


def format_html_for_vis(html):
    html_string = '''<html>
                     <head>
                     <meta charset="UTF-8">
                     <style>
                     table, th, td {
                       border: 1px solid black;
                       font-size: 10px;
                     }
                     </style>
                     </head>
                     <body>
                     <table frame="hsides" rules="groups" width="100%%">
                         %s
                     </table>
                     </body>
                     </html>''' % ''.join(html['html']['structure']['tokens'])
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    cells = [''.join(c['tokens']) for c in html['html']['cells']]
    offset = 0
    for n, cell in zip(cell_nodes, cells):
        html_string = html_string[:n.end(1) + offset] + cell + html_string[n.start(2) + offset:]
        offset += len(cell)
    # prettify the html
    soup = bs(html_string)
    html_string = soup.prettify()
    return html_string


def format_html(html):
    html_string = '''<html><body><table>%s</table></body></html>''' % ''.join(html['html']['structure']['tokens'])
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    cells = [''.join(c['tokens']) for c in html['html']['cells']]
    offset = 0
    for n, cell in zip(cell_nodes, cells):
        html_string = html_string[:n.end(1) + offset] + cell + html_string[n.start(2) + offset:]
        offset += len(cell)
    return html_string


def format_table_layout(table):
    layout = table['table']['layout']
    cell_lines = [cell['lines_idx'] for cell in table['table']['cells']]

    table_cells_info = list()
    for row in layout:
        row_cells_info = list()
        for cell_idx in row:
            cell_str = ','.join([str(item) for item in cell_lines[cell_idx]])
            row_cells_info.append(cell_str)
        table_cells_info.append(row_cells_info)
    
    return format_table(table_cells_info, padding=1)


def remove_blank_cell(html):
    start_idx = 0
    while '<td' in html[start_idx:]:
        start_idx = html[start_idx:].index('<td') + start_idx
        content_start_idx = html[start_idx:].index('>') + 1 + start_idx
        content_end_idx = html[content_start_idx:].index('</td>') + content_start_idx
        end_idx = content_end_idx + len('</td>')
        if content_end_idx == content_start_idx:
            html = html[:start_idx] + html[end_idx:]
        else:
            start_idx = end_idx
    return html