|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List |
|
|
|
|
|
from .relation import Relation |
|
|
from .table import Table, Chunk |
|
|
|
|
|
|
|
|
DIR_HORIZ = 1 |
|
|
DIR_VERT = 2 |
|
|
DIR_SAME_CELL = 3 |
|
|
|
|
|
|
|
|
def normalize(s:str, rule=0): |
|
|
if rule == 0: |
|
|
s = s.replace("\r", "") |
|
|
s = s.replace("\n", "") |
|
|
s = s.replace(" ", "") |
|
|
s = s.replace("\t", "") |
|
|
return s.upper() |
|
|
else: |
|
|
raise NotImplementedError |
|
|
|
|
|
|
|
|
def eval_relations(gt:List[List], res:List[List], cmp_blank=True): |
|
|
"""Evaluate results |
|
|
|
|
|
Args: |
|
|
gt: a list of list of Relation |
|
|
res: a list of list of Relation |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
assert len(gt) == len(res) |
|
|
tot_prec = 0 |
|
|
tot_recall = 0 |
|
|
total = 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
idx, t = 0, len(gt) |
|
|
for _gt, _res in zip(gt, res): |
|
|
idx += 1 |
|
|
print('Eval %d/%d (%d%%)' % (idx, t, idx / t * 100), ' ' * 45, end='\r') |
|
|
corr = compare_rel(_gt, _res, cmp_blank) |
|
|
precision = corr / len(_res) if len(_res) != 0 else 0 |
|
|
recall = corr / len(_gt) if len(_gt) != 0 else 0 |
|
|
tot_prec += precision |
|
|
tot_recall += recall |
|
|
total += 1 |
|
|
|
|
|
|
|
|
precision = tot_prec / total |
|
|
recall = tot_recall / total |
|
|
|
|
|
|
|
|
return precision, recall |
|
|
|
|
|
def compare_rel(gt_rel:List[Relation], res_rel:List[Relation], cmp_blank=True): |
|
|
count = 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dup_res_rel = [r for r in res_rel] |
|
|
for gt in gt_rel: |
|
|
to_rm = None |
|
|
for i, res in enumerate(dup_res_rel): |
|
|
if gt.equal(res, cmp_blank): |
|
|
to_rm = i |
|
|
count += 1 |
|
|
break |
|
|
if to_rm is not None: |
|
|
dup_res_rel = dup_res_rel[:i] + dup_res_rel[i + 1:] |
|
|
|
|
|
return count |
|
|
|
|
|
def Table2Relations(t:Table): |
|
|
"""Convert a Table object to a List of Relation. |
|
|
""" |
|
|
ret = [] |
|
|
cl = t.coo2cell_id |
|
|
|
|
|
used = set() |
|
|
|
|
|
|
|
|
for r in range(t.row_n): |
|
|
for cFrom in range(t.col_n - 1): |
|
|
cTo = cFrom + 1 |
|
|
loop = True |
|
|
while loop and cTo < t.col_n: |
|
|
fid, tid = cl[r][cFrom], cl[r][cTo] |
|
|
if fid != -1 and tid != -1 and fid != tid: |
|
|
if (fid, tid) not in used: |
|
|
ret.append(Relation( |
|
|
from_text=t.cells[fid].text, |
|
|
to_text=t.cells[tid].text, |
|
|
direction=DIR_HORIZ, |
|
|
from_id=fid, |
|
|
to_id=tid, |
|
|
no_blanks=cTo - cFrom - 1 |
|
|
)) |
|
|
used.add((fid, tid)) |
|
|
loop = False |
|
|
else: |
|
|
if fid != -1 and tid != -1 and fid == tid: |
|
|
cFrom = cTo |
|
|
cTo += 1 |
|
|
|
|
|
|
|
|
for c in range(t.col_n): |
|
|
for rFrom in range(t.row_n - 1): |
|
|
rTo = rFrom + 1 |
|
|
loop = True |
|
|
while loop and rTo < t.row_n: |
|
|
fid, tid = cl[rFrom][c], cl[rTo][c] |
|
|
if fid != -1 and tid != -1 and fid != tid: |
|
|
if (fid, tid) not in used: |
|
|
ret.append(Relation( |
|
|
from_text=t.cells[fid].text, |
|
|
to_text=t.cells[tid].text, |
|
|
direction=DIR_VERT, |
|
|
from_id=fid, |
|
|
to_id=tid, |
|
|
no_blanks=rTo - rFrom - 1 |
|
|
)) |
|
|
used.add((fid, tid)) |
|
|
loop = False |
|
|
else: |
|
|
if fid != -1 and tid != -1 and fid == tid: |
|
|
rFrom = rTo |
|
|
rTo += 1 |
|
|
|
|
|
return ret |
|
|
|
|
|
def json2Table(json_obj, tid="", splitted_content=False): |
|
|
"""Construct a Table object from json object |
|
|
|
|
|
Args: |
|
|
json_obj: a json object |
|
|
Returns: |
|
|
a Table object |
|
|
""" |
|
|
jo = json_obj["cells"] |
|
|
row_n, col_n = 0, 0 |
|
|
cells = [] |
|
|
for co in jo: |
|
|
content = co["content"] |
|
|
if content is None: continue |
|
|
if splitted_content: |
|
|
content = " ".join(content) |
|
|
else: |
|
|
content = content.strip() |
|
|
if content == "": continue |
|
|
start_row = co["start_row"] |
|
|
end_row = co["end_row"] |
|
|
start_col = co["start_col"] |
|
|
end_col = co["end_col"] |
|
|
row_n = max(row_n, end_row) |
|
|
col_n = max(col_n, end_col) |
|
|
cell = Chunk(content, (start_row, end_row, start_col, end_col)) |
|
|
cells.append(cell) |
|
|
return Table(row_n + 1, col_n + 1, cells, tid) |
|
|
|
|
|
def json2Relations(json_obj, splitted_content): |
|
|
return Table2Relations(json2Table(json_obj, "", splitted_content)) |
|
|
|
|
|
|
|
|
|