|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
|
|
|
|
from typing import Iterable, List, Tuple |
|
|
|
|
|
|
|
|
def load_chunks(chunk_path):
    """Load every chunk stored in a JSON file.

    The file must contain a JSON object whose ``"chunks"`` entry is a list of
    dicts; each dict is handed to ``Chunk.load_from_dict``.

    A chunk whose ``pos`` has a reversed range (``pos[1] < pos[0]`` or
    ``pos[3] < pos[2]``) is repaired in place by swapping the two bounds, and
    a warning is printed once for that chunk. Without the repair the range
    assertions in ``Box.set_pos`` would reject the chunk.

    Args:
        chunk_path: path of the JSON file to read.

    Returns:
        A list with one ``Chunk`` per entry, in file order.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunks = json.load(f)['chunks']

    ret = []
    for chunk in chunks:
        pos = chunk["pos"]
        repaired = False
        if pos[1] < pos[0]:
            pos[0], pos[1] = pos[1], pos[0]
            repaired = True
        # Only look at the second range when it exists; a wrong length is
        # reported by Chunk.load_from_dict below.
        if len(pos) == 4 and pos[3] < pos[2]:
            pos[2], pos[3] = pos[3], pos[2]
            repaired = True
        if repaired:
            print("Warning load illegal chunk.")
        ret.append(Chunk.load_from_dict(chunk))
    return ret
|
|
|
|
|
|
|
|
class Box(object):
    """A rectangle described by an x-range and a y-range.

    Attributes set by ``set_pos``:
        x1, x2: bounds of the x-range (``x1 <= x2``).
        y1, y2: bounds of the y-range (``y1 <= y2``).
        w, h: ``x2 - x1`` and ``y2 - y1``.
        pos: the coordinates as a tuple ``(x1, x2, y1, y2)``.
    """

    def __init__(self, pos):
        """pos: (x1, x2, y1, y2)"""
        self.set_pos(pos)

    def set_pos(self, pos):
        """Store new coordinates and recompute the derived attributes.

        Args:
            pos: sequence ``(x1, x2, y1, y2)``; a list is accepted as well
                as a tuple.

        Raises:
            AssertionError: if either range has its bounds reversed.
        """
        assert pos[0] <= pos[1]
        assert pos[2] <= pos[3]
        self.x1 = pos[0]
        self.x2 = pos[1]
        self.y1 = pos[2]
        self.y2 = pos[3]
        self.w = self.x2 - self.x1
        self.h = self.y2 - self.y1
        # Always keep a tuple: positions loaded from JSON arrive as lists,
        # which are unhashable and cannot feed the %-format in __str__.
        self.pos = tuple(pos)

    def __lt__(self, other):
        """Order boxes by comparing their ``pos`` tuples element by element."""
        return self.pos < other.pos

    def __contains__(self, other):
        """Return True when ``other`` lies entirely inside this box.

        Touching edges count as inside.
        """
        return (
            other.x1 >= self.x1 and other.x2 <= self.x2
            and other.y1 >= self.y1 and other.y2 <= self.y2
        )

    def __str__(self):
        return 'Box(%d, %d, %d, %d)' % self.pos

    def __hash__(self):
        """Hash of the current ``pos`` tuple (changes if ``set_pos`` is called)."""
        return hash(self.pos)
|
|
|
|
|
|
|
|
class Chunk(Box):
    """A ``Box`` that also carries text.

    Attributes (in addition to those of ``Box``):
        text: the text held by the chunk.
        size: a float supplied by the caller, ``0.0`` by default.
        cell_id: optional identifier supplied by the caller, ``None`` by default.
    """

    def __init__(self, text: str, pos: Tuple, size: float = 0.0, cell_id=None):
        super().__init__(pos)
        self.cell_id = cell_id
        self.size = size
        self.text = text

    def __str__(self):
        fields = (self.text, *self.pos)
        return 'Chunk(text="%s", pos=(%d, %d, %d, %d))' % fields

    def __repr__(self):
        # The repr deliberately mirrors the str form.
        return str(self)

    def dump_as_json_obj(self):
        """Return a dict with the ``text``, ``pos`` and ``cell_id`` fields."""
        return dict(text=self.text, pos=self.pos, cell_id=self.cell_id)

    @classmethod
    def load_from_dict(cls, d):
        """Build an instance from a plain dict.

        ``d`` must hold a ``str`` under ``"text"`` and a 4-item ``"pos"``;
        ``"cell_id"`` is optional. The text is stripped of surrounding
        whitespace before being stored.
        """
        assert type(d) is dict
        text = d["text"]
        assert type(text) is str
        pos = d["pos"]
        assert len(pos) == 4
        return cls(text.strip(), pos, cell_id=d.get("cell_id"))
|
|
|
|
|
|
|
|
class Table(object):
    """
    The output of table segmentation.
    With the Table object, we can get the set of cells
    and their corresponding text.

    A cell's ``x1..x2`` is read as its row span and ``y1..y2`` as its column
    span, both zero-based with inclusive ends.

    Attributes:
        tid: identifier given at construction.
        row_n, col_n: grid dimensions.
        cells: the cells in insertion order.
        coo2cell_id: ``row_n`` x ``col_n`` grid mapping each coordinate to an
            index into ``cells``, or ``-1`` where no cell has been added.
    """

    def __init__(self, row_n, col_n, cells: "Iterable[Chunk]" = None, tid=""):
        """Create a ``row_n`` x ``col_n`` table and add ``cells`` (if any)."""
        self.tid = tid
        self.row_n = row_n
        self.col_n = col_n
        self.coo2cell_id = self._empty_grid()
        self.cells: List[Chunk] = []
        # ``None`` is the documented default and means "start empty".
        if cells is not None:
            for cell in cells:
                self.add_cell(cell)

    def _empty_grid(self):
        """Return a fresh ``row_n`` x ``col_n`` grid filled with ``-1``."""
        return [[-1] * self.col_n for _ in range(self.row_n)]

    def reverse(self, is_col=True):
        """Mirror the table in place.

        With ``is_col`` true the row index of every cell is mirrored
        (row ``i`` becomes ``row_n - 1 - i``); otherwise the column index is
        mirrored. Every cell is replaced by a new ``Chunk`` carrying the same
        text, ``size`` and ``cell_id``.
        """
        old_cells = self.cells
        self.cells = []
        # Start from a clean grid so no stale index survives the mirroring.
        self.coo2cell_id = self._empty_grid()
        last_row = self.row_n - 1
        last_col = self.col_n - 1
        for cell in old_cells:
            # Mirroring swaps the roles of start and end, hence x2/y2 first.
            if is_col:
                pos = (last_row - cell.x2, last_row - cell.x1,
                       cell.y1, cell.y2)
            else:
                pos = (cell.x1, cell.x2,
                       last_col - cell.y2, last_col - cell.y1)
            mirrored = Chunk(
                cell.text,
                pos,
                size=getattr(cell, "size", 0.0),
                cell_id=getattr(cell, "cell_id", None),
            )
            self.add_cell(mirrored)

    def add_cell(self, cell: "Chunk"):
        """Append ``cell`` and point every coordinate it spans at it.

        Coordinates already claimed by an earlier cell are overwritten.

        Raises:
            AssertionError: if the cell extends past the last row or column.
        """
        assert cell.y2 < self.col_n
        assert cell.x2 < self.row_n

        cell_index = len(self.cells)
        for x in range(cell.x1, cell.x2 + 1):
            row = self.coo2cell_id[x]
            for y in range(cell.y1, cell.y2 + 1):
                row[y] = cell_index
        self.cells.append(cell)

    def __getitem__(self, id_tuple):
        """Return the cell covering ``(row_id, col_id)``.

        Raises:
            AssertionError: if an index is past the table bounds.
            IndexError: if no cell covers the coordinate.
        """
        row_id, col_id = id_tuple
        assert row_id < self.row_n and col_id < self.col_n
        cell_index = self.coo2cell_id[row_id][col_id]
        # -1 marks an uncovered coordinate; indexing ``cells`` with it would
        # silently hand back the last cell.
        if cell_index < 0:
            raise IndexError(
                "no cell covers position (%r, %r)" % (row_id, col_id))
        return self.cells[cell_index]