kai-2054 commited on
Commit
cb0ad2d
·
1 Parent(s): 71f63bc

Initial commit: add code

Browse files
Files changed (49) hide show
  1. README.md +13 -0
  2. dataset/extract_ocr.py +155 -0
  3. dataset/process_scitsr.sh +0 -0
  4. dataset/trans2lrc.py +145 -0
  5. dataset/utils/extract_table_lines.py +192 -0
  6. dataset/utils/list_record_cache.py +143 -0
  7. dataset/utils/utils.py +449 -0
  8. libs/configs/__init__.py +24 -0
  9. libs/configs/default.py +77 -0
  10. libs/data/__init__.py +69 -0
  11. libs/data/batch_sampler.py +118 -0
  12. libs/data/dataset.py +164 -0
  13. libs/data/list_record_cache.py +143 -0
  14. libs/data/transform.py +70 -0
  15. libs/data/utils.py +188 -0
  16. libs/model/__init__.py +16 -0
  17. libs/model/backbone.py +281 -0
  18. libs/model/cells_extractor.py +130 -0
  19. libs/model/decoder.py +277 -0
  20. libs/model/divide_predictor.py +57 -0
  21. libs/model/extractor.py +88 -0
  22. libs/model/fpn.py +37 -0
  23. libs/model/model.py +65 -0
  24. libs/model/pan.py +24 -0
  25. libs/model/sa.py +35 -0
  26. libs/model/segment_predictor.py +133 -0
  27. libs/model/utils.py +371 -0
  28. libs/utils/__init__.py +0 -0
  29. libs/utils/cal_f1.py +214 -0
  30. libs/utils/checkpoint.py +47 -0
  31. libs/utils/comm.py +129 -0
  32. libs/utils/context_cacher.py +15 -0
  33. libs/utils/counter.py +43 -0
  34. libs/utils/format_translate.py +278 -0
  35. libs/utils/logger.py +64 -0
  36. libs/utils/metric.py +58 -0
  37. libs/utils/model_synchronizer.py +75 -0
  38. libs/utils/scitsr/__init__.py +0 -0
  39. libs/utils/scitsr/eval.py +179 -0
  40. libs/utils/scitsr/relation.py +59 -0
  41. libs/utils/scitsr/table.py +133 -0
  42. libs/utils/teds.py +212 -0
  43. libs/utils/teds_multiprocess.py +111 -0
  44. libs/utils/time_counter.py +108 -0
  45. libs/utils/utils.py +297 -0
  46. libs/utils/vocab.py +36 -0
  47. requirements.txt +93 -0
  48. runner/train.py +245 -0
  49. runner/valid.py +116 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Training
3
+ emoji: 🦀
4
+ colorFrom: pink
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.45.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
dataset/extract_ocr.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import tqdm
4
+ import numpy as np
5
+ from utlis.list_record_cache import ListRecordCacher, merge_record_file
6
+ from utlis.utlis import get_paths, get_sub_paths, crop_pdf, extract_ocr, refine_table, visualize_table
7
+
8
+
9
def parse_args():
    """Parse command-line arguments: source dir, destination dir, worker count."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('src_dir', type=str, default=None)
    parser.add_argument('dst_dir', type=str, default=None)
    parser.add_argument('-n', '--num_workers', type=int, default=0)
    return parser.parse_args()
17
+
18
+
19
def single_process(paths, dst_dir):
    """Process all tables sequentially.

    For each (pdf, chunk, structure) triple: render/crop the PDF, run the OCR
    extraction, refine the table, attach the structure label path and cache
    the record. Failures are counted and their crops saved for inspection.

    Args:
        paths: list of [pdf_path, chunk_path, structure_path] triples.
        dst_dir: output root; pdf/, img/, error/, visual/ sub-dirs are created.
    """
    output_pdf_dir = os.path.join(dst_dir, 'pdf')
    output_img_dir = os.path.join(dst_dir, 'img')
    output_error_dir = os.path.join(dst_dir, 'error')
    output_visual_dir = os.path.join(dst_dir, 'visual')
    # exist_ok avoids the check-then-create race of the previous version
    for sub_dir in (output_pdf_dir, output_img_dir, output_error_dir, output_visual_dir):
        os.makedirs(sub_dir, exist_ok=True)

    cacher = ListRecordCacher(os.path.join(dst_dir, 'table.lrc'))

    error_paths = []
    error_count = 0
    correct_count = 0
    for idx, path in enumerate(tqdm.tqdm(paths)):  # renamed: `id` shadowed the builtin
        try:
            pdf_path, chunk_path, structure_path = path
            name = os.path.splitext(os.path.basename(pdf_path))[0]

            positions, transcripts = crop_pdf(path, output_pdf_dir)
            table = extract_ocr(path, positions, transcripts)
            table = refine_table(table, os.path.join(output_pdf_dir, name + '.png'), output_img_dir)

            # fixed: the assert message used to be `print(...)`, which yields None
            assert os.path.exists(structure_path), 'structure_path does not exist: %s' % structure_path
            table['label_path'] = structure_path

            visualize_table(os.path.join(output_img_dir, name + '.png'), output_visual_dir, table)

            cacher.add_record(table)
            correct_count += 1
        # fixed: a bare `except:` also swallowed KeyboardInterrupt/SystemExit
        except Exception:
            error_count += 1
            error_paths.append(path)
            crop_pdf(path, output_error_dir)

    print("correct num: %d, error num: %d " % (correct_count, error_count))
    if len(error_paths) > 0:
        np.save(os.path.join(dst_dir, 'error_paths.npy'), error_paths)
    cacher.close()
64
+
65
+
66
def _worker(worker_idx, num_workers, paths, dst_dir, result_queue):
    """Worker-process body for multi_process: handles one shard of `paths`.

    Writes records to a per-worker cache file table_<worker_idx>.lrc and
    reports (correct_count, error_count, error_paths) via `result_queue`.
    The output sub-dirs under dst_dir must already exist.
    """
    output_pdf_dir = os.path.join(dst_dir, 'pdf')
    output_img_dir = os.path.join(dst_dir, 'img')
    output_error_dir = os.path.join(dst_dir, 'error')
    output_visual_dir = os.path.join(dst_dir, 'visual')

    cacher = ListRecordCacher(os.path.join(dst_dir, 'table_%d.lrc' % worker_idx))

    error_paths = []
    error_count = 0
    correct_count = 0
    for idx, path in enumerate(tqdm.tqdm(paths)):  # renamed: `id` shadowed the builtin
        try:
            pdf_path, chunk_path, structure_path = path
            name = os.path.splitext(os.path.basename(pdf_path))[0]

            positions, transcripts = crop_pdf(path, output_pdf_dir)
            table = extract_ocr(path, positions, transcripts)
            table = refine_table(table, os.path.join(output_pdf_dir, name + '.png'), output_img_dir)

            # fixed: the assert message used to be `print(...)`, which yields None
            assert os.path.exists(structure_path), 'structure_path does not exist: %s' % structure_path
            table['label_path'] = structure_path

            visualize_table(os.path.join(output_img_dir, name + '.png'), output_visual_dir, table)
            cacher.add_record(table)
            correct_count += 1
        # fixed: a bare `except:` also swallowed KeyboardInterrupt/SystemExit
        except Exception:
            error_count += 1
            error_paths.append(path)
            crop_pdf(path, output_error_dir)

    # fixed: close the cacher so its index block reaches disk before the
    # parent process merges the per-worker .lrc files
    cacher.close()
    result_queue.put((correct_count, error_count, error_paths))
99
+
100
+
101
def multi_process(path, dst_dir, num_workers):
    """Shard the path triples across `num_workers` processes.

    Each worker writes its own table_<idx>.lrc; afterwards the per-worker
    files are merged into a single table.lrc and the shards removed.

    Args:
        path: list of [pdf_path, chunk_path, structure_path] triples.
        dst_dir: output root directory.
        num_workers: number of worker processes (> 0).
    """
    import multiprocessing
    manager = multiprocessing.Manager()
    result_queue = manager.Queue()

    # fixed: the output sub-dirs were only created by single_process, so a
    # multi-process run on a fresh dst_dir failed inside every worker
    for sub_name in ('pdf', 'img', 'error', 'visual'):
        os.makedirs(os.path.join(dst_dir, sub_name), exist_ok=True)

    workers = list()
    for worker_idx in range(num_workers):
        worker = multiprocessing.Process(
            target=_worker,
            args=(
                worker_idx,
                num_workers,
                path[worker_idx::num_workers],
                dst_dir,
                result_queue
            )
        )
        worker.daemon = True
        worker.start()
        workers.append(worker)

    total_correct_count = 0
    total_error_count = 0
    total_error_paths = []
    for _ in range(num_workers):
        correct_count, error_count, error_paths = result_queue.get()
        total_correct_count += correct_count
        total_error_count += error_count
        total_error_paths.extend(error_paths)

    # fixed: join the (daemon) workers so their .lrc files are completely
    # written and closed before we merge them below
    for worker in workers:
        worker.join()

    print("correct num: %d, error num: %d " % (total_correct_count, total_error_count))
    if len(total_error_paths) > 0:
        np.save(os.path.join(dst_dir, 'error_paths.npy'), total_error_paths)

    # merge each worker lrc into table.lrc, then delete the shards
    cache_paths = glob.glob(os.path.join(dst_dir, '*.lrc'))
    merge_record_file(cache_paths, os.path.join(dst_dir, 'table.lrc'))
    for cache_path in cache_paths:
        os.remove(cache_path)
140
+
141
+
142
def main():
    """Entry point: collect (pdf, chunk, structure) path triples from src_dir
    and run OCR extraction single- or multi-process per --num_workers."""
    args = parse_args()

    paths = get_sub_paths(args.src_dir, ["pdf", "chunk", "structure"], ['.pdf', '.chunk', '.json'])
    # paths = get_paths(args.src_dir, ["pdf", "chunk", "structure"], '/yrfs1/intern/zrzhang6/TSR/Dataset/SciTSR/SciTSR-COMP.list', ['.pdf', '.chunk', '.json'])

    # num_workers == 0 means "run in this process"; anything else forks workers
    if args.num_workers == 0:
        single_process(paths, args.dst_dir)
    else:
        multi_process(paths, args.dst_dir, args.num_workers)


if __name__ == "__main__":
    main()
dataset/process_scitsr.sh ADDED
File without changes
dataset/trans2lrc.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import tqdm
4
+ import numpy as np
5
+ from utlis.list_record_cache import ListRecordCacher, merge_record_file
6
+ from utlis.utlis import get_sub_paths, crop_pdf, crop_cells, visualize_cell, match_cells
7
+
8
+
9
def parse_args():
    """Parse command-line arguments: source dir, destination dir, worker count."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('src_dir', type=str, default=None)
    parser.add_argument('dst_dir', type=str, default=None)
    parser.add_argument('-n', '--num_workers', type=int, default=0)
    return parser.parse_args()
17
+
18
+
19
def single_process(paths, dst_dir):
    """Sequentially convert each (pdf, chunk, structure) triple into a cached
    table record (cells matched to the structure, image cropped to the cells).

    Args:
        paths: list of [pdf_path, chunk_path, structure_path] triples.
        dst_dir: output root; pdf/, img/, visual/, error/ sub-dirs are created.
    """
    output_pdf_dir = os.path.join(dst_dir, 'pdf')
    output_img_dir = os.path.join(dst_dir, 'img')
    output_visual_dir = os.path.join(dst_dir, 'visual')
    output_error_dir = os.path.join(dst_dir, 'error')
    # exist_ok avoids the check-then-create race of the previous version
    for sub_dir in (output_pdf_dir, output_img_dir, output_visual_dir, output_error_dir):
        os.makedirs(sub_dir, exist_ok=True)

    cacher = ListRecordCacher(os.path.join(dst_dir, 'table.lrc'))

    error_paths = []
    error_count = 0
    correct_count = 0
    for idx, path in enumerate(tqdm.tqdm(paths)):  # renamed: `id` shadowed the builtin
        try:
            pdf_path, chunk_path, structure_path = path
            name = os.path.splitext(os.path.basename(pdf_path))[0]
            positions, transcripts = crop_pdf(path, output_pdf_dir)
            table = match_cells([pdf_path, chunk_path, structure_path], positions, transcripts)
            img_path = os.path.join(output_img_dir, name + '.png')
            crop_cells(os.path.join(output_pdf_dir, name + '.png'), output_img_dir, table)
            table['id'] = idx
            table['image_path'] = img_path
            visualize_cell(img_path, output_visual_dir, table)
            cacher.add_record(table)
            correct_count += 1
        # fixed: a bare `except:` also swallowed KeyboardInterrupt/SystemExit
        except Exception:
            error_count += 1
            error_paths.append(path)
            crop_pdf(path, output_error_dir)

    print("correct num: %d, error num: %d " % (correct_count, error_count))
    if len(error_paths) > 0:
        np.save(os.path.join(dst_dir, 'error_paths.npy'), error_paths)
    cacher.close()
59
+
60
+
61
def _worker(worker_idx, num_workers, paths, dst_dir, result_queue):
    """Worker-process body for multi_process: converts one shard of `paths`.

    Writes records to a per-worker cache file table_<worker_idx>.lrc and
    reports (correct_count, error_count, error_paths) via `result_queue`.
    The output sub-dirs under dst_dir must already exist.
    """
    output_pdf_dir = os.path.join(dst_dir, 'pdf')
    output_img_dir = os.path.join(dst_dir, 'img')
    output_visual_dir = os.path.join(dst_dir, 'visual')
    output_error_dir = os.path.join(dst_dir, 'error')

    cacher = ListRecordCacher(os.path.join(dst_dir, 'table_%d.lrc' % worker_idx))

    error_paths = []
    error_count = 0
    correct_count = 0
    for idx, path in enumerate(tqdm.tqdm(paths)):  # renamed: `id` shadowed the builtin
        try:
            pdf_path, chunk_path, structure_path = path
            name = os.path.splitext(os.path.basename(pdf_path))[0]
            positions, transcripts = crop_pdf(path, output_pdf_dir)
            table = match_cells([pdf_path, chunk_path, structure_path], positions, transcripts)
            img_path = os.path.join(output_img_dir, name + '.png')
            crop_cells(os.path.join(output_pdf_dir, name + '.png'), output_img_dir, table)
            # globally-unique id across the interleaved shards
            table['id'] = int(idx * num_workers + worker_idx)
            table['image_path'] = img_path
            visualize_cell(img_path, output_visual_dir, table)
            cacher.add_record(table)
            correct_count += 1
        # fixed: a bare `except:` also swallowed KeyboardInterrupt/SystemExit
        except Exception:
            error_count += 1
            error_paths.append(path)
            crop_pdf(path, output_error_dir)

    # fixed: close the cacher so its index block reaches disk before the
    # parent process merges the per-worker .lrc files
    cacher.close()
    result_queue.put((correct_count, error_count, error_paths))
90
+
91
+
92
def multi_process(path, dst_dir, num_workers):
    """Shard the path triples across `num_workers` processes.

    Each worker writes its own table_<idx>.lrc; afterwards the per-worker
    files are merged into a single table.lrc and the shards removed.

    Args:
        path: list of [pdf_path, chunk_path, structure_path] triples.
        dst_dir: output root directory.
        num_workers: number of worker processes (> 0).
    """
    import multiprocessing
    manager = multiprocessing.Manager()
    result_queue = manager.Queue()

    # fixed: the output sub-dirs were only created by single_process, so a
    # multi-process run on a fresh dst_dir failed inside every worker
    for sub_name in ('pdf', 'img', 'visual', 'error'):
        os.makedirs(os.path.join(dst_dir, sub_name), exist_ok=True)

    workers = list()
    for worker_idx in range(num_workers):
        worker = multiprocessing.Process(
            target=_worker,
            args=(
                worker_idx,
                num_workers,
                path[worker_idx::num_workers],
                dst_dir,
                result_queue
            )
        )
        worker.daemon = True
        worker.start()
        workers.append(worker)

    total_correct_count = 0
    total_error_count = 0
    total_error_paths = []
    for _ in range(num_workers):
        correct_count, error_count, error_paths = result_queue.get()
        total_correct_count += correct_count
        total_error_count += error_count
        total_error_paths.extend(error_paths)

    # fixed: join the (daemon) workers so their .lrc files are completely
    # written and closed before we merge them below
    for worker in workers:
        worker.join()

    print("correct num: %d, error num: %d " % (total_correct_count, total_error_count))
    if len(total_error_paths) > 0:
        np.save(os.path.join(dst_dir, 'error_paths.npy'), total_error_paths)

    # merge each worker lrc into table.lrc, then delete the shards
    cache_paths = glob.glob(os.path.join(dst_dir, '*.lrc'))
    merge_record_file(cache_paths, os.path.join(dst_dir, 'table.lrc'))
    for cache_path in cache_paths:
        os.remove(cache_path)
131
+
132
+
133
def main():
    """Entry point: collect (pdf, chunk, structure) path triples from src_dir
    and run the .lrc conversion single- or multi-process per --num_workers."""
    args = parse_args()

    paths = get_sub_paths(args.src_dir, ["pdf", "chunk", "structure"], ['.pdf', '.chunk', '.json'])

    # num_workers == 0 means "run in this process"; anything else forks workers
    if args.num_workers == 0:
        single_process(paths, args.dst_dir)
    else:
        multi_process(paths, args.dst_dir, args.num_workers)


if __name__ == "__main__":
    main()
dataset/utils/extract_table_lines.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+
4
+
5
class InvalidFormat(Exception):
    """Raised when a table annotation cannot yield a consistent line layout."""
    pass
7
+
8
+
9
def segmentation_to_bbox(segmentation):
    """Return the axis-aligned bounding box (x1, y1, x2, y2) enclosing every
    point of every contour in `segmentation`."""
    points = [pt for contour in segmentation for pt in contour]
    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]
    return (min(xs), min(ys), max(xs), max(ys))
15
+
16
+
17
def cal_cell_bbox(table):
    """Compute one pixel-space bbox per cell from its (sub)line segmentations.

    Prefers the union of 'sublines' segmentations when present; falls back to
    the cell's own 'segmentation'. Cells without a 'segmentation' key or with
    nothing usable map to None. Returns a list parallel to table['cells'].
    """
    cells_bbox = list()
    for cell in table['cells']:
        if 'segmentation' not in cell:
            cells_bbox.append(None)
            continue
        contours = list()
        for subline in cell.get('sublines', []):
            contours.extend(subline['segmentation'])
        if not contours:
            contours = cell['segmentation']
        cells_bbox.append(segmentation_to_bbox(contours) if contours else None)
    return cells_bbox
35
+
36
+
37
def cal_cell_spans(table):
    """Compute each cell's grid span [col1, row1, col2, row2] (inclusive)
    from the layout matrix, where layout[r, c] holds the id of the cell
    occupying grid position (r, c).

    Raises:
        AssertionError: if a cell's occupied positions do not form a solid
            rectangle in the layout.
    """
    layout = table['layout']
    num_cells = len(table['cells'])
    cells_span = list()
    for cell_id in range(num_cells):
        cell_positions = np.argwhere(layout == cell_id)
        y1 = np.min(cell_positions[:, 0])
        y2 = np.max(cell_positions[:, 0])
        x1 = np.min(cell_positions[:, 1])
        x2 = np.max(cell_positions[:, 1])
        # fixed: the check used exclusive slices (layout[y1:y2, x1:x2]), which
        # skipped the last row/column and was vacuous for 1x1 cells; use
        # inclusive bounds so the whole claimed rectangle is verified
        assert np.all(layout[y1:y2 + 1, x1:x2 + 1] == cell_id)
        cells_span.append([x1, y1, x2, y2])
    return cells_span
50
+
51
+
52
def cal_fg_bg_span(spans, edge):
    """Split an ordered list of line spans into foreground and background spans.

    `spans` is an ordered list of [start, end] extents along one axis (rows or
    columns); an entry may be None when that span could not be measured.
    `edge` is the image size along the axis.

    Background spans are the input spans themselves, kept only when strictly
    inside [0, edge] and not touching/overlapping a neighbour (neighbour must
    be known, i.e. not None). Foreground spans are the gaps between
    consecutive input spans, including the leading [0, first_start] and the
    trailing [last_end, edge] gap; gaps next to a None span or of
    non-positive width are dropped.

    Returns:
        (fg_spans, bg_spans)
    """
    num_span = len(spans)
    bg_spans = list()
    for idx in range(num_span):
        if spans[idx] is None:
            continue
        # reject a span touching the left border / its predecessor
        if idx == 0:
            if spans[idx][0] <= 0:
                continue
        else:
            if spans[idx-1] is None:
                continue
            if spans[idx][0] <= spans[idx-1][1]:
                continue
        # reject a span touching the right border / its successor
        if idx == num_span - 1:
            if spans[idx][1] >= edge:
                continue
        else:
            if spans[idx+1] is None:
                continue
            if spans[idx][1] >= spans[idx+1][0]:
                continue

        bg_spans.append(spans[idx])

    fg_spans = list()
    # one candidate gap before each span, plus one after the last
    for idx in range(num_span+1):
        if idx == 0:
            s = 0
        else:
            if spans[idx-1] is None:
                continue
            s = spans[idx-1][1]

        if idx == num_span:
            e = edge
        else:
            if spans[idx] is None:
                continue
            e = spans[idx][0]

        # skip degenerate/inverted gaps
        if e <= s:
            continue

        fg_spans.append([s, e])

    return fg_spans, bg_spans
99
+
100
+
101
def shrink_spans(spans, size):
    """Shrink neighbouring spans so that overlaps are resolved symmetrically.

    For each [start, end] in the ordered `spans`: the first span's start is
    clamped to >= 1 and the last span's end to <= size - 1; when a span
    overlaps its predecessor/successor, its boundary is moved inward by half
    the overlap (the neighbour gives up the other half on its own iteration).

    Raises:
        InvalidFormat: if a span collapses to less than one unit of width.
            (NOTE(review): indentation of this check was ambiguous in the
            original paste; it is applied to every span here — confirm.)
    """
    new_spans = list()
    for idx, (start, end) in enumerate(spans):
        if idx == 0:
            if start <= 0:
                start = 1
        else:
            _, pre_end = spans[idx - 1]
            if start <= pre_end:
                shrink_distance = pre_end - start + 1
                start = start + math.ceil(shrink_distance / 2)

        if idx == len(spans) - 1:
            if end >= size:
                end = size - 1
        else:
            next_start, _ = spans[idx + 1]
            if end >= next_start:
                shrink_distance = end - next_start + 1
                end = end - math.ceil(shrink_distance / 2)
        if end - start < 1:
            raise InvalidFormat()

        new_spans.append([start, end])
    return new_spans
126
+
127
+
128
def cal_row_span(table, cells_span, cells_bbox, height):
    """Derive one pixel-space [y1, y2] span per layout row, then split them
    into foreground/background spans over the image height.

    A row's top edge comes from cells whose grid span starts on that row and
    its bottom edge from cells whose span ends on it; both edges are clamped
    to [1, height - 1].

    Raises:
        InvalidFormat: when a row has no measurable top or bottom edge.

    Returns:
        (rows_fg_span, rows_bg_span)
    """
    layout = table['layout']
    rows_span = list()
    for row_idx in range(layout.shape[0]):
        top_edges = []
        bottom_edges = []
        for cell_id in layout[row_idx, :]:
            bbox = cells_bbox[cell_id]
            if bbox is None:
                continue
            span = cells_span[cell_id]
            if span[1] == row_idx:
                top_edges.append(bbox[1])
            if span[3] == row_idx:
                bottom_edges.append(bbox[3])

        if not top_edges or not bottom_edges:
            raise InvalidFormat()
        y1 = min(max(1, min(top_edges)), height - 1)
        y2 = min(max(1, max(bottom_edges) + 1), height - 1)
        rows_span.append([y1, y2])

    rows_span = shrink_spans(rows_span, height)
    return cal_fg_bg_span(rows_span, height)
152
+
153
+
154
def cal_col_span(table, cells_span, cells_bbox, width):
    """Derive one pixel-space [x1, x2] span per layout column, then split
    them into foreground/background spans over the image width.

    A column's left edge comes from cells whose grid span starts on that
    column and its right edge from cells whose span ends on it; both edges
    are clamped to [1, width - 1].

    Raises:
        InvalidFormat: when a column has no measurable left or right edge.

    Returns:
        (cols_fg_span, cols_bg_span)
    """
    layout = table['layout']
    cols_span = list()
    for col_idx in range(layout.shape[1]):
        left_edges = []
        right_edges = []
        for cell_id in layout[:, col_idx]:
            bbox = cells_bbox[cell_id]
            if bbox is None:
                continue
            span = cells_span[cell_id]
            if span[0] == col_idx:
                left_edges.append(bbox[0])
            if span[2] == col_idx:
                right_edges.append(bbox[2])

        if not left_edges or not right_edges:
            raise InvalidFormat()
        x1 = min(max(1, min(left_edges)), width - 1)
        x2 = min(max(1, max(right_edges) + 1), width - 1)
        cols_span.append([x1, x2])

    cols_span = shrink_spans(cols_span, width)
    return cal_fg_bg_span(cols_span, width)
178
+
179
+
180
def extract_fg_bg_spans(table, image_size):
    """Compute row/column foreground & background spans plus cell grid spans.

    Args:
        table: dict with 'layout' (2D cell-id array) and 'cells'.
        image_size: (width, height) of the table image.

    Returns:
        (rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, cells_span)
    """
    width, height = image_size
    cells_bbox = cal_cell_bbox(table)
    cells_span = cal_cell_spans(table)
    rows_fg_span, rows_bg_span = cal_row_span(table, cells_span, cells_bbox, height)
    cols_fg_span, cols_bg_span = cal_col_span(table, cells_span, cells_bbox, width)
    return rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, cells_span
dataset/utils/list_record_cache.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import threading
4
+
5
+
6
def merge_record_file(files, dst_file):
    """Concatenate the record files in `files` (in order) into `dst_file`.

    The .lrc format is self-delimiting per segment, so plain byte
    concatenation yields a valid merged file (see ListRecordLoader).

    Previously implemented as os.system('cat ...'), which broke on paths
    containing spaces or shell metacharacters and silently ignored failures;
    copy the bytes in Python instead.
    """
    import shutil
    with open(dst_file, 'wb') as dst:
        for file in files:
            with open(file, 'rb') as src:
                shutil.copyfileobj(src, dst)
12
+
13
+
14
class ListRecordCacher:
    """Append-only writer for .lrc record files.

    File layout: an 8-byte big-endian header (patched on close) holding the
    absolute offset of the index block, followed by the records, each stored
    as an 8-byte big-endian length prefix plus the pickled payload. On close,
    the pickled list of [offset, length] record positions is appended as the
    index block (itself length-prefixed) and the header is patched to point
    at it. ListRecordLoader reads this format back.
    """
    OFFSET_LENGTH = 8

    def __init__(self, cache_path):
        # [absolute_offset, payload_length] per record
        self._record_pos_list = list()
        self._cache_file = open(cache_path, 'wb')
        # 8-byte placeholder for the header; patched in _write_record_pos_list
        self._cached_bytes = b'\x00' * self.OFFSET_LENGTH

    def add_record(self, record):
        """Pickle `record` and append it to the cache."""
        record_bytes = pickle.dumps(record)
        return self.add_record_bytes(record_bytes)

    def add_record_bytes(self, record_bytes):
        """Append an already-serialized record (bytes)."""
        bytes_size = len(record_bytes)
        offset_bytes = bytes_size.to_bytes(
            length=self.OFFSET_LENGTH,
            byteorder='big', signed=False
        )
        total_bytes = offset_bytes + record_bytes

        if len(self._record_pos_list) == 0:
            # header (8 bytes) + first record's length prefix (8 bytes)
            cur_record_pos = [self.OFFSET_LENGTH * 2, bytes_size]
        else:
            cur_record_pos = [sum(self._record_pos_list[-1]) + self.OFFSET_LENGTH, bytes_size]
        self._record_pos_list.append(cur_record_pos)

        self._cached_bytes += total_bytes
        # buffer writes and push them to disk in ~1 MiB batches
        if len(self._cached_bytes) > 1024 * 1024:
            self._cache_file.seek(0, 2)
            self._cache_file.write(self._cached_bytes)
            self._cached_bytes = b''

    def flush(self):
        """Write any buffered bytes to the end of the file."""
        if len(self._cached_bytes) > 0:
            self._cache_file.seek(0, 2)
            self._cache_file.write(self._cached_bytes)
            self._cached_bytes = b''

    def _write_record_pos_list(self):
        """Append the pickled index block and patch the file header.

        (fixed typo: was `_wirte_record_pos_list`; internal helper with no
        callers outside this class)
        """
        self.flush()
        self._cache_file.seek(0, 2)
        offset = self._cache_file.tell()
        offset_bytes = offset.to_bytes(
            length=self.OFFSET_LENGTH,
            byteorder='big', signed=False
        )
        self._cache_file.seek(0)
        self._cache_file.write(offset_bytes)

        data_bytes = pickle.dumps(self._record_pos_list)
        bytes_size = len(data_bytes)
        offset_bytes = bytes_size.to_bytes(
            length=self.OFFSET_LENGTH,
            byteorder='big', signed=False
        )
        total_bytes = offset_bytes + data_bytes
        self._cache_file.seek(0, 2)
        self._cache_file.write(total_bytes)

    def close(self):
        """Finalize the file (write index + patch header) and close it.

        Idempotent: safe to call more than once.
        """
        if not self._cache_file.closed:
            self._write_record_pos_list()
            self._cache_file.close()

    def __del__(self):
        # fixed: guard against __init__ having failed before _cache_file
        # was assigned (AttributeError inside __del__ is noisy and masked)
        if getattr(self, '_cache_file', None) is not None:
            self.close()
80
+
81
+
82
class ListRecordLoader:
    """Random-access reader for .lrc files written by ListRecordCacher.

    Also handles files produced by concatenating several .lrc files: each
    segment carries its own index block, and segment-relative offsets are
    rebased to absolute file offsets while scanning. The file handle is
    reopened after a fork (pid change) so child processes do not share a
    file position, and each seek+read pair is serialized with a lock.
    """
    OFFSET_LENGTH = 8

    def __init__(self, load_path):
        self._sync_lock = threading.Lock()
        self._size = os.path.getsize(load_path)
        self._load_path = load_path
        self._open_file()
        self._scan_file()

    def _open_file(self):
        self._pid = os.getpid()
        self._cache_file = open(self._load_path, 'rb')

    def _check_reopen(self):
        # after a fork the descriptor would be shared with the parent
        if os.getpid() != self._pid:
            self._open_file()

    def _scan_file(self):
        """Collect [absolute_offset, length] for every record in every segment."""
        positions = list()
        segment_start = 0
        while segment_start < self._size:
            self._cache_file.seek(segment_start)
            header = self._cache_file.read(self.OFFSET_LENGTH)
            index_offset = segment_start + int.from_bytes(header, byteorder='big', signed=False)
            self._cache_file.seek(index_offset)

            index_size = int.from_bytes(
                self._cache_file.read(self.OFFSET_LENGTH),
                byteorder='big', signed=False
            )
            segment_positions = pickle.loads(self._cache_file.read(index_size))
            assert isinstance(segment_positions, list)
            # rebase segment-relative offsets to absolute file offsets
            positions.extend(
                [offset + segment_start, length]
                for offset, length in segment_positions
            )
            segment_start = self._cache_file.tell()

        self._record_pos_list = positions

    def get_record(self, idx):
        """Load and unpickle record `idx`."""
        self._check_reopen()
        return pickle.loads(self.get_record_bytes(idx))

    def get_record_bytes(self, idx):
        """Return the raw pickled bytes of record `idx`."""
        offset, length = self._record_pos_list[idx]
        with self._sync_lock:
            self._cache_file.seek(offset)
            return self._cache_file.read(length)

    def __len__(self):
        return len(self._record_pos_list)
dataset/utils/utils.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import copy
5
+ import tqdm
6
+ import numpy as np
7
+ import fitz
8
+ from .extract_table_lines import extract_fg_bg_spans
9
+
10
+
11
def get_paths(root_dir, sub_names, names_path, exts, val=None):
    """Build per-sample path groups for the sample names listed in a file.

    Args:
        root_dir: dataset root containing one sub-dir per entry of sub_names.
        sub_names: sub-directory names, e.g. ["pdf", "chunk", "structure"].
        names_path: text file with one sample name per line.
        exts: file extension per sub-dir, aligned with sub_names.
        val: optional cap on the number of names taken (None = all).

    Returns:
        list of lists; each inner list holds one path per sub-dir.
    """
    assert os.path.isdir(root_dir)

    with open(names_path, "r") as f:
        names = [name.strip() for name in f.readlines()]

    # TODO: sub_dirs redundancy
    sub_dirs = []
    for sub_name in sub_names:
        sub_dir = os.path.join(root_dir, sub_name)
        assert os.path.isdir(sub_dir), '"%s" is not dir.' % sub_dir
        sub_dirs.append(sub_dir)

    paths = []
    for name in tqdm.tqdm(names[:val]):
        sub_paths = []
        for sub_dir, ext in zip(sub_dirs, exts):
            sub_path = os.path.join(sub_dir, name + ext)
            # fixed: the assert message used to be `print(...)`, which yields None
            assert os.path.exists(sub_path), '%s does not exist' % sub_path
            sub_paths.append(sub_path)
        paths.append(sub_paths)

    return paths
37
+
38
+
39
def get_sub_paths(root_dir, sub_names, exts, val=None):
    """Build per-sample path groups keyed by the files in the first sub-dir.

    Sibling paths are derived by swapping the directory and extension; every
    sibling must exist. `val` optionally caps the number of samples.

    Returns:
        list of lists; each inner list holds one path per sub-dir.
    """
    assert os.path.isdir(root_dir)
    # TODO: sub_dirs redundancy
    sub_dirs = []
    for sub_name in sub_names:
        sub_dir = os.path.join(root_dir, sub_name)
        assert os.path.isdir(sub_dir), '"%s" is not dir.' % sub_dir
        sub_dirs.append(sub_dir)

    paths = []
    file_names = os.listdir(sub_dirs[0])[:val]
    for file_name in tqdm.tqdm(file_names):
        stem = os.path.splitext(file_name)[0]
        group = [os.path.join(sub_dirs[0], file_name)]
        for sub_name, ext in zip(sub_names[1:], exts[1:]):
            sibling = os.path.join(root_dir, sub_name, stem + ext)
            assert os.path.exists(sibling)
            group.append(sibling)
        paths.append(group)

    return paths
61
+
62
+
63
def cal_wer(label, rec):
    """Return 1 - (Levenshtein distance / len(label)) between two sequences.

    Standard dynamic-programming edit distance over the elements of `label`
    (reference) and `rec` (hypothesis). 1.0 means a perfect match; the value
    can go negative when the distance exceeds len(label).

    Args:
        label: reference sequence (str or list).
        rec: recognized/hypothesis sequence.
    """
    # fixed: guard against ZeroDivisionError on an empty reference
    if len(label) == 0:
        return 1.0 if len(rec) == 0 else 0.0

    dist_mat = np.zeros((len(label) + 1, len(rec) + 1), dtype='int32')
    dist_mat[0, :] = range(len(rec) + 1)
    dist_mat[:, 0] = range(len(label) + 1)

    for i in range(1, len(label) + 1):
        for j in range(1, len(rec) + 1):
            # substitution (free on match), insertion, deletion
            sub_score = dist_mat[i - 1, j - 1] + (label[i - 1] != rec[j - 1])
            ins_score = dist_mat[i, j - 1] + 1
            del_score = dist_mat[i - 1, j] + 1
            dist_mat[i, j] = min(sub_score, ins_score, del_score)

    dist = dist_mat[len(label), len(rec)]

    return 1 - dist / len(label)
78
+
79
+
80
def visualize(img_path, chunks, structures):
    """Draw each chunk's box and text onto the image at `img_path` and return
    the annotated image (the `structures` argument is currently unused)."""
    canvas = cv2.imread(img_path)
    red = (0, 0, 255)
    for chunk in chunks:
        x1, x2, y1, y2 = chunk["pos"]
        text = ''.join(chunk["text"])
        cv2.rectangle(canvas, (int(x1), int(y1)), (int(x2), int(y2)), red)
        cv2.putText(canvas, text, (int(x1), int(max(0, y1 - 1))),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, red, 1)
    return canvas
88
+
89
+
90
def visualize_table(img_path, output_dir, table):
    """Draw every cell's bbox and transcript on the image and save the result
    under `output_dir` with the same file name."""
    canvas = cv2.imread(img_path)
    red = (0, 0, 255)
    for cell in table['cells']:
        x1, y1, x2, y2 = cell['bbox']
        cv2.rectangle(canvas, (int(x1), int(y1)), (int(x2), int(y2)), red)
        cv2.putText(canvas, ''.join(cell['transcript']), (int(x1), int(max(0, y1 - 1))),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, red, 1)
    cv2.imwrite(os.path.join(output_dir, os.path.basename(img_path)), canvas)
98
+
99
+
100
def crop_pdf(path, output_dir, zoom_x = 2.0, zoom_y = 2.0, rotate=0, expand=10, y_fix=.0):
    '''
    Render a single-page PDF to PNG, crop it to the table region implied by
    its chunk annotations, and map the chunk coordinates into the crop.

    path: [pdf_path, chunk_path] (extra entries are ignored)
    output_dir: where '<pdf_name>.png' is written (first the full render,
        then overwritten by the cropped image)
    zoom_x / zoom_y: render scale factors
    rotate: render rotation in degrees
    expand: padding in pixels added around the chunks' bounding box
    y_fix: vertical correction (in PDF units, scaled by zoom_y) on y1

    Returns (positions, transcripts): positions is an (N, 4) array of
    chunk boxes in cropped-image pixels, column order [x1, x2, y2, y1];
    transcripts is the list of chunk text strings.

    NOTE(review): uses the deprecated PyMuPDF camelCase API (pageCount,
    preRotate, getPixmap, writePNG) — requires an old `fitz` version.
    '''
    # load data
    with open(path[1], 'r') as f:
        chunks = json.load(f)['chunks']
    doc = fitz.open(path[0])
    pdf_name = os.path.splitext(os.path.basename(path[0]))[0]
    # NOTE(review): the assert message is `print(...)`, which evaluates to
    # None; the print only fires if the assert fails
    assert doc.pageCount == 1, print(pdf_name, ' has more than 1 page!')

    # transfer pdf to img
    trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
    pm = doc[0].getPixmap(matrix=trans, alpha=False)
    pm.writePNG(os.path.join(output_dir, '%s.png' % pdf_name))

    # crop table region
    pdf_img = cv2.imread(os.path.join(output_dir, '%s.png' % pdf_name))
    h, w, *_ = pdf_img.shape
    positions = []
    transcripts = []
    for chunk in chunks:
        # reorder chunk['pos'] into x1, x2, y2, y1 (y still bottom-up here)
        positions.append([chunk['pos'][0], chunk['pos'][1], chunk['pos'][3], chunk['pos'][2]])
        transcripts.append(chunk["text"])

    # the last chunk transcrip is repeated — drop the duplicated trailing char
    transcripts[-1] = transcripts[-1][:-1]

    positions = np.array(positions)
    positions[:, :2] *= zoom_x
    # flip y: PDF origin is bottom-left, image origin is top-left
    positions[:, 2:] = h - positions[:, 2:] * zoom_y
    x_min = int(max(0, positions[:, :2].min() - expand))
    y_min = int(max(0, positions[:, 2:].min() - expand))
    x_max = int(min(w, positions[:, :2].max() + expand))
    y_max = int(min(h, positions[:, 2:].max() + expand))

    img_crop = pdf_img[y_min:y_max, x_min:x_max]
    cv2.imwrite(os.path.join(output_dir, '%s.png' % pdf_name), img_crop)

    # shift chunk coordinates into the cropped image
    positions[:, :2] = np.clip(positions[:, :2] - x_min, 0, w)
    positions[:, 2] -= y_fix * zoom_y
    positions[:, 2:] = np.clip(positions[:, 2:] - y_min, 0, h)
    return positions, transcripts
146
+
147
+
148
def crop_cells(img_path, output_dir, info, expand=10):
    """Crop the table image to the union of all cell bboxes (plus `expand`
    padding) and shift each cell's bbox/segmentation into the crop.

    Saves the cropped image under `output_dir` (same stem, .png) and mutates
    `info['cells']` in place.
    """
    cells = info['cells']
    img = cv2.imread(img_path)
    h, w, *_ = img.shape
    bboxes = [cell['bbox'] for cell in cells if 'bbox' in cell.keys()]
    bboxes = np.array(bboxes)
    # union of all cell boxes, padded and clamped to the image
    x_min = int(max(bboxes[:, 0].min() - expand, 0))
    y_min = int(max(bboxes[:, 1].min() - expand, 0))
    x_max = int(min(bboxes[:, 2].max() + expand, w))
    y_max = int(min(bboxes[:, 3].max() + expand, h))
    cv2.imwrite(os.path.join(output_dir, os.path.splitext(os.path.basename(img_path))[0] + '.png'), img[y_min:y_max, x_min:x_max])

    # refine cell bbox
    new_cells = []
    for cell in cells:
        if 'bbox' not in cell.keys():
            new_cells.append(cell)
        else:
            cell['bbox'][0] = max(0, cell['bbox'][0] - x_min)
            cell['bbox'][1] = max(0, cell['bbox'][1] - y_min)
            cell['bbox'][2] = max(0, cell['bbox'][2] - x_min)
            cell['bbox'][3] = max(0, cell['bbox'][3] - y_min)
            # NOTE(review): assumes every cell that has a 'bbox' also carries
            # 'segmentation' — KeyError otherwise; confirm annotation format
            segmentation = cell['segmentation']
            cell['segmentation'] = [[[pt[0] - x_min, pt[1] - y_min] for pt in contour] for contour in segmentation]
            new_cells.append(cell)
    info['cells'] = new_cells
174
+
175
+
176
def visualize_ocr(img_path, output_dir, positions, transcripts):
    """Draw OCR boxes and their text on the image; save as '<name>_ocr.png'."""
    canvas = cv2.imread(img_path)
    for (x1, x2, y1, y2), text in zip(positions, transcripts):
        cv2.rectangle(canvas, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255))
        cv2.putText(canvas, text, (int(x1), int(y1)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
    out_name = os.path.splitext(os.path.basename(img_path))[0] + '_ocr.png'
    cv2.imwrite(os.path.join(output_dir, out_name), canvas)
183
+
184
+
185
def cal_cell_spans(table):
    """Compute each cell's grid span [x1, y1, x2, y2] (inclusive column/row
    indices) from table['layout'], a (rows, cols) array of cell ids.

    Raises AssertionError when a cell's occupied region is not a solid
    rectangle exclusively filled with its own id.
    """
    layout = table['layout']
    num_cells = len(table['cells'])
    cells_span = list()
    for cell_id in range(num_cells):
        cell_positions = np.argwhere(layout == cell_id)
        y1 = np.min(cell_positions[:, 0])
        y2 = np.max(cell_positions[:, 0])
        x1 = np.min(cell_positions[:, 1])
        x2 = np.max(cell_positions[:, 1])
        # y2/x2 are inclusive bounds, so the slice must extend one past them.
        # The previous `layout[y1:y2, x1:x2]` skipped the last row/column and
        # let non-rectangular cells slip through the check.
        assert np.all(layout[y1:y2 + 1, x1:x2 + 1] == cell_id)
        cells_span.append([x1, y1, x2, y2])
    return cells_span
198
+
199
+
200
def visualize_cell(img_path, output_dir, table):
    """Render the table structure (cell rectangles derived from row/column
    separator spans) plus per-cell OCR boxes/text and save the result as
    <image stem>.png in output_dir.
    """
    def spans2lines(spans):
        # Collapse [start, end] separator spans into single line coordinates:
        # the outer edges keep the span ends, inner separators use midpoints.
        lines = []
        lines.append(spans[0][0])
        for span in spans[1:-1]:
            t1, t2 = span
            lines.append(int((t1 + t2) / 2))
        lines.append(spans[-1][-1])
        return lines

    img = cv2.imread(img_path)

    # draw table lines
    # img.shape[::-1][-2:] == (w, h)
    # NOTE(review): this unpacks 5 values incl. cells_span, i.e. a different
    # extract_fg_bg_spans variant than the 4-value one in libs/data/utils —
    # confirm which implementation is imported here.
    rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, cells_span = extract_fg_bg_spans(table, img.shape[::-1][-2:])
    row_lines = spans2lines(rows_fg_span)
    col_lines = spans2lines(cols_fg_span)
    for span in cells_span:
        # spans are inclusive grid indices; +1 selects the far separator line
        x1, y1, x2, y2 = span
        cv2.rectangle(img, (int(col_lines[x1]), int(row_lines[y1])), (int(col_lines[x2 + 1]), int(row_lines[y2 + 1])), (0, 0, 255), 2)

    # draw ocr results
    for cell in table['cells']:
        if 'bbox' not in cell.keys():
            continue
        x1, y1, x2, y2 = cell['bbox']
        transcript = cell['transcript']
        cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 1)
        cv2.putText(img, ''.join(transcript), (int(x1), int(y1)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1)
    cv2.imwrite(os.path.join(output_dir, os.path.splitext(os.path.basename(img_path))[0] + '.png'), img)
229
+
230
+
231
def match_cells(path, positions, transcripts, k=16, start=0.333, stop=0.1, stop_percent=0.3, gap=0.25):
    '''
    Greedily align OCR line results with the ground-truth cells of one table.

    path: [pdf_path, chunk_path, structure_path]; only structure_path is read here
    positions: OCR line boxes, each [x1, x2, y1, y2]
    transcripts: [str], one text per OCR box, in reading order
    k: max OCR candidates inspected when anchoring / growing a cell match
    start: similarity threshold used to anchor the first/last valid OCR line
    stop: a first-candidate similarity below this is accepted immediately
          (counted as a weak match)
    stop_percent: max tolerated fraction of weak matches for the whole table
    gap: min similarity gain required to append another OCR line to a cell

    return dict(
        layout: np.array (n_row, n_col) of cell indices,
        cells: [dict(transcript=[...], optional bbox=[x1, y1, x2, y2], segmentation=...)],
        head_rows: [row indices of the header],
        body_rows: [row indices of the body],
    )

    NOTE(review): despite the name, cal_wer is used as a similarity score here
    (1.0 on exact match, larger = better) — confirm its actual contract.
    '''
    # load data (ground-truth structure annotation)
    with open(path[2], 'r') as f:
        cells = json.load(f)['cells']

    # first sort cells from left to right, from top to down
    cells_pos = []  # xl1, yl1, xl2, yl2 (logical grid coordinates)
    contents = []
    for cell in cells:
        cells_pos.append([cell['start_col'], cell['start_row'], cell['end_col'], cell['end_row']])
        contents.append(' '.join(cell['content']))

    # sorted cells from left to right, from top to down
    # (row index dominates the key via the 1e6 factor)
    sorted_idx = sorted(list(range(len(cells_pos))), key=lambda idx: cells_pos[idx][0] + 1e6 * cells_pos[idx][1])
    cells_pos = [cells_pos[idx] for idx in sorted_idx]
    contents = [contents[idx] for idx in sorted_idx]

    # layout grid sized by the max end row/col
    n_row = np.array(cells_pos)[:, 3].max() + 1
    n_col = np.array(cells_pos)[:, 2].max() + 1
    layout = np.full((n_row, n_col), -1)

    # head_rows & body_rows: the header height is the max row span of the
    # cells that start in row 0
    head_rows = list(range((np.array(cells_pos)[np.array(cells_pos)[:,1] == 0][:, 3] - np.array(cells_pos)[np.array(cells_pos)[:,1] == 0][:, 1]).max() + 1))
    body_rows = list(range((np.array(cells_pos)[np.array(cells_pos)[:,1] == 0][:, 3] - np.array(cells_pos)[np.array(cells_pos)[:,1] == 0][:, 1]).max() + 1, n_row))

    lt = [-1, -1]
    cells = []
    valid_idx = list(range(len(transcripts)))

    # init start/end index of ocr results: drop leading/trailing OCR lines
    # that don't look similar to the first/last non-empty cell content
    start_content = ''
    for content in contents:
        if len(content) > 0:
            start_content = content
            break
    try:
        start_index = [cal_wer(start_content, transcript) > start for transcript in transcripts[:k]].index(True)
    except:
        # no anchor found among the first k lines: keep everything
        start_index = 0

    end_content = ''
    for content in contents[::-1]:
        if len(content) > 0:
            end_content = content
            break
    try:
        end_index = [cal_wer(end_content, transcript) > start for transcript in transcripts[::-1][:k]].index(True)
    except:
        end_index = 0

    valid_idx = valid_idx[start_index:] if end_index == 0 else valid_idx[start_index: -end_index]

    assert len(contents) >= len(valid_idx), print('OCR Results Have Error')

    stop_counts = 0
    for index, (cell_pos, content) in enumerate(zip(cells_pos, contents)):
        # confirm the cell pos is increase (sorted order sanity check)
        assert cell_pos[0] > lt[0] or cell_pos[1] > lt[1], print('Sorted Cells Have Error')
        lt = cell_pos[:2]

        xl1, yl1, xl2, yl2 = cell_pos
        layout[yl1:yl2+1, xl1:xl2+1] = index

        if len(content) == 0:
            # empty cell: no OCR line is consumed
            cells.append(dict(transcript=[]))
        else:
            is_completed = False
            # seed the match with the next unconsumed OCR line
            bboxes_list = [positions[valid_idx[0]]]
            transcripts_list = [transcripts[valid_idx[0]]]
            valid_idx.pop(0)
            wer_last = cal_wer(content, ' '.join(transcripts_list))
            if wer_last < stop:
                # very weak first match: accept as-is, but count it so the
                # table can be rejected if this happens too often
                bboxes_list = np.array(bboxes_list)
                x1 = int(bboxes_list[:, :2].min())
                x2 = int(bboxes_list[:, :2].max())
                y1 = int(bboxes_list[:, 2:].min())
                y2 = int(bboxes_list[:, 2:].max())
                cells.append(dict(transcript=list(content), bbox=[x1, y1, x2, y2], segmentation=[[[x1,y1],[x2,y1],[x2,y2],[x1,y2]]]))
                stop_counts += 1
                continue
            # greedily absorb further OCR lines while similarity improves
            for idx in valid_idx[:k]:
                if content == ' '.join(transcripts_list):
                    # exact text match: finalize the merged bbox
                    bboxes_list = np.array(bboxes_list)
                    x1 = int(bboxes_list[:, :2].min())
                    x2 = int(bboxes_list[:, :2].max())
                    y1 = int(bboxes_list[:, 2:].min())
                    y2 = int(bboxes_list[:, 2:].max())
                    cells.append(dict(transcript=list(content), bbox=[x1, y1, x2, y2], segmentation=[[[x1,y1],[x2,y1],[x2,y2],[x1,y2]]]))
                    is_completed = True
                    break
                else:
                    cur_trans = copy.deepcopy(transcripts_list)
                    cur_trans.append(transcripts[idx])
                    wer = cal_wer(content, ' '.join(cur_trans))
                    # if add new str, and wer is not increase a lot, it should not be added in
                    if wer < wer_last + gap:
                        continue
                    else:
                        transcripts_list.append(transcripts[idx])
                        bboxes_list.append(positions[idx])
                        valid_idx.pop(valid_idx.index(idx))
                        if wer == 1.0:
                            break
                        else:
                            wer_last = wer
            if not is_completed:
                # loop exhausted without exact match: finalize what was gathered
                bboxes_list = np.array(bboxes_list)
                x1 = int(bboxes_list[:, :2].min())
                x2 = int(bboxes_list[:, :2].max())
                y1 = int(bboxes_list[:, 2:].min())
                y2 = int(bboxes_list[:, 2:].max())
                cells.append(dict(transcript=list(content), bbox=[x1, y1, x2, y2], segmentation=[[[x1,y1],[x2,y1],[x2,y2],[x1,y2]]]))

    assert stop_counts / len(contents) < stop_percent, print('This Table Has Many Error Match with OCR Results')
    assert layout.min() == 0, print('This Table Layout is not Completely Resolved')
    return dict(
        layout=layout,
        cells=cells,
        head_rows=head_rows,
        body_rows=body_rows,
    )
365
+
366
+
367
def extract_ocr(path, positions, transcripts, k=16, start=0.333):
    '''
    Trim raw OCR results to the span belonging to this table and wrap each
    surviving OCR line as a pseudo-cell.

    path: [pdf_path, chunk_path, structure_path]; only structure_path is read
    positions: OCR line boxes, each [x1, x2, y1, y2]
    transcripts: [str], one text per OCR box, in reading order
    k: number of leading/trailing OCR lines scanned for the anchors
    start: similarity threshold used to anchor the first/last valid OCR line

    return dict(
        cells=[{
            'bbox': [x1, y1, x2, y2],
            'transcript': [char, ...],
            'segmentation': rectangle contour of bbox,
        }]
    )
    '''
    # load data (ground-truth structure annotation)
    with open(path[2], 'r') as f:
        cells = json.load(f)['cells']

    # first sort cells from left to right, from top to down
    cells_pos = []  # xl1, yl1, xl2, yl2
    contents = []
    for cell in cells:
        cells_pos.append([cell['start_col'], cell['start_row'], cell['end_col'], cell['end_row']])
        contents.append(' '.join(cell['content']))

    # sorted cells from left to right, from top to down
    sorted_idx = sorted(list(range(len(cells_pos))), key=lambda idx: cells_pos[idx][0] + 1e6 * cells_pos[idx][1])
    cells_pos = [cells_pos[idx] for idx in sorted_idx]
    contents = [contents[idx] for idx in sorted_idx]

    # init start/end index, condition is the first/last index must not over split, and wer should be larger than start threshold
    valid_idx = list(range(len(transcripts)))
    start_content = ''
    for content in contents:
        if len(content) > 0:
            start_content = content
            break
    try:
        start_index = [cal_wer(start_content, transcript) > start for transcript in transcripts[:k]].index(True)
    except ValueError:
        # list.index raises exactly ValueError when no candidate passes the
        # threshold; a bare `except:` here previously hid real errors
        # (including typos and KeyboardInterrupt).
        start_index = 0

    end_content = ''
    for content in contents[::-1]:
        if len(content) > 0:
            end_content = content
            break
    try:
        end_index = [cal_wer(end_content, transcript) > start for transcript in transcripts[::-1][:k]].index(True)
    except ValueError:
        end_index = 0

    # drop OCR lines before the start anchor and after the end anchor
    valid_idx = valid_idx[start_index:] if end_index == 0 else valid_idx[start_index: -end_index]

    cells = []
    for idx in valid_idx:
        x1, x2, y1, y2 = positions[idx].astype('int').tolist()
        cells.append(dict(transcript=list(transcripts[idx]), bbox=[x1, y1, x2, y2], segmentation=[[[x1,y1],[x2,y1],[x2,y2],[x1,y2]]]))

    return dict(
        cells=cells
    )
427
+
428
+
429
def refine_table(table, img_path, output_dir, expand=10):
    """Crop the image tightly around all cell bboxes (plus `expand` px margin),
    shift the cell bboxes into the cropped frame, write the crop to output_dir
    and repoint table['image_path'] at it. Mutates and returns `table`.
    """
    cells = table['cells']
    bboxes = [cell['bbox'] for cell in table['cells'] if 'bbox' in cell.keys()]
    bboxes = np.array(bboxes)
    img = cv2.imread(img_path)
    h, w, *_ = img.shape
    # union of all bboxes, expanded and clamped to the image bounds
    x1 = int(max(0, bboxes[:, 0].min() - expand))
    y1 = int(max(0, bboxes[:, 1].min() - expand))
    x2 = int(min(w, bboxes[:, 2].max() + expand))
    y2 = int(min(h, bboxes[:, 3].max() + expand))
    # refine cells: translate x (even columns) and y (odd columns) coordinates
    bboxes[:, 0::2] = np.clip(bboxes[:, 0::2] - x1, 0, 1e6)
    bboxes[:, 1::2] = np.clip(bboxes[:, 1::2] - y1, 0, 1e6)
    bboxes = bboxes.tolist()
    # NOTE(review): `bboxes` only contains cells that HAD a bbox, but zip pairs
    # them with ALL of `cells` in order — if any cell lacks a bbox, later
    # assignments shift onto the wrong cells. Confirm every cell has a bbox here.
    for cell, bbox in zip(cells, bboxes):
        cell['bbox'] = bbox

    img = img[y1:y2, x1:x2]
    cv2.imwrite(os.path.join(output_dir, os.path.basename(img_path)), img)
    table['image_path'] = os.path.join(output_dir, os.path.basename(img_path))
    return table
libs/configs/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from . import default
import importlib


class CFG:
    """Mutable proxy around the currently active config module.

    Attribute reads/writes are forwarded to the wrapped module, so code that
    imported the global `cfg` keeps seeing the new values after
    `setup_config` swaps the underlying module.
    """
    def __init__(self):
        # Write straight into __dict__ to bypass __setattr__ (which would
        # otherwise forward the assignment to the not-yet-set module).
        self.__dict__['cfg'] = None

    def __getattr__(self, name):
        # Forward attribute lookups to the wrapped config module.
        return getattr(self.__dict__['cfg'], name)

    def __setattr__(self, name, val):
        # Forward attribute assignments to the wrapped config module.
        setattr(self.__dict__['cfg'], name, val)


# Global config handle; starts out backed by libs.configs.default.
cfg = CFG()
cfg.__dict__['cfg'] = default


def setup_config(cfg_name):
    """Replace the active configuration with module libs.configs.<cfg_name>."""
    global cfg
    module_name = 'libs.configs.' + cfg_name
    cfg_module = importlib.import_module(module_name)
    cfg.__dict__['cfg'] = cfg_module
libs/configs/default.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
'''Default configuration module.

Plain module-level constants; served through libs.configs.cfg and replaceable
at runtime via libs.configs.setup_config().
'''
import os
import torch
from libs.utils.vocab import Vocab

device = torch.device('cuda')

# training data: list of LRC record files
train_lrcs_path = [
    "/yrfs1/intern/pfhu6/TSR/Dataset/SciTSR/train/table.lrc"
]
train_data_dir = ''
train_max_pixel_nums = 400 * 400 * 5  # per-batch pixel budget for the bucket sampler
train_bucket_seps = (50, 50, 50)  # bucket widths passed to BucketSampler
train_max_batch_size = 8
train_num_workers = 4

# validation data
valid_lrc_path = '/yrfs1/intern/pfhu6/TSR/Dataset/SciTSR/test/table.lrc'
valid_data_dir = ''
valid_num_workers = 0
valid_batch_size = 1

vocab = Vocab()

# model params
# backbone
arch = "res34"
pretrained_backbone = True
backbone_out_channels = (64, 128, 256, 512)

# fpn
fpn_out_channels = 256

# pan
pan_num_levels = 4
pan_in_dim = 256
pan_out_dim = 256

# row segment predictor
rs_scale = 1

# col segment predictor
cs_scale = 1

# divide predictor
dp_head_nums = 8
dp_scale = 1

# cells extractor params
ce_scale = 1 / 8
ce_pool_size = (3, 3)
ce_dim = 512
ce_head_nums = 8
ce_heads = 1

# decoder
embed_dim = 512
feat_dim = 512
lm_state_dim = 512
proj_dim = 512
cover_kernel = 7
att_threshold = 0.5
spatial_att_weight_loss_wight = 1.0

# train params
base_lr = 0.0001
min_lr = 1e-6
weight_decay = 0

num_epochs = 20
sync_rate = 20  # model synchronization interval (steps)

log_sep = 20  # logging interval (steps)

work_dir = './experiments/heads_1'

train_checkpoint = None  # path to resume from, or None for fresh training

eval_checkpoint = os.path.join(work_dir, 'best_f1_model.pth')
libs/data/__init__.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data.distributed import DistributedSampler
3
+ from .batch_sampler import BucketSampler
4
+ from .dataset import LRCRecordLoader
5
+ from .dataset import Dataset, collate_func
6
+ from libs.utils.comm import distributed, get_rank, get_world_size
7
+ from . import transform as T
8
+
9
+
10
def create_train_dataloader(vocab, lrcs_path, num_workers, max_batch_size, max_pixel_nums, bucket_seps, data_root_dir):
    """Build the bucketed training DataLoader over one or more LRC files.

    Samples are grouped by size via BucketSampler so each batch respects the
    max_pixel_nums budget and max_batch_size cap.
    """
    loaders = [LRCRecordLoader(lrc_path, data_root_dir) for lrc_path in lrcs_path]

    # Label construction + span/divide targets, then tensor conversion and
    # ImageNet normalization.
    transforms = T.Compose([
        T.TableToLabel(vocab),
        T.CalRowColSpans(),
        T.CalCellSpans(),
        T.CalHeadBodyDivide(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    dataset = Dataset(loaders, transforms)
    sampler = BucketSampler(
        dataset,
        get_world_size(),
        get_rank(),
        max_pixel_nums=max_pixel_nums,
        max_batch_size=max_batch_size,
        seps=bucket_seps,
    )

    return torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        collate_fn=collate_func,
        batch_sampler=sampler,
    )
35
+
36
+
37
def create_valid_dataloader(vocab, lrc_path, num_workers, batch_size, data_root_dir):
    """Build the validation DataLoader for a single LRC file.

    Uses a DistributedSampler when running distributed, otherwise a plain
    sequential (unshuffled) loader; incomplete final batches are kept.
    """
    record_loader = LRCRecordLoader(lrc_path, data_root_dir)

    transforms = T.Compose([
        T.TableToLabel(vocab),
        T.CalRowColSpans(),
        T.CalCellSpans(),
        T.CalHeadBodyDivide(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    dataset = Dataset([record_loader], transforms)

    common_kwargs = dict(
        num_workers=num_workers,
        batch_size=batch_size,
        collate_fn=collate_func,
        drop_last=False,
    )
    if distributed():
        sampler = DistributedSampler(dataset, get_world_size(), get_rank(), True)
        return torch.utils.data.DataLoader(dataset, sampler=sampler, **common_kwargs)

    return torch.utils.data.DataLoader(dataset, shuffle=False, **common_kwargs)
libs/data/batch_sampler.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tqdm
2
+ import copy
3
+ import random
4
+ from collections import defaultdict
5
+ from libs.utils import logger
6
+ import numpy as np
7
+
8
class BucketSampler:
    """Batch sampler that groups samples of similar size into buckets so each
    batch contains similarly-sized images, bounding padding waste and keeping
    every batch within a pixel budget.

    Batches are reshuffled per epoch with a seeded RNG and striped across
    ranks for distributed training.
    """
    def __init__(self, dataset, world_size, rank_id, fix_batch_size=None, max_pixel_nums=None, max_batch_size=8, min_batch_size=1, seps=(100, 100, 20)):
        # dataset must expose get_info(idx) -> tuple of sizes; `seps` holds
        # one bucket width per element of that tuple.
        self.dataset = dataset
        self.world_size = world_size
        self.rank_id = rank_id
        self.seps = seps
        self.fix_batch_size = fix_batch_size
        self.max_batch_size = max_batch_size
        self.min_batch_size = min_batch_size
        self.max_pixel_nums = max_pixel_nums
        # either a fixed batch size or a pixel budget must be provided
        assert (fix_batch_size is not None) or (max_pixel_nums is not None)
        self.cal_buckets()
        self.seed = 20
        self.epoch = 0

    def count_keys(self):
        """Collect the size-info tuple of every sample (full dataset scan)."""
        infos = []
        for i in tqdm.tqdm(range(len(self.dataset))):
            info = self.dataset.get_info(i)
            infos.append(info)
        return infos

    def cal_buckets(self):
        """Quantize sample sizes into buckets and fix a batch size per bucket."""
        infos = self.count_keys()
        # NOTE(review): debug dump written into the current working directory
        np.save('count_keys.npy', infos)
        min_sizes = None # (64, 18, 2)
        max_sizes = None # (1223, 742, 2080)
        for info in infos:
            if min_sizes is None:
                min_sizes = info
                max_sizes = info
            else: # get the max size of each item of tuple
                min_sizes = tuple(min(min_sizes[idx], info[idx]) for idx in range(len(min_sizes)))
                max_sizes = tuple(max(max_sizes[idx], info[idx]) for idx in range(len(max_sizes)))
        assert (min_sizes is not None) and (len(self.seps) == len(min_sizes))
        print('max sizes: {}, min size: {}'.format(max_sizes, min_sizes))
        buckets = defaultdict(list)
        for idx, info in enumerate(infos):
            # bucket key: per-dimension quantized offset from the minimum,
            # joined into a string like "3-1-0"
            bucket_idxes = list()
            for sep, size, min_size in zip(self.seps, info, min_sizes):
                bucket_idx = (size - min_size) // sep
                bucket_idxes.append(str(bucket_idx))
            bucket_idxes = '-'.join(bucket_idxes)
            buckets[bucket_idxes].append(idx)
        np.save('buckets.npy', buckets)

        valid_buckets = dict()
        for bucket_key, bucket_samples in buckets.items():
            # drop buckets that can never form a batch
            if len(bucket_samples) < self.min_batch_size:
                continue
            if (self.fix_batch_size is not None) and (len(bucket_samples) < self.fix_batch_size):
                continue

            # upper-bound sample size in this bucket
            # (assumes info starts with (w, h, ...) — TODO confirm vs get_info)
            w, h, *_ = [(int(item) + 1) * sep + min_size for item, min_size, sep in zip(bucket_key.split('-'), min_sizes, self.seps)]
            if self.fix_batch_size is not None:
                # NOTE(review): when fix_batch_size is set, max_pixel_nums may
                # legally be None per the __init__ assert, and this comparison
                # would then raise TypeError — confirm intended usage.
                if h * w * self.fix_batch_size > self.max_pixel_nums:
                    continue
            else:
                if h * w * self.min_batch_size > self.max_pixel_nums:
                    continue

            if self.fix_batch_size is not None:
                batch_size = self.fix_batch_size
            else:
                # largest batch fitting the pixel budget, clamped to
                # [min_batch_size, max_batch_size] and the bucket population
                batch_size = min(self.max_batch_size, max(self.max_pixel_nums // (w * h), self.min_batch_size), len(bucket_samples))

            valid_buckets[bucket_key] = dict(
                samples=bucket_samples,
                batch_size=batch_size
            )

        self.buckets = [valid_buckets[bucket_key] for bucket_key in sorted(valid_buckets.keys())]
        total_nums = len(infos)
        valid_nums = sum([len(item['samples']) for item in valid_buckets.values()])
        logger.info('Total %d samples, but ignore %d samples' % (total_nums, total_nums - valid_nums))

    def __iter__(self):
        # deterministic per-epoch shuffle (same on every rank)
        random_inst = random.Random(self.seed + self.epoch)
        batches = list()
        for bucket in self.buckets:
            sample = copy.deepcopy(bucket['samples'])
            batch_size = bucket['batch_size']
            random_inst.shuffle(sample)
            idx = 0
            while idx < len(sample):
                batch = sample[idx:idx + batch_size]
                idx += batch_size
                # drop a trailing remainder smaller than the minimum batch
                if len(batch) < self.min_batch_size:
                    continue
                batches.append(batch)
        random_inst.shuffle(batches)

        # truncate so all ranks agree on the count, then stripe by rank id
        align_nums = (len(batches) // self.world_size) * self.world_size
        batches = batches[: align_nums]
        for batch_idx in range(self.rank_id, len(batches), self.world_size):
            yield batches[batch_idx]

    def __len__(self):
        # per-rank batch count, mirroring the remainder/striping logic above
        batch_nums = 0
        for bucket in self.buckets:
            bucket_sample_nums = len(bucket["samples"])
            bucket_bs = bucket['batch_size']
            batch_nums += bucket_sample_nums // bucket_bs
            if bucket_sample_nums % bucket_bs >= self.min_batch_size:
                batch_nums += 1

        return batch_nums // self.world_size

    def set_epoch(self, epoch):
        # called by the training loop so the shuffle changes each epoch
        self.epoch = epoch
+
libs/data/dataset.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import copy
3
+ import json
4
+ import pickle
5
+ import random
6
+ from torch._C import layout
7
+ import tqdm
8
+ import torch
9
+ import numpy as np
10
+ from PIL import Image
11
+ from .list_record_cache import ListRecordLoader
12
+ from libs.utils.format_translate import table_to_html
13
+
14
+
15
class LRCRecordLoader:
    """Wrapper around ListRecordLoader that also resolves and loads the
    table image for each record.

    data_dir is prepended to each record's relative 'image_path'.
    """
    def __init__(self, lrc_path, data_dir=''):
        self.loader = ListRecordLoader(lrc_path)
        self.data_root_dir = data_dir

    def __len__(self):
        return len(self.loader)

    def get_info(self, idx):
        """Return (width, height, n_cells) for sample idx (no transforms).

        n_cells is the number of layout grid slots (rows * cols), not the
        number of distinct cells.
        """
        table = self.loader.get_record(idx)
        # Join with the data root, consistent with get_data. The original
        # opened table['image_path'] directly, which broke whenever a
        # non-empty data_dir was configured.
        img_path = os.path.join(self.data_root_dir, table['image_path'])
        image = Image.open(img_path).convert('RGB')
        w = image.width
        h = image.height
        n_rows, n_cols = table['layout'].shape
        n_cells = n_rows * n_cols
        return w, h, n_cells

    def get_data(self, idx):
        """Return (PIL RGB image, table record dict) for sample idx."""
        table = self.loader.get_record(idx)
        img_path = os.path.join(self.data_root_dir, table['image_path'])
        image = Image.open(img_path).convert('RGB')
        return image, table
37
+
38
+
39
class Dataset:
    """Concatenation of several record loaders behind one index space,
    applying a shared transform pipeline on access.
    """

    def __init__(self, loaders, transforms):
        self.loaders = loaders
        self.transforms = transforms

    def _match_loader(self, idx):
        """Map a global index to (loader, index local to that loader)."""
        base = 0
        for candidate in self.loaders:
            size = len(candidate)
            if idx < base + size:
                return candidate, idx - base
            base += size
        raise IndexError()

    def get_info(self, idx):
        """Size info of sample idx without loading/transforming it."""
        owner, local_idx = self._match_loader(idx)
        return owner.get_info(local_idx)

    def __len__(self):
        return sum(len(loader) for loader in self.loaders)

    def __getitem__(self, idx):
        try:
            owner, local_idx = self._match_loader(idx)
            image, table = owner.get_data(local_idx)
            has_layout = 'layout' in table.keys()
            # Unlabeled records (no layout) go through the transforms without
            # the table; the pipeline then yields None targets.
            transformed = self.transforms(image, table) if has_layout else self.transforms(image)
            (image, _, cls_label,
             rows_fg_span, rows_bg_span,
             cols_fg_span, cols_bg_span,
             cells_span, divide) = transformed
            return dict(
                id=idx,
                image_size=(image.shape[2], image.shape[1]),
                image=image,
                cls_label=cls_label,
                rows_fg_span=rows_fg_span,
                rows_bg_span=rows_bg_span,
                cols_fg_span=cols_fg_span,
                cols_bg_span=cols_bg_span,
                cells_span=cells_span,
                layout=table['layout'] if has_layout else None,
                divide=divide,
                table=table
            )
        except Exception as e:
            print('Error occured while load data: %d' % idx)
            raise e
85
+
86
+
87
def collate_func(batch_data):
    """Collate a list of Dataset samples into a padded batch.

    Images are zero-padded to the batch max H/W with a matching 0/1 mask.
    When labels are present, class labels are padded to the max label length
    (mask 1 on real tokens) and layouts to the max grid, filled with -1.
    Span/table fields remain per-sample Python lists.
    """
    batch_size = len(batch_data)

    image_dim = batch_data[0]['image'].shape[0]
    max_h = max([data['image'].shape[1] for data in batch_data])
    max_w = max([data['image'].shape[2] for data in batch_data])

    batch_id = list()
    batch_image_size = list()

    batch_image = torch.zeros([batch_size, image_dim, max_h, max_w], dtype=torch.float)
    batch_image_mask = torch.zeros([batch_size, 1, max_h, max_w], dtype=torch.float)
    batch_rows_fg_span = list()
    batch_rows_bg_span = list()
    batch_cols_fg_span = list()
    batch_cols_bg_span = list()
    batch_cells_span = list()
    batch_divide = list()
    tables = list()

    if all([(data['cls_label'] is None) and (data['layout'] is None) for data in batch_data]):
        # inference batch: no labels anywhere, keep plain Python lists
        batch_cls_label = list()
        batch_label_mask = list()
        batch_layout = list()
    else:
        # training batch: every sample must be fully labeled (no mixing)
        assert not any([(data['cls_label'] is None) or (data['layout'] is None) for data in batch_data])
        max_label_length = max([data['cls_label'].shape[0] for data in batch_data])
        batch_cls_label = torch.zeros([batch_size, max_label_length], dtype=torch.long)
        batch_label_mask = torch.zeros([batch_size, max_label_length], dtype=torch.float)
        max_nr = max([data['layout'].shape[0] for data in batch_data])
        max_nc = max([data['layout'].shape[1] for data in batch_data])
        # -1 marks padded layout slots (valid cell ids start at 0)
        batch_layout = torch.full([batch_size, max_nr, max_nc], -1, dtype=torch.float)

    for batch_idx, data in enumerate(batch_data):
        batch_id.append(data['id'])
        batch_image_size.append(data['image_size'])

        _, cur_h, cur_w = data['image'].shape
        batch_image[batch_idx, :, :cur_h, :cur_w] = data["image"]
        batch_image_mask[batch_idx, :, :cur_h, :cur_w] = 1

        if (data['cls_label'] is None) and (data['layout'] is None):
            # inference batch: the label containers are lists here
            batch_cls_label.append(data["cls_label"])
            batch_label_mask.append(None)
            batch_layout.append(data["layout"])
        else:
            label_length = data['cls_label'].shape[0]
            batch_cls_label[batch_idx, :label_length] = data['cls_label']
            batch_label_mask[batch_idx, :label_length] = 1.0
            layout_nr, layout_nc = data["layout" ].shape
            batch_layout[batch_idx, :layout_nr, :layout_nc] = torch.from_numpy(data['layout']).float()

        batch_rows_fg_span.append(data["rows_fg_span"])
        batch_rows_bg_span.append(data['rows_bg_span'])
        batch_cols_fg_span.append(data["cols_fg_span"])
        batch_cols_bg_span.append(data["cols_bg_span"])
        batch_cells_span.append(data["cells_span"])
        batch_divide.append(data["divide"])
        tables.append(data['table'])

    # divides become a tensor only for labeled batches
    batch_divide = torch.tensor(batch_divide, dtype=torch.long) if batch_divide[0] is not None else batch_divide

    return dict(
        ids=batch_id,
        images_size=batch_image_size,
        images=batch_image,
        images_mask=batch_image_mask,
        cls_labels=batch_cls_label,
        labels_mask=batch_label_mask,
        rows_fg_spans=batch_rows_fg_span,
        rows_bg_spans=batch_rows_bg_span,
        cols_fg_spans=batch_cols_fg_span,
        cols_bg_spans=batch_cols_bg_span,
        cells_spans=batch_cells_span,
        divide_labels=batch_divide,
        layouts=batch_layout,
        tables=tables
    )
libs/data/list_record_cache.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import threading
4
+
5
+
6
def merge_record_file(files, dst_file):
    """Concatenate the given record files into dst_file, byte-for-byte, in order.

    The previous implementation shelled out to `cat a b > dst` via os.system,
    which breaks on file names containing shell metacharacters/spaces
    (injection risk) and is POSIX-only. Plain Python copying produces the
    identical output portably.
    """
    with open(dst_file, 'wb') as dst:
        for file in files:
            with open(file, 'rb') as src:
                # stream in 1 MiB chunks to keep memory bounded
                while True:
                    chunk = src.read(1 << 20)
                    if not chunk:
                        break
                    dst.write(chunk)
12
+
13
+
14
class ListRecordCacher:
    """Append-only writer for the LRC record file format.

    File layout:
        [8-byte footer offset][record][record]...[footer]
    where each record is [8-byte size][pickled bytes] and the footer is
    [8-byte size][pickled list of (absolute offset, length) per record].
    All integers are 8-byte big-endian unsigned.
    """
    OFFSET_LENGTH = 8  # width of every size/offset field in bytes
    def __init__(self, cache_path):
        self._record_pos_list = list()
        self._cache_file = open(cache_path, 'wb')
        # placeholder for the footer offset; patched when the file is closed
        self._cached_bytes = b'\x00' * self.OFFSET_LENGTH

    def add_record(self, record):
        """Pickle and append one record object."""
        record_bytes = pickle.dumps(record)
        return self.add_record_bytes(record_bytes)

    def add_record_bytes(self, record_bytes):
        """Append one already-serialized record, tracking its (offset, size)."""
        bytes_size = len(record_bytes)
        offset_bytes = bytes_size.to_bytes(
            length=self.OFFSET_LENGTH,
            byteorder='big', signed=False
        )
        total_bytes = offset_bytes + record_bytes

        cur_record_pos = None
        if len(self._record_pos_list) == 0:
            # first payload starts after the header (8B) + its own size prefix (8B)
            cur_record_pos = [self.OFFSET_LENGTH*2, bytes_size]
        else:
            # previous payload end + this record's size prefix
            cur_record_pos = [sum(self._record_pos_list[-1]) + self.OFFSET_LENGTH, bytes_size]
        self._record_pos_list.append(cur_record_pos)

        self._cached_bytes += total_bytes
        # flush the in-memory buffer to disk once it grows past ~1 MiB
        if len(self._cached_bytes) > 1024*1024:
            self._cache_file.seek(0, 2)
            self._cache_file.write(self._cached_bytes)
            self._cached_bytes = b''

    def flush(self):
        """Write any buffered bytes to the end of the file."""
        if len(self._cached_bytes) > 0:
            self._cache_file.seek(0, 2)
            self._cache_file.write(self._cached_bytes)
            self._cached_bytes = b''

    def _wirte_record_pos_list(self):
        # (sic: "wirte") Write the footer and patch the header with its offset.
        self.flush()
        self._cache_file.seek(0, 2)
        offset = self._cache_file.tell()
        offset_bytes = offset.to_bytes(
            length=self.OFFSET_LENGTH,
            byteorder='big', signed=False
        )
        # patch the 8-byte header placeholder with the footer offset
        self._cache_file.seek(0)
        self._cache_file.write(offset_bytes)

        data_bytes = pickle.dumps(self._record_pos_list)
        bytes_size = len(data_bytes)
        offset_bytes = bytes_size.to_bytes(
            length=self.OFFSET_LENGTH,
            byteorder='big', signed=False
        )
        total_bytes = offset_bytes + data_bytes
        self._cache_file.seek(0, 2)
        self._cache_file.write(total_bytes)

    def close(self):
        """Finalize the file (write footer) and close it. Idempotent."""
        if not self._cache_file.closed:
            self._wirte_record_pos_list()
            self._cache_file.close()

    def __del__(self):
        self.close()
80
+
81
+
82
class ListRecordLoader:
    """Random-access reader for LRC files written by ListRecordCacher.

    Also supports files made by concatenating several LRC files (see
    merge_record_file): _scan_file walks each segment's footer in turn and
    rebases that segment's record offsets.
    """
    OFFSET_LENGTH = 8  # width of every size/offset field in bytes
    def __init__(self, load_path):
        self._sync_lock = threading.Lock()  # serializes seek+read pairs across threads
        self._size = os.path.getsize(load_path)
        self._load_path = load_path
        self._open_file()
        self._scan_file()

    def _open_file(self):
        self._pid = os.getpid()
        self._cache_file = open(self._load_path, 'rb')

    def _check_reopen(self):
        # reopen after fork so dataloader worker processes do not share a
        # file descriptor (and its seek position) with the parent
        if (self._pid != os.getpid()):
            self._open_file()

    def _scan_file(self):
        """Build the absolute (offset, length) index for all records."""
        record_pos_list = list()
        pos = 0  # start of the current concatenated segment
        while True:
            if pos >= self._size:
                break
            # segment header: offset of this segment's footer (relative)
            self._cache_file.seek(pos)
            offset = int().from_bytes(
                self._cache_file.read(self.OFFSET_LENGTH),
                byteorder='big', signed=False
            )
            offset = pos + offset
            self._cache_file.seek(offset)

            # footer: [size][pickled list of (offset, length)]
            byte_size = int().from_bytes(
                self._cache_file.read(self.OFFSET_LENGTH),
                byteorder='big', signed=False
            )
            record_pos_list_bytes = self._cache_file.read(byte_size)
            sub_record_pos_list = pickle.loads(record_pos_list_bytes)
            assert isinstance(sub_record_pos_list, list)
            # rebase segment-relative offsets to absolute file offsets
            sub_record_pos_list = [[item[0]+pos, item[1]] for item in sub_record_pos_list]
            record_pos_list.extend(sub_record_pos_list)
            # next segment starts right after this segment's footer
            pos = self._cache_file.tell()

        self._record_pos_list = record_pos_list

    def get_record(self, idx):
        """Load and unpickle record idx."""
        self._check_reopen()
        record_bytes = self.get_record_bytes(idx)
        record = pickle.loads(record_bytes)
        return record

    def get_record_bytes(self, idx):
        """Read the raw pickled bytes of record idx (thread-safe)."""
        offset, length = self._record_pos_list[idx]
        self._sync_lock.acquire()
        try:
            self._cache_file.seek(offset)
            record_bytes = self._cache_file.read(length)
        finally:
            self._sync_lock.release()
        return record_bytes

    def __len__(self):
        return len(self._record_pos_list)
libs/data/transform.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import random
4
+ import numpy as np
5
+ from torchvision.transforms import functional as F
6
+ from libs.utils.format_translate import table_to_latex
7
+ from .utils import extract_fg_bg_spans, cal_cell_spans
8
+
9
+
10
class Compose:
    """Chain transforms; each step receives the running tuple unpacked as
    positional arguments and returns the next tuple.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, *data):
        current = data
        for step in self.transforms:
            current = step(*current)
        return current
18
+
19
class TableToLabel:
    """Turn a table record into a LaTeX token-id sequence via the vocab.

    With no table, passes the image through with (None, None) placeholders.
    """

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, image, table=None):
        if table is None:
            return image, None, None
        # image.size = (w, h)
        latex_tokens = table_to_latex(table)
        return image, table, self.vocab.words_to_ids(latex_tokens)
29
+
30
class CalRowColSpans:
    """Compute row/column foreground/background separator spans from the
    table annotation and the image size; Nones pass through unchanged.
    """

    def __call__(self, image, table=None, cls_label=None):
        if table is None:
            return image, table, None, None, None, None, None
        size = (image.width, image.height)
        rows_fg, rows_bg, cols_fg, cols_bg = extract_fg_bg_spans(table, size)
        return image, table, cls_label, rows_fg, rows_bg, cols_fg, cols_bg
37
+
38
class CalCellSpans:
    """Append per-cell grid spans computed from the table layout; when no
    table is given, appends None instead.
    """

    def __call__(self, image, table=None, cls_label=None, rows_fg_span=None, rows_bg_span=None, cols_fg_span=None, cols_bg_span=None):
        spans = cal_cell_spans(table) if table is not None else None
        return image, table, cls_label, rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, spans
45
+
46
class CalHeadBodyDivide:
    """Append the head/body divide (the number of header rows) from
    table['head_rows']; None when no table is given.
    """

    def __call__(self, image, table=None, cls_label=None, rows_fg_span=None, rows_bg_span=None, cols_fg_span=None, cols_bg_span=None, cells_span=None):
        divide = None if table is None else len(table['head_rows'])
        return image, table, cls_label, rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, cells_span, divide
54
+
55
class ToTensor:
    """Convert the PIL image to a float tensor and the class-label sequence
    (when present) to a LongTensor; everything else passes through.
    """

    def __call__(self, image, table=None, cls_label=None, rows_fg_span=None, rows_bg_span=None, cols_fg_span=None, cols_bg_span=None, cells_span=None, divide=None):
        tensor_image = F.to_tensor(image)
        label = cls_label if cls_label is None else torch.tensor(cls_label, dtype=torch.long)
        return tensor_image, table, label, rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, cells_span, divide
61
+
62
class Normalize:
    """Channel-wise normalization of the image tensor with fixed mean/std."""

    def __init__(self, mean, std, inplace=False):
        self.mean = mean
        self.std = std
        self.inplace = inplace

    def __call__(self, image, table=None, cls_label=None, rows_fg_span=None, rows_bg_span=None, cols_fg_span=None, cols_bg_span=None, cells_span=None, divide=None):
        normalized = F.normalize(image, self.mean, self.std, self.inplace)
        return normalized, table, cls_label, rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span, cells_span, divide
libs/data/utils.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from libs.utils.format_translate import table_to_html, format_html
3
+ import numpy as np
4
+
5
+
6
class InvalidFormat(Exception):
    """Raised when a table annotation cannot yield a consistent span geometry."""
8
+
9
+
10
def segmentation_to_bbox(segmentation):
    """Return the tight axis-aligned bbox over all contour points.

    Args:
        segmentation: iterable of contours, each an iterable of (x, y) points.

    Returns:
        Tuple (x1, y1, x2, y2) of min/max coordinates.

    Raises:
        ValueError: if the segmentation contains no points (as the original
            ``min([])`` did).
    """
    # Flatten once instead of re-scanning every contour four times.
    points = [pt for contour in segmentation for pt in contour]
    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]
    return (min(xs), min(ys), max(xs), max(ys))
16
+
17
+
18
def cal_cell_bbox(table):
    """Compute one pixel bbox per cell, or None when no segmentation is usable.

    The union of the cell's subline polygons is preferred; when a cell has no
    sublines the cell-level polygon is used instead.
    """
    cells_bbox = []
    for cell in table['cells']:
        if 'segmentation' not in cell:
            cells_bbox.append(None)
            continue
        # Gather subline polygons first, falling back to the cell polygon.
        contours = []
        for subline in cell.get('sublines', []):
            contours.extend(subline['segmentation'])
        if not contours:
            contours = cell['segmentation']
        cells_bbox.append(segmentation_to_bbox(contours) if contours else None)
    return cells_bbox
36
+
37
+
38
def cal_cell_spans(table):
    """Return the grid span ``[x1, y1, x2, y2]`` (all inclusive) of every cell.

    Args:
        table: dict with a 2D integer ``'layout'`` array mapping grid positions
            to cell ids, and a ``'cells'`` list (only its length is used).

    Returns:
        List of ``[col_start, row_start, col_end, row_end]`` per cell id.

    Raises:
        AssertionError: if a cell's occupied positions do not form a full
            rectangle in the layout.
    """
    layout = table['layout']
    num_cells = len(table['cells'])
    cells_span = list()
    for cell_id in range(num_cells):
        cell_positions = np.argwhere(layout == cell_id)
        y1 = np.min(cell_positions[:, 0])
        y2 = np.max(cell_positions[:, 0])
        x1 = np.min(cell_positions[:, 1])
        x2 = np.max(cell_positions[:, 1])
        # Bug fix: spans are inclusive, so the rectangle check must cover the
        # last row/column too. The previous `layout[y1:y2, x1:x2]` excluded
        # them (and checked an empty region for 1-row/1-col cells), so broken
        # non-rectangular layouts slipped through.
        assert np.all(layout[y1:y2 + 1, x1:x2 + 1] == cell_id)
        cells_span.append([x1, y1, x2, y2])
    return cells_span
51
+
52
def cal_fg_bg_span(spans, edge):
    """Split the axis [0, edge] into foreground gaps and background spans.

    Args:
        spans: ordered list of [start, end] spans; entries may be None.
        edge: extent of the axis.

    Returns:
        (fg_spans, bg_spans): the gaps between consecutive spans, and the
        spans that are strictly separated from both neighbours (and from the
        0/edge borders).
    """
    total = len(spans)

    def _left_clear(i):
        # Span must start strictly after its left neighbour (or after 0).
        if i == 0:
            return spans[i][0] > 0
        return spans[i - 1] is not None and spans[i][0] > spans[i - 1][1]

    def _right_clear(i):
        # Span must end strictly before its right neighbour (or before edge).
        if i == total - 1:
            return spans[i][1] < edge
        return spans[i + 1] is not None and spans[i][1] < spans[i + 1][0]

    bg_spans = [spans[i] for i in range(total)
                if spans[i] is not None and _left_clear(i) and _right_clear(i)]

    fg_spans = []
    for i in range(total + 1):
        # Gap i runs from the end of span i-1 (or 0) to the start of span i
        # (or edge); gaps touching a None span are skipped.
        if i == 0:
            start = 0
        elif spans[i - 1] is None:
            continue
        else:
            start = spans[i - 1][1]

        if i == total:
            end = edge
        elif spans[i] is None:
            continue
        else:
            end = spans[i][0]

        if end > start:
            fg_spans.append([start, end])

    return fg_spans, bg_spans
99
+
100
+
101
def shrink_spans(spans, size):
    """Adjust spans so each fits inside (0, size) and no longer overlaps its
    original neighbours.

    Overlaps are resolved symmetrically: each side retreats by
    ``ceil(overlap / 2)``. Comparisons are always made against the *original*
    neighbour spans, not the already-adjusted ones.

    Args:
        spans: ordered list of [start, end] spans.
        size: axis extent; border spans are clamped to [1, size - 1].

    Returns:
        New list of adjusted [start, end] spans.

    Raises:
        InvalidFormat: if resolving a right-side overlap collapses a span
            below a width of 1.
    """
    new_spans = list()
    for idx, (start, end) in enumerate(spans):
        # --- left edge ---
        if idx == 0:
            # First span: keep at least one pixel away from the border.
            if start <= 0:
                start = 1
        else:
            _, pre_end = spans[idx - 1]
            if start <= pre_end:
                # Overlap with the previous span: move our start forward by half.
                shrink_distance = pre_end - start + 1
                start = start + math.ceil(shrink_distance / 2)

        # --- right edge ---
        if idx == len(spans) - 1:
            # Last span: keep away from the far border.
            if end >= size:
                end = size - 1
        else:
            next_start, _ = spans[idx + 1]
            if end >= next_start:
                # Overlap with the next span: pull our end back by half.
                shrink_distance = end - next_start + 1
                end = end - math.ceil(shrink_distance / 2)
            # NOTE(review): this width check only runs for non-last spans as
            # grouped in the source; the original formatting is ambiguous —
            # confirm whether the last span should be validated too.
            if end - start < 1:
                raise InvalidFormat()

        new_spans.append([start, end])
    return new_spans
126
+
127
+
128
def cal_row_span(table, cells_span, cells_bbox, height):
    """Derive per-row pixel spans from cell bboxes, then split them into
    foreground gaps and background separator spans.

    A row's top edge comes from cells whose grid span starts in it, its bottom
    edge from cells ending in it.

    Raises:
        InvalidFormat: when a row has no cell starting or ending in it, or when
            the spans cannot be shrunk into a consistent layout.
    """
    layout = table['layout']
    rows_span = []
    for row_idx in range(layout.shape[0]):
        top_edges = []
        bottom_edges = []
        for cell_id in layout[row_idx, :]:
            span = cells_span[cell_id]
            bbox = cells_bbox[cell_id]
            if bbox is None:
                continue
            if span[1] == row_idx:
                top_edges.append(bbox[1])
            if span[3] == row_idx:
                bottom_edges.append(bbox[3])
        if not top_edges or not bottom_edges:
            raise InvalidFormat()
        # Clamp both edges into [1, height - 1].
        y1 = min(max(1, min(top_edges)), height - 1)
        y2 = min(max(1, max(bottom_edges) + 1), height - 1)
        rows_span.append([y1, y2])
    rows_span = shrink_spans(rows_span, height)
    rows_fg_span, rows_bg_span = cal_fg_bg_span(rows_span, height)
    return rows_fg_span, rows_bg_span
152
+
153
+
154
def cal_col_span(table, cells_span, cells_bbox, width):
    """Column counterpart of ``cal_row_span``: derive per-column pixel spans
    from cell bboxes, then split them into foreground and background spans.

    Raises:
        InvalidFormat: when a column has no cell starting or ending in it, or
            when the spans cannot be shrunk into a consistent layout.
    """
    layout = table['layout']
    cols_span = []
    for col_idx in range(layout.shape[1]):
        left_edges = []
        right_edges = []
        for cell_id in layout[:, col_idx]:
            span = cells_span[cell_id]
            bbox = cells_bbox[cell_id]
            if bbox is None:
                continue
            if span[0] == col_idx:
                left_edges.append(bbox[0])
            if span[2] == col_idx:
                right_edges.append(bbox[2])
        if not left_edges or not right_edges:
            raise InvalidFormat()
        # Clamp both edges into [1, width - 1].
        x1 = min(max(1, min(left_edges)), width - 1)
        x2 = min(max(1, max(right_edges) + 1), width - 1)
        cols_span.append([x1, x2])
    cols_span = shrink_spans(cols_span, width)
    cols_fg_span, cols_bg_span = cal_fg_bg_span(cols_span, width)
    return cols_fg_span, cols_bg_span
178
+
179
+
180
def extract_fg_bg_spans(table, image_size):
    """Compute foreground (gap) and background (separator) spans for both axes.

    Args:
        table: table annotation with 'layout' and 'cells'.
        image_size: (width, height) of the table image.

    Returns:
        (rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span).

    Raises:
        InvalidFormat: propagated when the annotation geometry is inconsistent.
    """
    width, height = image_size
    cells_bbox = cal_cell_bbox(table)
    cells_span = cal_cell_spans(table)
    # cal rows fg bg span
    rows_fg_span, rows_bg_span = cal_row_span(table, cells_span, cells_bbox, height)
    # cal cols fg bg span
    cols_fg_span, cols_bg_span = cal_col_span(table, cells_span, cells_bbox, width)
    return rows_fg_span, rows_bg_span, cols_fg_span, cols_bg_span
libs/model/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import Model
2
+ import torch
3
+ from torch import nn
4
+ from libs.utils.comm import get_world_size
5
+
6
+
7
def build_model(cfg):
    """Build the table-structure model.

    Args:
        cfg: experiment configuration forwarded to ``Model``.

    Returns:
        A ``Model`` instance using ``nn.BatchNorm2d`` normalization.
    """
    # NOTE(review): the original branched on get_world_size() but selected
    # nn.BatchNorm2d in BOTH branches — the dead branch is removed here.
    # If the distributed path was meant to use nn.SyncBatchNorm, restore the
    # branch deliberately; behavior is unchanged as written.
    norm_layer = nn.BatchNorm2d
    model = Model(
        cfg,
        norm_layer=norm_layer
    )
    return model
libs/model/backbone.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import Tensor
3
+ import torch.nn as nn
4
+ from typing import Type, Any, Callable, Union, List, Optional
5
+
6
+
7
+
8
# Filesystem locations of ImageNet-pretrained ResNet checkpoints, consumed by
# `_resnet` when pretrained=True.
# NOTE(review): these are hard-coded, cluster-specific absolute paths — they
# must exist on the machine running training or `torch.load` will fail.
model_paths = {
    'resnet18': '/yrfs2/cv6/frwang/PretrainedModelParams/pytorch/ImageNet/ResNet/resnet18-5c106cde.pth',
    'resnet34': '/Pretrain/resnet_34.pth',
    'resnet50': '/yrfs2/cv6/frwang/PretrainedModelParams/pytorch/ImageNet/ResNet/resnet50-19c8e357.pth',
    'resnet101': '/yrfs2/cv6/frwang/PretrainedModelParams/pytorch/ImageNet/ResNet/resnet101-5d3b4d8f.pth',
    'resnet152': '/yrfs2/cv6/frwang/PretrainedModelParams/pytorch/ImageNet/ResNet/resnet152-b121ed2d.pth',
}
15
+
16
+
17
def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    """3x3 convolution, bias-free, with reflect padding sized to the dilation."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
        padding_mode='reflect',
    )
21
+
22
+
23
def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """1x1 convolution (channel projection), bias-free."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
26
+
27
+
28
class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions (ResNet-18/34 style)."""

    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None
    ) -> None:
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # conv1 (together with the optional downsample) carries the stride.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        # Shortcut path: identity, or projected when shapes change.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)
75
+
76
+
77
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck residual block.

    ResNet V1.5 variant: the stride sits on the 3x3 conv (conv2), not the
    first 1x1 conv, per
    https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    """

    expansion: int = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None
    ) -> None:
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # conv2 (together with the optional downsample) carries the stride.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        # Shortcut path: identity, or projected when shapes change.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)
133
+
134
+
135
class ResNet(nn.Module):
    """ResNet backbone returning the four stage feature maps (c2..c5).

    Differences from torchvision's ResNet: the stem conv uses stride 1 with
    reflect padding and there is no max-pool / avg-pool / fc head, so the
    stage output strides relative to the input are 1, 2, 4 and 8.
    """

    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None
    ) -> None:
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        # Stem: stride 1 (torchvision uses stride 2) so resolution is preserved.
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=1, padding=3,
                               bias=False, padding_mode='reflect')
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])

        # He initialization for convs; unit scale / zero shift for norms.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
                    stride: int = 1, dilate: bool = False) -> nn.Sequential:
        """Build one residual stage of `blocks` blocks; mutates self.inplanes/self.dilation."""
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            # Trade stride for dilation: same receptive field, no downsampling.
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut when spatial or channel shape changes.
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        # NOTE: despite the `-> Tensor` annotation kept from torchvision, this
        # returns a 4-tuple of stage features.
        input = x  # kept as-is (unused)
        x = self.conv1(x)  # stride 1: full input resolution
        x = self.bn1(x)
        x = self.relu(x)

        c2 = self.layer1(x)   # total stride 1
        c3 = self.layer2(c2)  # total stride 2
        c4 = self.layer3(c3)  # total stride 4
        c5 = self.layer4(c4)  # total stride 8
        return c2, c3, c4, c5

    def forward(self, x: Tensor) -> Tensor:
        """Return the (c2, c3, c4, c5) multi-scale feature maps."""
        return self._forward_impl(x)
231
+
232
+
233
def _resnet(
    arch: str,
    block: Type[Union[BasicBlock, Bottleneck]],
    layers: List[int],
    pretrained: bool,
    **kwargs: Any
) -> ResNet:
    """Build a ResNet and optionally load matching pretrained weights.

    Only checkpoint entries whose name AND shape match the freshly built model
    are copied, so architecture tweaks (e.g. the reflect-padded stem) still
    load the remaining weights cleanly.
    """
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        checkpoint = torch.load(model_paths[arch], map_location='cpu')
        state_dict = model.state_dict()
        for name, param in state_dict.items():
            pretrained_param = checkpoint.get(name)
            if pretrained_param is not None and pretrained_param.shape == param.shape:
                state_dict[name] = pretrained_param
        model.load_state_dict(state_dict)
    return model
250
+
251
+
252
def resnet18(pretrained: bool = False, **kwargs: Any) -> ResNet:
    """ResNet-18 (BasicBlock x [2, 2, 2, 2])."""
    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, **kwargs)


def resnet34(pretrained: bool = False, **kwargs: Any) -> ResNet:
    """ResNet-34 (BasicBlock x [3, 4, 6, 3])."""
    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, **kwargs)


def resnet50(pretrained: bool = False, **kwargs: Any) -> ResNet:
    """ResNet-50 (Bottleneck x [3, 4, 6, 3])."""
    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, **kwargs)


def resnet101(pretrained: bool = False, **kwargs: Any) -> ResNet:
    """ResNet-101 (Bottleneck x [3, 4, 23, 3])."""
    return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, **kwargs)


def resnet152(pretrained: bool = False, **kwargs: Any) -> ResNet:
    """ResNet-152 (Bottleneck x [3, 8, 36, 3])."""
    return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, **kwargs)
270
+
271
+
272
def build_backbone(arch, pretrained=True, norm_layer=nn.BatchNorm2d):
    """Instantiate a ResNet backbone by short name.

    Args:
        arch: one of 'res18', 'res34', 'res50', 'res101', 'res152'.
        pretrained: load the locally cached ImageNet weights when True.
        norm_layer: normalization layer class forwarded to the ResNet.

    Returns:
        The constructed ResNet.

    Raises:
        ValueError: for an unknown arch string.
    """
    arch_map = {
        # 'res18' was missing even though resnet18 (and its checkpoint path)
        # exists in this module — added for completeness; existing keys are
        # unchanged.
        'res18': resnet18,
        'res34': resnet34,
        'res50': resnet50,
        'res101': resnet101,
        'res152': resnet152
    }
    if arch not in arch_map:
        raise ValueError('Unknown backbone arch: %s' % arch)
    return arch_map[arch](pretrained=pretrained, norm_layer=norm_layer)
libs/model/cells_extractor.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+ from .extractor import RoiPosFeatExtraxtor
6
+
7
+
8
+ class SALayer(nn.Module):
9
+ def __init__(self, in_dim, att_dim, head_nums):
10
+ super().__init__()
11
+ self.in_dim = in_dim
12
+ self.att_dim = att_dim
13
+ self.head_nums = head_nums
14
+
15
+ assert self.in_dim % self.head_nums == 0
16
+
17
+ self.key_layer = nn.Conv1d(self.in_dim, self.att_dim, 1, 1, 0)
18
+ self.query_layer = nn.Conv1d(self.in_dim, self.att_dim, 1, 1, 0)
19
+ self.value_layer = nn.Conv1d(self.in_dim, self.in_dim, 1, 1, 0)
20
+ self.scale = 1 / math.sqrt(self.att_dim)
21
+
22
+ def forward(self, feats, masks=None):
23
+ bs, c, n = feats.shape
24
+ keys = self.key_layer(feats).reshape(bs, -1, self.head_nums, n)
25
+ querys = self.query_layer(feats).reshape(bs, -1, self.head_nums, n)
26
+ values = self.value_layer(feats).reshape(bs, -1, self.head_nums, n)
27
+
28
+ logits = torch.einsum('bchk,bchq->bhkq', keys, querys) * self.scale
29
+ if masks is not None:
30
+ logits = logits - (1 - masks[:, None, :, None]) * 1e8
31
+ weights = torch.softmax(logits, dim=2)
32
+
33
+ new_feats = torch.einsum('bchk,bhkq->bchq', values, weights)
34
+ new_feats = new_feats.reshape(bs, -1, n)
35
+ return new_feats + feats
36
+
37
+
38
def gen_cells_bbox(row_segments, col_segments, device):
    """Enumerate per-image cell bboxes from row/column separator positions.

    Args:
        row_segments / col_segments: per image, the ordered separator
            coordinates; N separators delimit N-1 rows (or columns).
        device: target device for the bbox tensors.

    Returns:
        List of (num_cells, 4) float tensors, one per image, cells in
        row-major order, each row being [x1, y1, x2, y2].
    """
    cells_bbox = []
    for rows, cols in zip(row_segments, col_segments):
        # Row-major enumeration of all grid cells.
        boxes = [
            [cols[c], rows[r], cols[c + 1], rows[r + 1]]
            for r in range(len(rows) - 1)
            for c in range(len(cols) - 1)
        ]
        cells_bbox.append(torch.tensor(boxes, dtype=torch.float, device=device))
    return cells_bbox
56
+
57
+
58
def align_cells_feat(cells_feat, num_rows, num_cols):
    """Zero-pad per-image cell features onto a common (rows, cols) grid.

    Args:
        cells_feat: list of (num_cells, C) tensors, cells in row-major order.
        num_rows / num_cols: grid dimensions per image.

    Returns:
        A (batch, C, max_rows, max_cols) tensor of padded grids, plus a
        (batch, max_rows, max_cols) validity mask (1 inside each image's
        grid, 0 in the padding).
    """
    batch_size = len(cells_feat)
    dtype = cells_feat[0].dtype
    device = cells_feat[0].device

    max_rows = max(num_rows)
    max_cols = max(num_cols)

    masks = torch.zeros([batch_size, max_rows, max_cols], dtype=dtype, device=device)
    padded = []
    for i, (feat, rows, cols) in enumerate(zip(cells_feat, num_rows, num_cols)):
        # (num_cells, C) -> (C, rows, cols), then pad to the common grid size.
        grid = feat.transpose(0, 1).reshape(-1, rows, cols)
        padded.append(F.pad(
            grid,
            (0, max_cols - cols, 0, max_rows - rows, 0, 0),
            mode='constant',
            value=0
        ))
        masks[i, :rows, :cols] = 1
    return torch.stack(padded, dim=0), masks
84
+
85
+
86
class CellsExtractor(nn.Module):
    """Pool one feature vector per grid cell (RoI extraction over the row/col
    grid), then refine the cell grid with alternating column-wise and
    row-wise self-attention rounds."""

    def __init__(self, in_dim, cell_dim, heads, head_nums, pool_size, scale=1):
        super().__init__()
        self.in_dim = in_dim        # channels of the incoming feature map
        self.cell_dim = cell_dim    # channels of each pooled cell feature
        self.pool_size = pool_size  # RoI extractor output size
        self.scale = scale          # feature-map scale passed to the extractor
        self.box_feat_extractor = RoiPosFeatExtraxtor(
            self.scale,
            self.pool_size,
            self.in_dim,
            self.cell_dim
        )
        self.heads = heads          # number of alternating row/col attention rounds
        self.row_sas = nn.ModuleList()
        self.col_sas = nn.ModuleList()
        for _ in range(self.heads):
            self.row_sas.append(SALayer(cell_dim, cell_dim, head_nums))
            self.col_sas.append(SALayer(cell_dim, cell_dim, head_nums))


    def forward(self, feats, row_segments, col_segments, img_sizes):
        """Extract and refine cell features.

        Args:
            feats: batched feature map the RoI extractor pools from.
            row_segments / col_segments: per-image separator positions;
                N separators delimit N-1 rows (columns).
            img_sizes: per-image sizes forwarded to the RoI extractor.

        Returns:
            (aligned_cells_feat, masks): a (bs, cell_dim, max_rows, max_cols)
            grid of refined cell features and its validity mask.
        """
        device = feats.device
        num_rows = [len(row_segments_pi) - 1 for row_segments_pi in row_segments]
        num_cols = [len(col_segments_pi) - 1 for col_segments_pi in col_segments]

        # One bbox per grid cell, then RoI-pool a feature for each.
        cells_bbox = gen_cells_bbox(row_segments, col_segments, device)
        cells_feat = self.box_feat_extractor(feats, cells_bbox, img_sizes)

        aligned_cells_feat, masks = align_cells_feat(cells_feat, num_rows, num_cols)

        bs, c, nr, nc = aligned_cells_feat.shape

        for idx in range(self.heads):
            # Column pass: every grid row becomes an independent sequence of
            # nc cells -> attention along the column axis.
            col_cells_feat = aligned_cells_feat.permute(0, 2, 1, 3).contiguous().reshape(bs * nr, c, nc)
            col_masks = masks.reshape(bs * nr, nc)
            col_cells_feat = self.col_sas[idx](col_cells_feat, col_masks)  # self-attention
            aligned_cells_feat = col_cells_feat.reshape(bs, nr, c, nc).permute(0, 2, 1, 3).contiguous()

            # Row pass: every grid column becomes an independent sequence of
            # nr cells -> attention along the row axis.
            row_cells_feat = aligned_cells_feat.permute(0, 3, 1, 2).contiguous().reshape(bs * nc, c, nr)
            row_masks = masks.transpose(1, 2).reshape(bs * nc, nr)
            row_cells_feat = self.row_sas[idx](row_cells_feat, row_masks)  # self-attention
            aligned_cells_feat = row_cells_feat.reshape(bs, nc, c, nr).permute(0, 2, 3, 1).contiguous()

        return aligned_cells_feat, masks
libs/model/decoder.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from numpy.core.fromnumeric import argmax
3
+ import torch
4
+ from torch import nn
5
+ from torch._C import device, dtype, layout
6
+ from torch.nn import functional as F
7
+ from torch.nn.functional import cross_entropy, embedding
8
+ from torch.nn.modules import loss
9
+ from torch.nn.modules.activation import Tanh
10
+ from libs.utils.metric import CellMergeAcc, AccMetric
11
+ from .utils import gen_proposals
12
+
13
+
14
class ImageAttention(nn.Module):
    """Coverage-style spatial attention over a 2D feature map.

    Attention logits fuse the key map, the (transformed) query vector, the
    previous attention weight map and its cumulative coverage. In training
    mode the attended output is teacher-forced from the ground-truth layout;
    in inference mode candidate layout proposals are generated, scored, and
    pruned to the top 6 (beam-search style over the batch dimension).
    """

    def __init__(self, key_dim, query_dim, cover_kernel):
        super().__init__()
        self.query_transform = nn.Linear(query_dim, key_dim)
        # Coverage convolutions lift the (1-channel) weight maps to key_dim.
        self.weight_transform = nn.Conv2d(1, key_dim, cover_kernel, 1, padding=cover_kernel // 2)
        self.cum_weight_transform = nn.Conv2d(1, key_dim, cover_kernel, 1, padding=cover_kernel // 2)
        self.logit_transform = nn.Conv2d(key_dim, 1, 1, 1, 0)

    def forward(self, key, key_mask, query, spatial_att_weight, cum_spatial_att_weight, value, state, layouts=None, layouts_cum=None, spatial_att_weight_scores=None):
        """Compute new attention maps and attended outputs.

        Training returns (state, outputs, logit, weight, cum_weight, None, None);
        inference returns the same first five plus the stacked layout proposals
        and their sorted scores (top 6 beams).
        """
        query = self.query_transform(query)
        weight_query = self.weight_transform(spatial_att_weight)
        cum_weight_query = self.cum_weight_transform(cum_spatial_att_weight)
        # Additive fusion of content (key), query, and coverage terms.
        fusion = key + query[:, :, None, None] + weight_query + cum_weight_query
        # cal new_spatial_att_logit
        new_spatial_att_logit = self.logit_transform(torch.tanh(fusion))
        # cal new_spatial_att_weight: mask out invalid positions, softmax over H*W.
        new_spatial_att_weight = new_spatial_att_logit - (1 - key_mask) * 1e8
        bs, _, h, w = new_spatial_att_weight.shape
        new_spatial_att_weight = new_spatial_att_weight.reshape(bs, h * w)
        new_spatial_att_weight = torch.softmax(new_spatial_att_weight, dim=1).reshape(bs, 1, h, w)
        # cal new_cum_spatial_att_weight
        if self.training:
            # Teacher forcing: attend over the ground-truth layout cells (mean
            # of value features where layout == 1); empty layouts yield zeros.
            outputs = list()
            for (value_pi, layout) in zip(value, layouts):
                h, w = torch.where(layout == 1.)
                if len(h) == 0 or len(w) == 0:
                    outputs.append(torch.zeros_like(query[0]))
                else:
                    outputs.append(value_pi[:, h, w].mean(-1))
            outputs = torch.stack(outputs, dim=0)
            # Coverage accumulates the ground-truth layout, clamped to 1.
            new_cum_spatial_att_weight = torch.clamp(layouts.unsqueeze(1).float() + cum_spatial_att_weight, max=1.)
            return state, outputs, new_spatial_att_logit, new_spatial_att_weight, new_cum_spatial_att_weight, None, None
        else:
            # Inference: expand each active beam with layout proposals from
            # gen_proposals, then keep the 6 best-scoring beams.
            state_list = list()
            outputs_list = list()
            scores_list = list()
            proposals_list = list()
            new_spatial_att_weight_list = list()
            new_cum_spatial_att_weight_list = list()
            layouts_pred = new_spatial_att_logit.squeeze(1).sigmoid()
            for idx, (value_pi, state_pi, layout) in enumerate(zip(value, state, layouts_pred)):
                if cum_spatial_att_weight[idx].min() == 1:
                    # Beam fully covered: carry it forward unchanged with an
                    # all-zero placeholder proposal and zero output.
                    state_list.append(state_pi)
                    outputs_list.append(torch.zeros_like(query[0]))
                    proposals_list.append(torch.cat((layouts_cum[idx], torch.zeros_like(layout.unsqueeze(0))), dim=0))
                    scores_list.append(spatial_att_weight_scores[idx])
                    new_spatial_att_weight_list.append(new_spatial_att_weight[idx])
                    new_cum_spatial_att_weight_list.append(cum_spatial_att_weight[idx])
                else:
                    # Anchor at the top-left-most uncovered position, then
                    # generate candidate cell layouts from there.
                    srow, scol = torch.where(cum_spatial_att_weight[idx].squeeze(0) == cum_spatial_att_weight[idx].squeeze(0).min())
                    scol = scol[srow == srow.min()].min()
                    srow = srow.min()
                    proposals, scores = gen_proposals(layout, srow, scol, score_threshold=0.5)
                    # Beam score = proposal score + accumulated beam score.
                    scores = scores + spatial_att_weight_scores[idx]
                    for s in scores:
                        scores_list.append(s)
                    for p in proposals:
                        proposals_list.append(torch.cat((layouts_cum[idx], p.unsqueeze(0)), dim=0))
                        h, w = torch.where(p == 1.)
                        outputs_list.append(value_pi[:, h, w].mean(-1))
                        state_list.append(state_pi)
                        new_spatial_att_weight_list.append(new_spatial_att_weight[idx])
                        new_cum_spatial_att_weight_list.append(torch.clamp(cum_spatial_att_weight[idx] + p.unsqueeze(0), max=1.))
            state_list = torch.stack(state_list, dim=0)
            proposals_list = torch.stack(proposals_list, dim=0)
            scores_list = torch.stack(scores_list, dim=0)
            outputs_list = torch.stack(outputs_list, dim=0)
            new_spatial_att_weight_list = torch.stack(new_spatial_att_weight_list, dim=0)
            new_cum_spatial_att_weight_list = torch.stack(new_cum_spatial_att_weight_list, dim=0)
            # Keep the 6 highest-scoring candidate beams.
            sorted_scores, sorted_idxes = torch.sort(scores_list, dim=0, descending=True)
            sorted_scores = sorted_scores[:6]
            sorted_idxes = sorted_idxes[:6]
            proposals = proposals_list[sorted_idxes]
            new_spatial_att_weight = new_spatial_att_weight_list[sorted_idxes]
            new_cum_spatial_att_weight = new_cum_spatial_att_weight_list[sorted_idxes]
            outputs = outputs_list[sorted_idxes]
            state = state_list[sorted_idxes]
            return state, outputs, new_spatial_att_logit, new_spatial_att_weight, new_cum_spatial_att_weight, proposals, sorted_scores
92
+
93
+
94
+
95
+ class Decoder(nn.Module):
96
+ def __init__(self, vocab, embed_dim, feat_dim, lm_state_dim, proj_dim, cover_kernel, att_threshold, spatial_att_logit_loss_wight):
97
+ super().__init__()
98
+ self.vocab = vocab
99
+ self.embed_dim = embed_dim
100
+ self.feat_dim = feat_dim
101
+ self.lm_state_dim = lm_state_dim
102
+ self.proj_dim = proj_dim
103
+ self.cover_kernel = cover_kernel
104
+ self.att_threshold = att_threshold
105
+ self.spatial_att_logit_loss_wight = spatial_att_logit_loss_wight
106
+ self.feat_projection = nn.Conv2d(self.feat_dim, self.proj_dim, 1, 1, 0)
107
+ self.state_init_projection = nn.Conv2d(self.feat_dim, self.lm_state_dim, 1, 1, 0)
108
+ self.lm_rnn1 = nn.GRUCell(input_size=self.feat_dim, hidden_size=self.lm_state_dim)
109
+ self.lm_rnn2 = nn.GRUCell(input_size=self.feat_dim, hidden_size=self.lm_state_dim)
110
+ self.image_attention = ImageAttention(self.proj_dim, self.feat_dim + self.lm_state_dim, cover_kernel)
111
+ self.struct_cls = nn.Sequential(
112
+ nn.Linear(self.feat_dim + self.lm_state_dim, self.lm_state_dim),
113
+ nn.Tanh(),
114
+ nn.Linear(self.lm_state_dim, len(self.vocab))
115
+ )
116
+
117
+ def init_state(self, feats, feats_mask):
118
+ bs, _, h, w = feats.shape
119
+ project_feats = self.feat_projection(feats) * feats_mask
120
+ init_state = torch.sum(self.state_init_projection(feats), dim=(2, 3))/torch.sum(feats_mask, dim=(2, 3))
121
+ init_context = torch.sum(feats, dim=(2, 3)) / torch.sum(feats_mask, dim=(2, 3))
122
+ init_spatial_att_weight = torch.zeros([bs, 1, h, w], dtype=torch.float, device=feats.device)
123
+ init_cum_spatial_att_weight = torch.zeros([bs, 1, h, w], dtype=torch.float, device=feats.device)
124
+ return project_feats, init_state, init_context, init_spatial_att_weight, init_cum_spatial_att_weight
125
+
126
+ def step(self, feats, project_feats, feats_mask, state, context, spatial_att_weight, cum_spatial_att_weight, layouts=None, layouts_cum=None, spatial_att_weight_scores=None):
127
+ new_state = self.lm_rnn1(context, state)
128
+ new_state, new_context, new_spatial_att_logit, \
129
+ new_spatial_att_weight, new_cum_spatial_att_weight, \
130
+ layouts_cum, spatial_att_weight_scores = self.image_attention(
131
+ project_feats,
132
+ feats_mask,
133
+ torch.cat([context, new_state], dim=1),
134
+ spatial_att_weight,
135
+ cum_spatial_att_weight,
136
+ feats,
137
+ new_state,
138
+ layouts,
139
+ layouts_cum,
140
+ spatial_att_weight_scores
141
+ )
142
+ new_state = self.lm_rnn2(new_context, new_state)
143
+ cls_feat = torch.cat([new_context, new_state], dim=1)
144
+ cls_logits_pt = self.struct_cls(cls_feat)
145
+ return cls_logits_pt, new_state, new_context, new_spatial_att_logit, new_spatial_att_weight, new_cum_spatial_att_weight, layouts_cum, spatial_att_weight_scores
146
+
147
+ def forward(self, feats, feats_mask, cls_labels=None, labels_mask=None, layouts=None):
148
+ if self.training:
149
+ return self.forward_backward(feats, feats_mask, cls_labels, labels_mask, layouts)
150
+ else:
151
+ return self.inference(feats, feats_mask)
152
+
153
+ def inference(self, feats, feats_mask):
154
+ bs, _, h, w = feats.shape
155
+ device = feats.device
156
+ assert bs == 1, print('bs should be 1')
157
+ layouts_cum = torch.zeros_like(feats[:, : 1])
158
+ spatial_att_weight_scores = torch.zeros(bs).to(device=device, dtype=feats.dtype)
159
+
160
+ project_feats, init_state, init_context, spatial_att_weight, cum_spatial_att_weight = self.init_state(feats, feats_mask)
161
+ state = init_state
162
+ context = init_context
163
+
164
+ for _ in range(h*w):
165
+ cls_logits_pt, state, context, spatial_att_logit, spatial_att_weight, \
166
+ cum_spatial_att_weight, layouts_cum, spatial_att_weight_scores \
167
+ = self.step(
168
+ feats, project_feats,
169
+ feats_mask, state, context,
170
+ spatial_att_weight, cum_spatial_att_weight, None, layouts_cum, spatial_att_weight_scores)
171
+ feats = feats[:1].repeat(layouts_cum.shape[0], 1, 1, 1)
172
+ feats_mask = feats_mask[:1].repeat(layouts_cum.shape[0], 1, 1, 1)
173
+ project_feats = project_feats[:1].repeat(layouts_cum.shape[0], 1, 1, 1)
174
+ if cum_spatial_att_weight.min() == 1:
175
+ break
176
+ spatial_att_logit_preds = layouts_cum[spatial_att_weight_scores.argmax(), 1:].unsqueeze(0)
177
+ return spatial_att_logit_preds, {}
178
+
179
    def forward_backward(self, feats, feats_mask, cls_labels, labels_mask, layouts):
        """Teacher-forced training pass.

        Runs the decoder for ``max_length`` steps, accumulating the
        structure-token classification loss and the per-step spatial
        attention (cell region) loss, plus accuracy metrics.

        Returns:
            (spatial_att_logit_preds, loss_cache)
        """
        device = feats.device
        # Number of supervised steps per sample.
        valid_cls_length = torch.sum((labels_mask == 1) & (cls_labels != -1), dim=1).detach()
        # Number of cells per sample (layout ids run 0..max).
        valid_spatial_att_logit_length = torch.stack([layout.max() + 1 for layout in layouts])
        max_length = valid_cls_length.max()

        project_feats, init_state, init_context, spatial_att_weight, cum_spatial_att_weight = self.init_state(feats, feats_mask)
        state = init_state
        context = init_context

        loss_cache = dict()

        cls_loss = list()
        cls_preds = list()

        spatial_att_logit_loss = list()
        spatial_att_logit_preds = list()
        spatial_att_logit_masks = list()
        spatial_att_logit_labels = list()
        for time_t in range(max_length):
            # Teacher forcing: the ground-truth region of the current cell
            # (layouts == time_t) is fed into the attention step.
            cls_logits_pt, state, context, spatial_att_logit, spatial_att_weight, cum_spatial_att_weight, *_ \
                = self.step(
                    feats, project_feats,
                    feats_mask, state, context,
                    spatial_att_weight, cum_spatial_att_weight, layouts == time_t
                )

            cls_label = cls_labels[:, time_t]
            label_mask = labels_mask[:, time_t]
            # cal cls loss
            cls_loss_pt = F.cross_entropy(cls_logits_pt, cls_label, ignore_index=-1, reduction='none') * label_mask
            cls_loss.append(cls_loss_pt)
            # save for acc
            cls_preds.append(torch.argmax(cls_logits_pt, dim=1).detach())

            spatial_att_logit_preds.append(spatial_att_logit.sigmoid() > self.att_threshold)
            spatial_att_logit_masks.append((layouts != -1).unsqueeze(1))
            spatial_att_logit_labels.append((layouts == time_t).unsqueeze(1))
            # cal spatial att loss
            spatial_att_logit_loss_pt = list()
            for spatial_att_logit_pi, layout in zip(spatial_att_logit, layouts):
                target = layout == time_t
                # Samples whose cell sequence is exhausted at this step
                # contribute zero loss.
                if torch.any(target) == False:
                    spatial_att_logit_loss_pt_pi = torch.tensor(0.0, dtype=torch.float, device=device)
                else:
                    mask = (layout != -1).float()
                    spatial_att_logit_loss_pt_pi = F.binary_cross_entropy_with_logits(
                        spatial_att_logit_pi,
                        target.float().unsqueeze(0),
                        reduction='none'
                    )
                    # Only grid positions inside the table layout count.
                    spatial_att_logit_loss_pt_pi = (spatial_att_logit_loss_pt_pi * mask).sum()
                spatial_att_logit_loss_pt.append(spatial_att_logit_loss_pt_pi)
            spatial_att_logit_loss_pt = torch.stack(spatial_att_logit_loss_pt, dim=0)
            spatial_att_logit_loss.append(spatial_att_logit_loss_pt)

        # Per-sample sums normalized by valid lengths, then batch-mean.
        cls_loss = torch.mean(torch.sum(torch.stack(cls_loss, dim=1), dim=1)/valid_cls_length)
        spatial_att_logit_loss = self.spatial_att_logit_loss_wight * torch.mean(torch.sum(torch.stack(spatial_att_logit_loss, dim=1), dim=1) / valid_spatial_att_logit_length)

        loss_cache['cls_loss'] = cls_loss
        loss_cache['spatial_att_logit_loss'] = spatial_att_logit_loss

        cls_preds = torch.stack(cls_preds, dim=1)
        spatial_att_logit_preds = torch.stack(spatial_att_logit_preds, dim=1)
        spatial_att_logit_masks = torch.stack(spatial_att_logit_masks, dim=1)
        spatial_att_logit_labels = torch.stack(spatial_att_logit_labels, dim=1)

        # Accuracy metrics: overall, per special-token class, and cell merge.
        acc_metric = AccMetric()
        cell_merge_acc = CellMergeAcc()
        cls_correct, cls_total = acc_metric(cls_preds, cls_labels, labels_mask)
        cls_none_correct, cls_none_total = acc_metric(cls_preds, cls_labels, (labels_mask == 1) & (cls_labels == self.vocab.none_id))
        cls_bold_correct, cls_bold_total = acc_metric(cls_preds, cls_labels, (labels_mask == 1) & (cls_labels == self.vocab.bold_id))
        cls_space_correct, cls_space_total = acc_metric(cls_preds, cls_labels, (labels_mask == 1) & (cls_labels == self.vocab.space_id))
        cls_blank_correct = cls_none_correct + cls_bold_correct + cls_space_correct
        cls_blank_total = cls_none_total + cls_bold_total + cls_space_total
        cells_correct_nums, cells_total_nums = cell_merge_acc(spatial_att_logit_preds, spatial_att_logit_labels, spatial_att_logit_masks)
        loss_cache['cls_acc'] = cls_correct / cls_total
        loss_cache['cls_none_acc'] = cls_none_correct / cls_none_total
        loss_cache['cls_bold_acc'] = cls_bold_correct / cls_bold_total
        loss_cache['cls_space_acc'] = cls_space_correct / cls_space_total
        loss_cache['cls_blank_acc'] = cls_blank_correct / cls_blank_total
        loss_cache['spatial_att_logit_acc'] = cells_correct_nums / cells_total_nums

        return (spatial_att_logit_preds), loss_cache
263
+
264
def build_decoder(cfg):
    """Factory: construct a :class:`Decoder` from config fields.

    NOTE(review): the keyword set used here (line_dim, hidden_dim,
    max_length) differs from the positional arguments used where Decoder
    is constructed in model.py — confirm both call sites match
    ``Decoder.__init__``.
    """
    decoder = Decoder(
        vocab=cfg.vocab,
        feat_dim=cfg.encode_dim,
        line_dim=cfg.extractor_dim,
        embed_dim=cfg.embed_dim,
        lm_state_dim=cfg.lm_state_dim,
        proj_dim=cfg.proj_dim,
        hidden_dim=cfg.hidden_dim,
        cover_kernel=cfg.cover_kernel,
        max_length=cfg.max_length
    )
    return decoder
277
+
libs/model/divide_predictor.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import functional as F
4
+ from .sa import SALayer
5
+ from libs.utils.metric import cal_cls_acc
6
+
7
+
8
def align_segments_feat(segments_feat):
    """Right-pad a list of (C, n_i) segment-feature tensors to a common
    width and stack them into one (B, C, n_max) batch.

    Returns:
        (padded, masks): the stacked features and a (B, n_max) float mask
        holding 1 for real segment slots and 0 for padding.
    """
    ref = segments_feat[0]
    n_max = max(feat.shape[1] for feat in segments_feat)
    masks = torch.zeros([len(segments_feat), n_max], dtype=ref.dtype, device=ref.device)

    padded = []
    for idx, feat in enumerate(segments_feat):
        n_cur = feat.shape[1]
        masks[idx, :n_cur] = 1
        # Pad only the segment axis (last dim) on the right.
        padded.append(F.pad(feat, (0, n_max - n_cur, 0, 0), mode='constant', value=0))
    return torch.stack(padded, dim=0), masks
29
+
30
+
31
class HeadBodyDividePredictor(nn.Module):
    """Predicts which row segment divides the table head from the body.

    Gathers one feature column per row-segment position, fuses them with
    multi-head self-attention, and classifies exactly one divider index
    per sample.
    """

    def __init__(self, in_dim, head_nums, scale=1):
        super().__init__()
        self.in_dim = in_dim
        # Image -> feature-map coordinate ratio for segment positions.
        self.scale = scale
        self.fusion_layer = SALayer(in_dim, in_dim, head_nums)
        self.classifier= nn.Conv1d(in_dim, 1, 1, 1, 0)

    def forward(self, feats, segments, divide_labels=None):
        """Return (divide_preds, result_info, ext_info).

        Args:
            feats: per-sample (C, N) row-feature maps.
            segments: per-sample row-segment positions in image coords.
            divide_labels: GT divider indices (training only).
        """
        # Map segment positions into feature-map coordinates.
        segments = [[int(subitem * self.scale) for subitem in item] for item in segments]
        # One feature column per segment position.
        segments_feat = [feats_pi[:, segments_pi] for feats_pi, segments_pi in zip(feats, segments)]
        aligned_segments_feat, masks = align_segments_feat(segments_feat)
        aligned_segments_feat = self.fusion_layer(aligned_segments_feat, masks)
        divide_logits = self.classifier(aligned_segments_feat).squeeze(1)
        # Push padded slots to a huge negative value before argmax / CE.
        divide_logits = divide_logits - (1 - masks) * 1e8
        divide_preds = torch.argmax(divide_logits, dim=1)

        result_info = dict()
        ext_info = dict()
        if self.training:
            result_info['divide_loss'] = F.cross_entropy(divide_logits, divide_labels)
            correct_nums, total_nums = cal_cls_acc(divide_preds, divide_labels)
            if total_nums != 0:
                result_info['divide_acc'] = correct_nums / total_nums

        divide_preds = divide_preds.detach().cpu().tolist()
        return divide_preds, result_info, ext_info
libs/model/extractor.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torchvision.ops import roi_align
4
+
5
+
6
def convert_to_roi_format(lines_box):
    """Concatenate per-image box tensors into ROI-align format.

    Args:
        lines_box: list of (n_i, 4) box tensors, one entry per image.

    Returns:
        (sum(n_i), 5) tensor whose rows are (batch_index, x1, y1, x2, y2).
    """
    boxes = torch.cat(lines_box, dim=0)
    # One batch-index column per image, matching its box count.
    batch_ids = [
        torch.full((per_image.shape[0], 1), img_idx, dtype=boxes.dtype, device=boxes.device)
        for img_idx, per_image in enumerate(lines_box)
    ]
    return torch.cat([torch.cat(batch_ids, dim=0), boxes], dim=1)
18
+
19
+
20
class RoiFeatExtraxtor(nn.Module):
    """ROI-align based line feature extractor.

    Pools a fixed-size window per line box from the feature map and
    projects the flattened window to ``output_dim`` with a small MLP.
    (The "Extraxtor" typo is kept: it is the public class name.)
    """

    def __init__(self, scale, pool_size, input_dim, output_dim):
        super().__init__()
        # Feature-map / image coordinate ratio passed to roi_align.
        self.scale = scale
        # (h, w) of the pooled window.
        self.pool_size = pool_size
        self.output_dim = output_dim
        # MLP input is the flattened pooled window.
        input_dim = input_dim * self.pool_size[0] * self.pool_size[1]
        self.fc = nn.Sequential(
            nn.Linear(input_dim, self.output_dim),
            nn.ReLU(),
            nn.Linear(self.output_dim, self.output_dim)
        )

    def forward(self, feats, lines_box):
        """Return a list (one per image) of (n_lines, output_dim) features."""
        rois = convert_to_roi_format(lines_box)
        lines_feat = roi_align(
            input=feats,
            boxes=rois,
            output_size=self.pool_size,
            spatial_scale=self.scale,
            sampling_ratio=2
        )

        lines_feat = lines_feat.reshape(lines_feat.shape[0], -1)
        lines_feat = self.fc(lines_feat)
        # Split the flat ROI batch back into per-image chunks.
        lines_feat = torch.split(lines_feat, [item.shape[0] for item in lines_box])
        return list(lines_feat)
47
+
48
+
49
class RoiPosFeatExtraxtor(nn.Module):
    """ROI-align line feature extractor with an additive box-position
    embedding (LayerNorm'd linear projection of the normalized box)."""

    def __init__(self, scale, pool_size, input_dim, output_dim):
        super().__init__()
        # Feature-map / image coordinate ratio passed to roi_align.
        self.scale = scale
        # (h, w) of the pooled window.
        self.pool_size = pool_size
        self.output_dim = output_dim
        input_dim = input_dim * self.pool_size[0] * self.pool_size[1]
        self.fc = nn.Sequential(
            nn.Linear(input_dim, self.output_dim),
            nn.ReLU(),
            nn.Linear(self.output_dim, self.output_dim)
        )
        self.bbox_ln = nn.LayerNorm(self.output_dim)
        self.bbox_tranform = nn.Linear(4, self.output_dim)

        self.add_ln = nn.LayerNorm(self.output_dim)

    def forward(self, feats, lines_box, img_sizes):
        """Return a list (one per image) of (n_lines, output_dim) features
        with positional embeddings added.

        NOTE(review): the positional step below normalizes the caller's
        ``lines_box`` tensors IN PLACE — confirm no caller reuses the
        boxes afterwards.
        """
        rois = convert_to_roi_format(lines_box)
        lines_feat = roi_align(
            input=feats,
            boxes=rois,
            output_size=self.pool_size,
            spatial_scale=self.scale,
            sampling_ratio=2
        )
        lines_feat = lines_feat.reshape(lines_feat.shape[0], -1)
        lines_feat = self.fc(lines_feat)
        lines_feat = list(torch.split(lines_feat, [item.shape[0] for item in lines_box]))

        # Add Pos Embedding
        # Boxes are scaled into feature coords then normalized by the
        # feature-map size; ``img_size`` is unpacked but unused here.
        feats_H, feats_W = feats.shape[-2:]
        for idx, (line_box, img_size) in enumerate(zip(lines_box, img_sizes)):
            line_box[:, 0] = line_box[:, 0] * self.scale / feats_W
            line_box[:, 1] = line_box[:, 1] * self.scale / feats_H
            line_box[:, 2] = line_box[:, 2] * self.scale / feats_W
            line_box[:, 3] = line_box[:, 3] * self.scale / feats_H
            lines_feat[idx] = self.add_ln(lines_feat[idx] + self.bbox_ln(self.bbox_tranform(line_box)))

        return list(lines_feat)
libs/model/fpn.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import functional as F
4
+
5
+
6
+ class FPN(nn.Module):
7
+ def __init__(self, in_channels, out_channels):
8
+ super().__init__()
9
+ assert len(in_channels) == 4
10
+ self.in_channels = in_channels
11
+
12
+ self.lat_layers = nn.ModuleList()
13
+ self.out_layers = nn.ModuleList()
14
+ for in_channels_pl in in_channels:
15
+ self.lat_layers.append(
16
+ nn.Conv2d(in_channels_pl, out_channels, kernel_size=1, stride=1, padding=0)
17
+ )
18
+ self.out_layers.append(
19
+ nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, padding_mode='reflect')
20
+ )
21
+
22
+ def forward(self, feats):
23
+ c2, c3, c4, c5 = feats
24
+ p5 = self.lat_layers[3](c5)
25
+ p4 = F.interpolate(p5, size=c4.shape[2:], align_corners=False, mode='bilinear') + self.lat_layers[2](c4)
26
+ p3 = F.interpolate(p4, size=c3.shape[2:], align_corners=False, mode='bilinear') + self.lat_layers[1](c3)
27
+ p2 = F.interpolate(p3, size=c2.shape[2:], align_corners=False, mode='bilinear') + self.lat_layers[0](c2)
28
+
29
+ p2 = self.out_layers[0](p2)
30
+ p3 = self.out_layers[1](p3)
31
+ p4 = self.out_layers[2](p4)
32
+ p5 = self.out_layers[3](p5)
33
+ return p2, p3, p4, p5
34
+
35
+
36
def build_fpn(in_channels, out_channels):
    """Factory wrapper around :class:`FPN`."""
    return FPN(in_channels, out_channels)
libs/model/model.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from .backbone import build_backbone
4
+ from .fpn import build_fpn
5
+ from .pan import PAN
6
+ from .segment_predictor import SegmentPredictor
7
+ from .divide_predictor import HeadBodyDividePredictor
8
+ from .cells_extractor import CellsExtractor
9
+ from .decoder import Decoder
10
+ from .utils import extend_segments, spatial_att_to_spans
11
+
12
+
13
class Model(nn.Module):
    """End-to-end table structure recognition model.

    Pipeline: backbone + FPN features -> row/col separator prediction ->
    head/body divide prediction -> cell feature grid (PAN + extractor) ->
    autoregressive decoder producing the cell layout.
    """

    def __init__(self, cfg, norm_layer=nn.BatchNorm2d):
        super().__init__()
        self.backbone = build_backbone(cfg.arch, cfg.pretrained_backbone, norm_layer=norm_layer)
        self.fpn = build_fpn(cfg.backbone_out_channels, cfg.fpn_out_channels)
        self.pan = PAN(cfg.pan_num_levels, cfg.pan_in_dim, cfg.pan_out_dim)
        self.row_segment_predictor = SegmentPredictor(cfg.fpn_out_channels, scale=cfg.rs_scale, type='row')
        self.col_segment_predictor = SegmentPredictor(cfg.fpn_out_channels, scale=cfg.cs_scale, type='col')
        self.divide_predictor = HeadBodyDividePredictor(cfg.fpn_out_channels, cfg.dp_head_nums, scale=cfg.dp_scale)
        self.cells_extractor = CellsExtractor(cfg.fpn_out_channels, cfg.ce_dim, cfg.ce_heads, cfg.ce_head_nums, cfg.ce_pool_size, cfg.ce_scale)
        self.decoder = Decoder(cfg.vocab, cfg.embed_dim, cfg.feat_dim, cfg.lm_state_dim, cfg.proj_dim, cfg.cover_kernel, cfg.att_threshold, cfg.spatial_att_weight_loss_wight)

    def forward(self, images, images_size, cls_labels=None, labels_mask=None, layouts=None, rows_fg_spans=None,
            rows_bg_spans=None, cols_fg_spans=None, cols_bg_spans=None, cells_spans=None, divide_labels=None):
        """Run the full pipeline.

        Returns at train time ((row_segments, col_segments, divide_preds),
        result_info) and at eval time the same tuple extended with the
        decoded cell spans.
        """
        feats = self.fpn(self.backbone(images))

        # Per-row feature vectors for the head/body divide predictor.
        row_feats = torch.mean(feats[0], dim=3)

        result_info = dict()
        ext_info = dict()
        row_segments, rs_result_info, rs_ext_info = self.row_segment_predictor(feats[0], images_size, rows_fg_spans, rows_bg_spans)
        rs_result_info = {'row_%s' % key: val for key, val in rs_result_info.items()}
        rs_ext_info = {'row_%s' % key: val for key, val in rs_ext_info.items()}
        result_info.update(rs_result_info)
        ext_info.update(rs_ext_info)
        col_segments, cs_result_info, cs_ext_info = self.col_segment_predictor(feats[0], images_size, cols_fg_spans, cols_bg_spans)
        cs_result_info = {'col_%s' % key: val for key, val in cs_result_info.items()}
        cs_ext_info = {'col_%s' % key: val for key, val in cs_ext_info.items()}
        result_info.update(cs_result_info)
        ext_info.update(cs_ext_info)

        if self.training:
            # Merge predicted extra separators into the GT grid so the
            # decoder trains on the extended layout.
            row_segments, col_segments, cells_spans, layouts, divide_labels = extend_segments(row_segments, rs_ext_info['row_ext_segments'],
                col_segments, cs_ext_info['col_ext_segments'], cells_spans, layouts, divide_labels)

        divide_preds, dp_result_info, dp_ext_info = self.divide_predictor(row_feats, row_segments, divide_labels=divide_labels)
        result_info.update(dp_result_info)
        ext_info.update(dp_ext_info)

        feat_maps, feats_masks = self.cells_extractor(self.pan(feats), row_segments, col_segments, images_size)
        if self.training:
            # Fix: the assert message was a `print(...)` call, which runs
            # print and passes None as the message; use a plain string.
            assert feat_maps.shape[-2:] == layouts.shape[-2:], 'feat_maps is not the same with layouts'

        de_preds, de_result_info = self.decoder(feat_maps, feats_masks.unsqueeze(1), cls_labels, labels_mask, layouts)
        result_info.update(de_result_info)

        if not self.training:
            # Same fix as above: plain string assertion message.
            assert de_preds.shape[0] == 1, 'batch size should be 1'
            de_recog_spans = spatial_att_to_spans(de_preds[0])
            return (row_segments, col_segments, divide_preds, de_recog_spans), result_info
        else:
            return (row_segments, col_segments, divide_preds), result_info
libs/model/pan.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import functional as F
4
+
5
+
6
class PAN(nn.Module):
    """Top-down aggregation over FPN levels (PAN-style).

    Successively resizes-and-merges p2 into p3, p3 into p4, and p4 into
    p5, returning only the final merged (smallest) map.

    NOTE(review): forward unpacks exactly four levels and indexes
    pan_layers[0..2], so in practice ``num_levels`` must be 4 even though
    it is configurable — confirm with callers.
    """

    def __init__(self, num_levels, in_channels, out_channels):
        super().__init__()
        self.num_levels = num_levels
        self.in_channels = in_channels
        # Stored but not consumed by any layer in this module.
        self.out_channels = out_channels
        self.pan_layers = nn.ModuleList()
        for _ in range(num_levels - 1):
            self.pan_layers.append(
                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, padding_mode='reflect')
            )

    def forward(self, feats):
        p2, p3, p4, p5 = feats
        p2_ = p2
        # Resize the finer map to the next level's size, add, then smooth.
        p3_ = self.pan_layers[0](F.interpolate(p2_, size=p3.shape[2:], align_corners=False, mode='bilinear') + p3)
        p4_ = self.pan_layers[1](F.interpolate(p3_, size=p4.shape[2:], align_corners=False, mode='bilinear') + p4)
        p5_ = self.pan_layers[2](F.interpolate(p4_, size=p5.shape[2:], align_corners=False, mode='bilinear') + p5)
        return p5_
libs/model/sa.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+
5
+
6
+ class SALayer(nn.Module):
7
+ def __init__(self, in_dim, att_dim, head_nums):
8
+ super().__init__()
9
+ self.in_dim = in_dim
10
+ self.att_dim = att_dim
11
+ self.head_nums = head_nums
12
+
13
+ assert self.in_dim % self.head_nums == 0
14
+
15
+ self.key_layer = nn.Conv1d(self.in_dim, self.att_dim * self.head_nums, 1, 1, 0)
16
+ self.query_layer = nn.Conv1d(self.in_dim, self.att_dim * self.head_nums, 1, 1, 0)
17
+ self.value_layer = nn.Conv1d(self.in_dim, self.in_dim, 1, 1, 0)
18
+ self.scale = 1 / math.sqrt(self.att_dim)
19
+
20
+ def forward(self, feats, masks=None):
21
+ bs, c, n = feats.shape
22
+ keys = self.key_layer(feats).reshape(bs, -1, self.head_nums, n)
23
+ querys = self.query_layer(feats).reshape(bs, -1, self.head_nums, n)
24
+ values = self.value_layer(feats).reshape(bs, -1, self.head_nums, n)
25
+
26
+ logits = torch.einsum('bchk,bchq->bhkq', keys, querys) * self.scale
27
+ if masks is not None:
28
+ logits = logits - (1 - masks[:, None, :, None]) * 1e8
29
+ weights = torch.softmax(logits, dim=2)
30
+
31
+ new_feats = torch.einsum('bchk,bhkq->bchq', values, weights)
32
+ new_feats = new_feats.reshape(bs, -1, n)
33
+ return new_feats + feats
34
+
35
+
libs/model/segment_predictor.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import functional as F
4
+ from torch.nn.modules.activation import ReLU
5
+ from libs.utils.metric import cal_segment_pr
6
+ from .utils import draw_spans, save_logitmap
7
+
8
def cal_segments(cls_probs, spans, scale=1.0):
    """Pick one separator position per span.

    For each (start, end) span (image coordinates), find the index of the
    highest probability inside the span on the ``cls_probs`` axis (scaled
    by ``scale``) and map it back to image coordinates.
    """
    peaks = []
    for span in spans:
        lo = int(span[0] * scale)
        hi = int(span[1] * scale)
        peak = lo + torch.argmax(cls_probs[lo:hi]).item()
        peaks.append(int(peak / scale))
    return peaks
16
+
17
+
18
def cal_spans(cls_probs, threshold=0.5):
    """Group consecutive positions whose probability exceeds ``threshold``
    into half-open [start, end) spans."""
    flags = (cls_probs > threshold).long().tolist()
    spans = []
    prev = 0
    for pos, flag in enumerate(flags):
        if flag == 1:
            if prev == 1:
                # Extend the running span.
                spans[-1][1] = pos + 1
            else:
                # Open a new span at this position.
                spans.append([pos, pos + 1])
        prev = flag
    return spans
29
+
30
def cls_logits_to_segments(segments_logit, masks, type, spans=None, scale=1, threshold=0.5):
    """Reduce a (B, 1, H, W) separator logit map to per-sample segments.

    For 'col' the sigmoid map is averaged over rows, for 'row' over
    columns; the 1-D probability profile is then peak-picked inside
    candidate spans (supplied ``spans``, or spans thresholded from the
    profile when ``spans`` is None).

    NOTE(review): spans produced by cal_spans are already on the profile
    axis, yet cal_segments rescales them by ``scale`` — verify behavior
    for scale != 1.

    Returns:
        (segments, cls_probs, lengths): per-sample peak positions, the
        probability profiles, and the valid profile length per sample.
    """
    if type == 'col':
        cls_probs = segments_logit.squeeze(1).sigmoid().mean(dim=1)
        lengths = [int(mask[0, :].sum().item()) for mask in masks]
    else:
        cls_probs = segments_logit.squeeze(1).sigmoid().mean(dim=2)
        lengths = [int(mask[:, 0].sum().item()) for mask in masks]

    batch_size = cls_probs.shape[0]
    segments = list()
    for batch_idx in range(batch_size):
        length = lengths[batch_idx]
        if spans is None:
            spans_pi = cal_spans(cls_probs[batch_idx, :length], threshold)
            # Fallback: too few detected spans -> use the two borders.
            if len(spans_pi) <= 2:
                spans_pi = [[0, 1], [length-1, length]]
        else:
            spans_pi = spans[batch_idx]
        segments_pi = cal_segments(cls_probs[batch_idx, :length], spans_pi, scale)
        segments.append(segments_pi)
    return segments, cls_probs, lengths
51
+
52
+
53
def cal_ext_segments(cls_probs, lengths, bg_spans, scale=1, threshold=0.5):
    """Find extra separator lines inside background spans.

    Within each background span (regions without annotated lines), take
    the position with the maximum predicted probability and keep it only
    if that probability exceeds ``threshold``.
    """
    batch_size = cls_probs.shape[0]
    ext_segments = list()
    for batch_idx in range(batch_size):
        length = lengths[batch_idx]
        ext_segments_pi = cal_segments(cls_probs[batch_idx, :length], bg_spans[batch_idx], scale)
        # Keep only confident peaks.
        ext_segments_pi = [segment for segment in ext_segments_pi if cls_probs[batch_idx, segment] > threshold]
        ext_segments.append(ext_segments_pi)
    return ext_segments
65
+
66
+
67
def gen_masks(sizes, scale, device):
    """Build a (B, H*, W*) float validity mask at feature scale.

    Args:
        sizes: per-sample (h, w) sizes in image coordinates.
        scale: image -> feature-map coordinate ratio.
        device: target device.

    Returns:
        Float mask with 1 inside each sample's scaled valid region.
    """
    batch_size = len(sizes)
    max_size = [int(max(item) * scale) for item in zip(*sizes)]
    masks = torch.zeros([batch_size, *max_size], dtype=torch.float, device=device)
    for batch_idx in range(batch_size):
        # Fix: the valid extent must be scaled like the mask dimensions;
        # previously the unscaled sizes were used, which (via slice
        # clamping) over-marked the whole mask whenever scale < 1.
        h = int(sizes[batch_idx][0] * scale)
        w = int(sizes[batch_idx][1] * scale)
        masks[batch_idx, :h, :w] = 1.
    return masks
74
+
75
+
76
def gen_targets(sizes, scale, device, fg_spans, bg_spans, type):
    """Rasterize foreground spans into a (B, H*, W*) binary target map at
    feature scale.

    For 'col' each span marks full columns, otherwise full rows.
    ``bg_spans`` is accepted for signature symmetry but not used here.
    """
    max_size = [int(max(dims) * scale) for dims in zip(*sizes)]
    targets = torch.zeros([len(sizes), *max_size], dtype=torch.float, device=device)
    for sample_idx, sample_spans in enumerate(fg_spans):
        for span in sample_spans:
            lo = int(span[0] * scale)
            hi = int(span[1] * scale)
            if type == 'col':
                targets[sample_idx, :, lo:hi] = 1.
            else:
                targets[sample_idx, lo:hi, :] = 1.
    return targets
88
+
89
+
90
class SegmentPredictor(nn.Module):
    """Predicts row or column separator lines from a feature map.

    Emits a per-pixel separator logit map; at train time it is supervised
    by rasterized foreground spans, and extra high-confidence lines found
    inside background spans are reported for grid extension.
    """

    def __init__(self, in_dim, scale=1, threshold=0.5, type=None):
        super().__init__()
        # Image -> feature-map coordinate ratio.
        self.scale = scale
        self.in_dim = in_dim
        assert type in ['col', 'row']
        self.type = type
        self.threshold = threshold
        self.convs = nn.Sequential(
            nn.Conv2d(in_dim, in_dim // 2, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.ReLU(),
            nn.Conv2d(in_dim // 2, 1, kernel_size=(1,1), stride=(1,1), padding=(0,0))
        )

    def forward(self, feats, images_size, fg_spans=None, bg_spans=None):
        """Return (pred_segments, result_info, ext_info).

        At train time ``pred_segments`` are peak-picked inside the GT
        fg_spans; losses/metrics go into result_info and predicted extra
        segments into ext_info['ext_segments'].
        """
        batch_size = feats.shape[0]
        # Reverse each size pair (presumably (w, h) -> (h, w)) — confirm
        # the upstream ordering.
        images_size = [image_size[::-1] for image_size in images_size]
        segments_logit = self.convs(feats)
        masks = gen_masks(images_size, self.scale, feats.device)
        # save_logitmap('row_segment.png', segments_logit[0][0])
        result_info = dict()
        ext_info = dict()

        if self.training:
            targets = gen_targets(images_size, self.scale, feats.device, fg_spans, bg_spans, self.type)
            segments_loss = F.binary_cross_entropy_with_logits(
                segments_logit,
                targets.unsqueeze(1),
                reduction='none'
            )
            # Masked sum normalized by the number of positive target pixels.
            segments_loss = (segments_loss * masks[:, None, :, :]).sum() / targets.sum()
            result_info['segments_loss'] = segments_loss

            # Free-running prediction for precision/recall metrics.
            pred_segments, cls_probs, lengths = cls_logits_to_segments(segments_logit, masks, self.type, spans=None, scale=self.scale, threshold=self.threshold)
            correct_nums, segment_nums, span_nums = cal_segment_pr(pred_segments, fg_spans, bg_spans)
            if segment_nums != 0:
                result_info['precision'] = correct_nums/segment_nums
            if span_nums != 0:
                result_info['recall'] = correct_nums/span_nums
            # Confident extra lines inside background spans (grid extension).
            ext_segments = cal_ext_segments(cls_probs, lengths, bg_spans, self.scale, self.threshold)
            ext_info['ext_segments'] = ext_segments

        # Returned segments are peak-picked inside fg_spans when provided.
        pred_segments, *_ = cls_logits_to_segments(segments_logit, masks, self.type, spans=fg_spans, scale=self.scale, threshold=self.threshold)
        return pred_segments, result_info, ext_info
libs/model/utils.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import copy
3
+ import torch
4
+ import numpy as np
5
+ from torch.nn import functional as F
6
+
7
def proposal_colspan(layout, layout_score, srow, scol):
    """Rectangle hypothesis grown column-first from the seed (srow, scol).

    If the active (==1) region of ``layout`` is already a filled
    rectangle, return it unchanged with its mean score. Otherwise extend
    right along row ``srow`` while cells are active, then extend down
    while whole rows of that width stay active; ``layout`` is overwritten
    in place with the resulting rectangle.
    """
    y, x = torch.where(layout == 1)
    if torch.all(layout[y.min():y.max() + 1, x.min():x.max()+1] == 1):
        return layout, layout_score[y.min():y.max() + 1, x.min():x.max()+1].mean()
    else:
        lf_row = srow
        lf_col = scol

        # Consecutive active cells to the right of the seed.
        col_count = 0
        for col_ in range(lf_col, x.max() + 1):
            if layout[lf_row, col_] == 1:
                col_count = col_count + 1
            else:
                break
        # Rows (from the seed down) fully active over that width.
        row_count = 0
        for row_ in range(lf_row, y.max() + 1):
            if torch.all(layout[row_, lf_col: lf_col + col_count] == 1):
                row_count = row_count + 1
            else:
                break

        # Overwrite with the rectangle hypothesis.
        layout[:, :] = 0
        layout[lf_row:lf_row + row_count, lf_col : lf_col + col_count] = 1
        return layout, layout_score[lf_row:lf_row + row_count, lf_col : lf_col + col_count].mean()
32
+
33
def proposal_rowspan(layout, layout_score, srow, scol):
    """Rectangle hypothesis grown row-first from the seed (srow, scol).

    Mirror of :func:`proposal_colspan`: extend down along column ``scol``
    while cells are active, then extend right while whole columns of that
    height stay active; ``layout`` is overwritten in place.
    """
    y, x = torch.where(layout == 1)
    if torch.all(layout[y.min():y.max() + 1, x.min():x.max()+1] == 1):
        return layout, layout_score[y.min():y.max() + 1, x.min():x.max()+1].mean()
    else:
        lf_row = srow
        lf_col = scol

        # Consecutive active cells below the seed.
        row_count = 0
        for row_ in range(lf_row, y.max() + 1):
            if layout[row_, lf_col] == 1:
                row_count = row_count + 1
            else:
                break
        # Columns (from the seed right) fully active over that height.
        col_count = 0
        for col_ in range(lf_col, x.max() + 1):
            if torch.all(layout[lf_row : lf_row + row_count, col_] == 1):
                col_count = col_count + 1
            else:
                break

        # Overwrite with the rectangle hypothesis.
        layout[:, :] = 0
        layout[lf_row:lf_row + row_count, lf_col : lf_col + col_count] = 1
        return layout, layout_score[lf_row:lf_row + row_count, lf_col : lf_col + col_count].mean()
59
+
60
def proposal_maxcontain(layout, layout_score, srow, scol):
    """Rectangle hypothesis spanning the whole active extent.

    If the active (==1) region is already a filled rectangle it is
    returned unchanged with the mean score over that rectangle. Otherwise
    ``layout`` is overwritten in place with the rectangle from
    (srow, scol) to the bottom-right extent of the active region.
    """
    ys, xs = torch.where(layout == 1)
    top, bottom = ys.min(), ys.max() + 1
    left, right = xs.min(), xs.max() + 1
    if torch.all(layout[top:bottom, left:right] == 1):
        return layout, layout_score[top:bottom, left:right].mean()
    layout[:, :] = 0
    layout[srow:bottom, scol:right] = 1
    return layout, layout_score[srow:bottom, scol:right].mean()
73
+
74
def proposal_maxrowspan(layout, layout_score, srow, scol):
    """Rectangle hypothesis using the full active column extent.

    If the active region is already a filled rectangle, return it
    unchanged. Otherwise count how many consecutive rows (from the seed
    row down) are identical to the seed row, and overwrite ``layout`` in
    place with a rectangle of those rows spanning out to the right edge
    of the active region.
    """
    y, x = torch.where(layout == 1)
    if torch.all(layout[y.min():y.max() + 1, x.min():x.max()+1] == 1):
        return layout, layout_score[y.min():y.max() + 1, x.min():x.max()+1].mean()
    else:
        lf_row = srow
        lf_col = scol

        # Rows below the seed with an identical activation pattern.
        row_count = 1
        for row_ in range(lf_row + 1, y.max() + 1):
            if torch.all(layout[lf_row] == layout[row_]):
                row_count = row_count + 1
            else:
                break

        # Overwrite with the rectangle hypothesis.
        layout[:, :] = 0
        layout[lf_row : lf_row + row_count, lf_col : x.max() + 1] = 1
        return layout, layout_score[lf_row : lf_row + row_count, lf_col : x.max() + 1].mean()
94
+
95
def proposal_maxcolspan(layout, layout_score, srow, scol):
    """Rectangle hypothesis using the full active row extent.

    Mirror of :func:`proposal_maxrowspan`: count consecutive columns
    (from the seed column right) identical to the seed column, then
    overwrite ``layout`` in place with a rectangle of those columns
    spanning down to the bottom of the active region.
    """
    y, x = torch.where(layout == 1)
    if torch.all(layout[y.min():y.max() + 1, x.min():x.max()+1] == 1):
        return layout, layout_score[y.min():y.max() + 1, x.min():x.max()+1].mean()
    else:
        lf_row = srow
        lf_col = scol

        # Columns right of the seed with an identical activation pattern.
        col_count = 1
        for col_ in range(lf_col + 1, x.max() + 1):
            if torch.all(layout[:, lf_col] == layout[:, col_]):
                col_count = col_count + 1
            else:
                break

        # Overwrite with the rectangle hypothesis.
        layout[:, :] = 0
        layout[lf_row : y.max() + 1, lf_col : lf_col + col_count] = 1
        return layout, layout_score[lf_row : y.max() + 1, lf_col : lf_col + col_count].mean()
115
+
116
def gen_proposals(layout_score, srow, scol, score_threshold=0.5):
    """Generate rectangular cell proposals from an attention score map.

    Thresholds ``layout_score`` into a binary layout (forcing the seed
    position (srow, scol) on). If the active region is already a filled
    rectangle, return it as the single proposal with its log mean score.
    Otherwise return five competing rectangle hypotheses (col-span,
    row-span, max-contain, max-row-span, max-col-span) with log scores.
    """
    layout = layout_score > score_threshold
    # The seed cell is always part of the proposal.
    layout[srow, scol] = 1

    y, x = torch.where(layout == 1)
    if torch.all(layout[y.min():y.max() + 1, x.min():x.max()+1] == 1):
        return layout.unsqueeze(0), layout_score[y.min():y.max() + 1, x.min():x.max()+1].mean().unsqueeze(0).log()
    else:
        # Each proposal_* mutates its own deep copy of the layout.
        proposal_1, score_1 = proposal_colspan(copy.deepcopy(layout), layout_score, srow, scol)
        proposal_2, score_2 = proposal_rowspan(copy.deepcopy(layout), layout_score, srow, scol)
        proposal_3, score_3 = proposal_maxcontain(copy.deepcopy(layout), layout_score, srow, scol)
        proposal_4, score_4 = proposal_maxrowspan(copy.deepcopy(layout), layout_score, srow, scol)
        proposal_5, score_5 = proposal_maxcolspan(copy.deepcopy(layout), layout_score, srow, scol)
        proposals = torch.stack([proposal_1, proposal_2, proposal_3, proposal_4, proposal_5], dim=0)
        scores = torch.stack([score_1.log(), score_2.log(), score_3.log(), score_4.log(), score_5.log()], dim=0)
        return proposals, scores
132
+
133
def extend_segments(row_segments, rows_es, col_segments, cols_es, cells_spans, layouts, divide_labels):
    """Merge extra (predicted) separator lines into the ground-truth grid.

    Inserts ``rows_es``/``cols_es`` into each sample's row/col segments,
    re-sorts them, and remaps cell spans, layout grids, and the head/body
    divide label onto the extended grid indices.

    Returns:
        (ext_row_segments, ext_col_segments, ext_cells_spans,
        aligned layouts tensor, divide labels tensor)
    """
    batch_size = len(row_segments)
    ext_row_segments = list()
    ext_col_segments = list()
    ext_cells_spans = list()
    ext_layouts = list()
    ext_divide_labels = list()
    for batch_idx in range(batch_size):
        row_segments_pi = row_segments[batch_idx]
        col_segments_pi = col_segments[batch_idx]
        rows_es_pi = rows_es[batch_idx]
        cols_es_pi = cols_es[batch_idx]
        cells_spans_pi = cells_spans[batch_idx]

        # Original segments first, extras appended after.
        ext_row_segments_pi = row_segments_pi + rows_es_pi
        ext_col_segments_pi = col_segments_pi + cols_es_pi

        # Argsort: final position of each original/extra segment.
        row_segments_idx = sorted(list(range(len(ext_row_segments_pi))), key=lambda idx: ext_row_segments_pi[idx])
        col_segments_idx = sorted(list(range(len(ext_col_segments_pi))), key=lambda idx: ext_col_segments_pi[idx])

        # Remap the divide row index onto the extended ordering.
        ext_divide_labels.append(row_segments_idx.index(divide_labels[batch_idx].item()))

        ext_row_segments.append([ext_row_segments_pi[idx] for idx in row_segments_idx])
        ext_col_segments.append([ext_col_segments_pi[idx] for idx in col_segments_idx])

        # Rebuild the layout grid on the extended (finer) grid; -1 = empty.
        ext_layouts_pi = np.full((len(ext_row_segments_pi) - 1, len(ext_col_segments_pi) - 1), -1)
        ext_cells_spans_pi = list()
        for cell_idx, cell_span in enumerate(cells_spans_pi):
            l, t, r, b = cell_span
            # Inclusive span ends map via the separator AFTER the edge.
            l = col_segments_idx.index(l)
            r = col_segments_idx.index(r+1) - 1
            t = row_segments_idx.index(t)
            b = row_segments_idx.index(b+1) - 1
            ext_cells_spans_pi.append([l, t, r, b])
            ext_layouts_pi[t:b+1, l:r+1] = cell_idx
        ext_cells_spans.append(ext_cells_spans_pi)
        ext_layouts.append(ext_layouts_pi)

    return ext_row_segments, ext_col_segments, ext_cells_spans, aligned_layouts(ext_layouts, layouts), torch.tensor(ext_divide_labels).to(divide_labels.device)
172
+
173
def aligned_layouts(layouts_list, layouts):
    """Pad a list of 2-D numpy layout arrays to a common (rows, cols) with
    -1 (empty) and stack them into one tensor matching ``layouts``'s dtype
    and device."""
    dtype, device = layouts.dtype, layouts.device
    rows_max = max(arr.shape[0] for arr in layouts_list)
    cols_max = max(arr.shape[1] for arr in layouts_list)

    padded = []
    for arr in layouts_list:
        grid = torch.from_numpy(arr).to(dtype=dtype, device=device)
        # Pad on the right/bottom only.
        padded.append(
            F.pad(
                grid,
                (0, cols_max - arr.shape[1], 0, rows_max - arr.shape[0]),
                mode='constant',
                value=-1
            )
        )
    return torch.stack(padded, dim=0)
195
+
196
def parse_layout(spans, num_rows, num_cols):
    """Rasterize predicted cell spans into a (num_rows, num_cols) id grid.

    Args:
        spans: iterable of (x1, y1, x2, y2, prob) grid-coordinate spans,
            inclusive at both ends; later spans overwrite earlier ones.
        num_rows, num_cols: shape of the output grid.

    Returns:
        Integer ndarray whose entries are cell ids, re-numbered contiguously
        in raster-scan (row-major) order of first appearance.
    """
    # BUGFIX: `np.int` was removed in NumPy 1.24; the builtin `int` is the
    # supported equivalent dtype alias.
    layout = np.full([num_rows, num_cols], -1, dtype=int)
    for cell_count, (x1, y1, x2, y2, prob) in enumerate(spans):
        layout[y1:y2 + 1, x1:x2 + 1] = cell_count

    # Re-number ids so they are contiguous and ordered by first appearance.
    # NOTE(review): positions not covered by any span keep id -1 and are also
    # assigned a fresh id here — presumably intended; confirm with callers.
    cells_id = list()
    for row_idx in range(num_rows):
        for col_idx in range(num_cols):
            cell_id = layout[row_idx, col_idx]
            if cell_id in cells_id:
                layout[row_idx, col_idx] = cells_id.index(cell_id)
            else:
                layout[row_idx, col_idx] = len(cells_id)
                cells_id.append(cell_id)
    return layout
213
+
214
+
215
def parse_cells(layout, row_segments, col_segments):
    """Convert a cell-id grid plus row/col boundary coordinates into polygons.

    Each cell's grid extent is mapped through the separator coordinates; the
    result is one dict per cell with a rectangular 'segmentation' contour.
    """
    cells = list()
    for cell_id in range(np.max(layout) + 1):
        rows, cols = np.nonzero(layout == cell_id)
        top, bottom = np.min(rows), np.max(rows)
        left, right = np.min(cols), np.max(cols)
        # Sanity check: the occupied region must form a solid rectangle.
        assert np.all(layout[top:bottom, left:right] == cell_id)
        px1, px2 = col_segments[left], col_segments[right + 1]
        py1, py2 = row_segments[top], row_segments[bottom + 1]
        cells.append(dict(
            segmentation=[[[px1, py1], [px2, py1], [px2, py2], [px1, py2]]]
        ))
    return cells
234
+
235
+
236
def process_layout(score, index):
    """Greedy rectangularization of a cell-index map (legacy 2-arg version).

    NOTE(review): this definition is immediately shadowed by the later
    `process_layout(score, index, use_score=..., ...)` below, so this body is
    dead code; kept only to avoid disturbing the module's history.

    Scans positions in raster order; each unassigned position seeds a cell id
    that is grown into the largest top-left-anchored rectangle of equal
    `index` values.
    """
    layout = torch.full_like(index, -1)
    layout_mask = torch.full_like(index, -1)
    nrow, ncol = score.shape
    for cell_id in range(nrow * ncol):
        # Stop once every position has been assigned (mask fully 1).
        if layout_mask.min() != -1:
            break
        # Top-left-most unassigned position.
        crow, ccol = torch.where(layout_mask == layout_mask.min())
        ccol = ccol[crow == crow.min()].min()
        crow = crow.min()
        id = index[crow, ccol]
        h, w = torch.where(index == id)
        if h.shape[0] == 1 or w.shape[0] == 1:
            # Degenerate (single row/col) region: take it whole.
            layout_mask[h, w] = 1
            layout[h, w] = cell_id
            continue
        else:
            h_min = h.min()
            h_max = h.max()
            w_min = w.min()
            w_max = w.max()
            if torch.all(index[h_min:h_max+1, w_min:w_max+1] == id):
                # The region is already a solid rectangle.
                layout_mask[h_min:h_max+1, w_min:w_max+1] = 1
                layout[h_min:h_max+1, w_min:w_max+1] = cell_id
            else:
                # Grow greedily: first along the row, then downwards while
                # whole rows keep matching this id.
                lf_row = crow
                lf_col = ccol

                col_mem = -1
                for col_ in range(lf_col, w_max + 1):
                    if index[lf_row, col_] == id:
                        layout_mask[lf_row, col_] = 1
                        layout[lf_row, col_] = cell_id
                        col_mem = col_
                    else:
                        break
                for row_ in range(lf_row + 1, h_max + 1):
                    if torch.all(index[row_, lf_col: col_mem + 1] == id):
                        layout_mask[row_, lf_col: col_mem + 1] = 1
                        layout[row_, lf_col: col_mem + 1] = cell_id
                    else:
                        break
    return layout
279
+
280
def process_layout(score, index, use_score=False, is_merge=True, score_threshold=0.5):
    """Greedy rectangularization of a cell-index map.

    When `use_score` is set, positions whose confidence is below
    `score_threshold` are first re-labelled: all merged into one new cell id
    (`is_merge=True`) or each given its own fresh id (`is_merge=False`).
    Then positions are scanned in raster order; each unassigned position
    seeds a cell id grown into the largest top-left-anchored rectangle of
    equal `index` values.
    """
    if use_score:
        if is_merge:
            y, x = torch.where(score < score_threshold)
            index[y, x] = index.max() + 1
        else:
            y, x = torch.where(score < score_threshold)
            index[y, x] = torch.arange(index.max() + 1, index.max() + 1 + len(y)).to(index.device, index.dtype)

    layout = torch.full_like(index, -1)
    layout_mask = torch.full_like(index, -1)
    nrow, ncol = score.shape
    # Upper bound on iterations: fresh ids may exceed nrow*ncol when
    # is_merge=False added one id per low-score position.
    for cell_id in range(max(nrow * ncol, index.max() + 1)):
        if layout_mask.min() != -1:
            break
        # Top-left-most unassigned position.
        crow, ccol = torch.where(layout_mask == layout_mask.min())
        ccol = ccol[crow == crow.min()].min()
        crow = crow.min()
        id = index[crow, ccol]
        h, w = torch.where(index == id)
        if h.shape[0] == 1 or w.shape[0] == 1:
            # Degenerate (single row/col) region: take it whole.
            layout_mask[h, w] = 1
            layout[h, w] = cell_id
            continue
        else:
            h_min = h.min()
            h_max = h.max()
            w_min = w.min()
            w_max = w.max()
            if torch.all(index[h_min:h_max+1, w_min:w_max+1] == id):
                # The region is already a solid rectangle.
                layout_mask[h_min:h_max+1, w_min:w_max+1] = 1
                layout[h_min:h_max+1, w_min:w_max+1] = cell_id
            else:
                # Grow greedily: first along the seed row, then downwards
                # while whole rows keep matching this id.
                lf_row = crow
                lf_col = ccol

                col_mem = -1
                for col_ in range(lf_col, w_max + 1):
                    if index[lf_row, col_] == id:
                        layout_mask[lf_row, col_] = 1
                        layout[lf_row, col_] = cell_id
                        col_mem = col_
                    else:
                        break
                for row_ in range(lf_row + 1, h_max + 1):
                    if torch.all(index[row_, lf_col: col_mem + 1] == id):
                        layout_mask[row_, lf_col: col_mem + 1] = 1
                        layout[row_, lf_col: col_mem + 1] = cell_id
                    else:
                        break
    return layout
331
+
332
def layout2spans(layout):
    """Recover inclusive [x1, y1, x2, y2] grid spans from a cell-id layout.

    Ids absent from the grid are skipped; the span list is wrapped in an
    outer list (single-table batch convention).
    """
    num_rows, num_cols = layout.shape[-2:]
    spans = list()
    for cell_id in range(num_rows * num_cols):
        positions = np.argwhere(layout == cell_id)
        if positions.size == 0:
            continue
        ys, xs = positions[:, 0], positions[:, 1]
        y1, y2 = np.min(ys), np.max(ys)
        x1, x2 = np.min(xs), np.max(xs)
        # Occupied region must be a solid rectangle.
        assert np.all(layout[y1:y2, x1:x2] == cell_id)
        spans.append([x1, y1, x2, y2])
    return [spans]
346
+
347
def spatial_att_to_spans(spatial_att_weight_pred):
    """Decode a (num_cells, H, W) spatial attention map into cell grid spans."""
    # For each grid position keep the cell with the highest attention weight.
    max_score, max_index = spatial_att_weight_pred.max(dim=0)
    # First pass: low-score positions become fresh singleton cells
    # (is_merge=False), detaching them from uncertain assignments.
    layout = process_layout(max_score, max_index, use_score=True, is_merge=False)
    # Second pass without thresholding regularizes the map into rectangles.
    layout = process_layout(max_score, layout)

    layout = layout.cpu().numpy()
    spans = layout2spans(layout)
    return spans
355
+
356
+
357
def save_logitmap(filename, logit):
    """Write a logit tensor to disk as an 8-bit probability image (debugging aid)."""
    cv2.imwrite(filename, (logit.sigmoid()*255).cpu().numpy().astype('uint8'))
359
+
360
+
361
def draw_spans(dst, src, spans, type):
    """Read image `src`, draw row/column span rectangles, write result to `dst`.

    `type` selects orientation ('row' or 'col'); the parameter name shadows
    the builtin but is kept to preserve keyword-argument compatibility.
    Unknown `type` values silently draw nothing.
    """
    image = cv2.imread(src)
    H, W, *_ = image.shape
    for span in spans:
        if type == 'col':
            # Vertical band spanning the full image height.
            cv2.rectangle(image, (span[0], 0), (span[1], H), (0, 0, 255), thickness=1)
        elif type == 'row':
            # Horizontal band spanning the full image width.
            cv2.rectangle(image, (0, span[0]), (W, span[1]), (0, 0, 255), thickness=1)
    cv2.imwrite(dst, image)
370
+
371
+
libs/utils/__init__.py ADDED
File without changes
libs/utils/cal_f1.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ from numpy.core.fromnumeric import sort
3
+ import tqdm
4
+ import json
5
+ import copy
6
+ import Polygon
7
+ import numpy as np
8
+ from .scitsr.eval import json2Relations, eval_relations
9
+
10
+
11
def parse_layout(spans, num_rows, num_cols):
    """Rasterize cell spans into a (num_rows, num_cols) id grid.

    Args:
        spans: iterable of (x1, y1, x2, y2) grid-coordinate spans, inclusive
            at both ends; later spans overwrite earlier ones.
        num_rows, num_cols: shape of the output grid.

    Returns:
        Integer ndarray whose entries are cell ids, re-numbered contiguously
        in raster-scan (row-major) order of first appearance.
    """
    # BUGFIX: `np.int` was removed in NumPy 1.24; the builtin `int` is the
    # supported equivalent dtype alias.
    layout = np.full([num_rows, num_cols], -1, dtype=int)
    for cell_count, (x1, y1, x2, y2) in enumerate(spans):
        layout[y1:y2 + 1, x1:x2 + 1] = cell_count

    # Re-number ids so they are contiguous and ordered by first appearance.
    # NOTE(review): positions not covered by any span keep id -1 and are also
    # assigned a fresh id here — presumably intended; confirm with callers.
    cells_id = list()
    for row_idx in range(num_rows):
        for col_idx in range(num_cols):
            cell_id = layout[row_idx, col_idx]
            if cell_id in cells_id:
                layout[row_idx, col_idx] = cells_id.index(cell_id)
            else:
                layout[row_idx, col_idx] = len(cells_id)
                cells_id.append(cell_id)
    return layout
28
+
29
+
30
def parse_cells(layout, spans, row_segments, col_segments, lines):
    """Build cell polygons from the layout grid and attach OCR transcripts.

    NOTE(review): the `spans` parameter is unused in this body; kept for
    signature compatibility with callers.
    """
    cells = list()
    num_cells = np.max(layout) + 1
    for cell_id in range(num_cells):
        cell_positions = np.argwhere(layout == cell_id)
        y1 = np.min(cell_positions[:, 0])
        y2 = np.max(cell_positions[:, 0])
        x1 = np.min(cell_positions[:, 1])
        x2 = np.max(cell_positions[:, 1])
        # Sanity check: occupied region must be a solid rectangle.
        assert np.all(layout[y1:y2, x1:x2] == cell_id)
        # Map grid extents through separator pixel coordinates.
        x1 = col_segments[x1]
        x2 = col_segments[x2+1]
        y1 = row_segments[y1]
        y2 = row_segments[y2+1]
        cell = dict(
            segmentation=[[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]]
        )
        cells.append(cell)

    # Assign each OCR text line to its best-overlapping cell (in place).
    extend_cell_lines(cells, lines)

    return cells
52
+
53
+
54
def extend_cell_lines(cells, lines):
    """Assign OCR text lines to cells by maximum polygon overlap (in place).

    Each line goes to the cell covering the largest fraction of its area;
    a cell's transcript is the concatenation of its lines sorted by the
    line's top y-coordinate. Cells with no line get an empty transcript.
    """
    def segmentation_to_polygon(segmentation):
        # Union of all contours of one segmentation.
        polygon = Polygon.Polygon()
        for contour in segmentation:
            polygon = polygon + Polygon.Polygon(contour)
        return polygon

    # Defensive copy: line dicts are read but must not alias caller data.
    lines = copy.deepcopy(lines)

    cells_poly = [segmentation_to_polygon(item['segmentation']) for item in cells]
    lines_poly = [segmentation_to_polygon(item['segmentation']) for item in lines]

    cells_lines = [[] for _ in range(len(cells))]

    for line_idx, line_poly in enumerate(lines_poly):
        # Degenerate (zero-area) lines cannot be assigned.
        if line_poly.area() == 0:
            continue
        line_area = line_poly.area()
        max_overlap = 0
        max_overlap_idx = None
        for cell_idx, cell_poly in enumerate(cells_poly):
            # Fraction of the line's area covered by this cell.
            overlap = (cell_poly & line_poly).area() / line_area
            if overlap > max_overlap:
                max_overlap_idx = cell_idx
                max_overlap = overlap
        if max_overlap > 0:
            cells_lines[max_overlap_idx].append(line_idx)
    # Order each cell's lines top-to-bottom by the line bbox's y1.
    lines_y1 = [segmentation_to_bbox(item['segmentation'])[1] for item in lines]
    cells_lines = [sorted(item, key=lambda idx: lines_y1[idx]) for item in cells_lines]

    for cell, cell_lines in zip(cells, cells_lines):
        transcript = []
        for idx in cell_lines:
            transcript.extend(lines[idx]['transcript'])
        cell['transcript'] = transcript
90
+
91
def segmentation_to_bbox(segmentation):
    """Axis-aligned bounding box [x1, y1, x2, y2] over all contour points."""
    xs = [pt[0] for contour in segmentation for pt in contour]
    ys = [pt[1] for contour in segmentation for pt in contour]
    return [min(xs), min(ys), max(xs), max(ys)]
97
+
98
+
99
def cal_cell_spans(table):
    """Compute the inclusive grid span [x1, y1, x2, y2] of every cell."""
    layout = table['layout']
    spans = list()
    for cell_id in range(len(table['cells'])):
        positions = np.argwhere(layout == cell_id)
        y1, y2 = np.min(positions[:, 0]), np.max(positions[:, 0])
        x1, x2 = np.min(positions[:, 1]), np.max(positions[:, 1])
        # Occupied region must be a solid rectangle.
        assert np.all(layout[y1:y2, x1:x2] == cell_id)
        spans.append([x1, y1, x2, y2])
    return spans
112
+
113
+
114
def pred_result_to_table(table, pred_result):
    """Convert a model prediction into the project's table dict format.

    `table` supplies ground-truth OCR lines (cells that carry a 'bbox');
    `pred_result` is (row_segments, col_segments, divide, spans) where
    `divide` is the number of header rows.
    """
    # gt ocr result
    lines = [dict(segmentation=cell['segmentation'], transcript=cell['transcript']) for cell in table['cells'] if 'bbox' in cell.keys()]

    row_segments, col_segments, divide, spans = pred_result
    # N separators bound N-1 rows/cols.
    num_rows = len(row_segments) - 1
    num_cols = len(col_segments) - 1

    layout = parse_layout(spans, num_rows, num_cols)
    cells = parse_cells(layout, spans, row_segments, col_segments, lines)
    # Rows before the divide are header, the rest body.
    head_rows = list(range(0, divide))
    body_rows = list(range(divide, num_rows))

    table = dict(
        layout=layout,
        head_rows=head_rows,
        body_rows=body_rows,
        cells=cells
    )

    return table
135
+
136
+
137
def table_to_relations(table):
    """Flatten a table into SciTSR-style cell relations with split content."""
    spans = cal_cell_spans(table)
    contents = [''.join(cell['transcript']).split() for cell in table['cells']]
    cells = [
        dict(start_row=y1, end_row=y2, start_col=x1, end_col=x2, content=content)
        for (x1, y1, x2, y2), content in zip(spans, contents)
    ]
    return dict(cells=cells)
145
+
146
+
147
def cal_f1(label, pred):
    """SciTSR relation-level precision/recall/F1 for one (label, pred) pair.

    Returns [precision, recall, f1]; F1 is 0 when both P and R are 0.
    """
    label = json2Relations(label, splitted_content=True)
    pred = json2Relations(pred, splitted_content=True)
    precision, recall = eval_relations(gt=[label], res=[pred], cmp_blank=True)
    f1 = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return [precision, recall, f1]
153
+
154
+
155
def single_process(labels, preds):
    """Score every label against its prediction sequentially.

    Missing entries fall back to '' on either side; returns {key: [P, R, F1]}.
    """
    scores = dict()
    for key in tqdm.tqdm(labels.keys()):
        pred = preds.get(key, '')
        label = labels.get(key, '')
        score = cal_f1(label, pred)
        scores[key] = score
    return scores
163
+
164
+
165
def _worker(labels, preds, keys, result_queue):
    """Multiprocessing worker: score the assigned keys, push (key, score)."""
    for key in keys:
        label = labels.get(key, '')
        pred = preds.get(key, '')
        score = cal_f1(label, pred)
        result_queue.put((key, score))
171
+
172
+
173
def multi_process(labels, preds, num_workers):
    """Score label/pred pairs across `num_workers` daemon processes.

    Keys are round-robin-sharded to workers; results are drained from a
    managed queue (exactly len(keys) items) with a live running-mean bar.
    Workers are daemonic, so they are reaped when the parent exits.
    """
    import multiprocessing
    manager = multiprocessing.Manager()
    result_queue = manager.Queue()
    keys = list(labels.keys())
    workers = list()
    for worker_idx in range(num_workers):
        worker = multiprocessing.Process(
            target=_worker,
            args=(
                labels,
                preds,
                keys[worker_idx::num_workers],
                result_queue
            )
        )
        worker.daemon = True
        worker.start()
        workers.append(worker)

    scores = dict()
    tq = tqdm.tqdm(total=len(keys))
    for _ in range(len(keys)):
        key, val = result_queue.get()
        scores[key] = val
        # Running mean over all results so far, shown as percentages.
        P, R, F1 = (100 * np.array(list(scores.values()))).mean(0).tolist()
        tq.set_description('P: %.2f, R: %.2f, F1: %.2f' % (P, R, F1), False)
        tq.update()

    return scores
203
+
204
+
205
def evaluate_f1(labels, preds, num_workers=0):
    """Score prediction/label pairs with SciTSR P/R/F1.

    Samples are keyed by their position in the input sequences; the returned
    score list is restored to input order regardless of worker completion
    order. `num_workers == 0` runs sequentially.
    """
    preds = dict(enumerate(preds))
    labels = dict(enumerate(labels))
    if num_workers == 0:
        scores = single_process(labels, preds)
    else:
        scores = multi_process(labels, preds, num_workers)
    # Keys are the integer sample indices; sort positions by the key stored
    # at each position, then look scores up by key.
    ordering = sorted(range(len(scores)), key=lambda idx: list(scores.keys())[idx])
    return [scores[idx] for idx in ordering]
+ return scores
libs/utils/checkpoint.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from .comm import get_rank, synchronize
4
+
5
+
6
def save_checkpoint(checkpoint, model, optimizer=None, best_metric=None, epoch=None):
    """Save model (and optional optimizer/metric/epoch) state to `checkpoint`.

    Only rank 0 writes; all ranks synchronize afterwards so no process races
    ahead of the file being flushed. DDP wrappers are unwrapped first so the
    saved keys are prefix-free.
    """
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model = model.module
    if get_rank() == 0:
        if not os.path.exists(os.path.dirname(checkpoint)):
            os.makedirs(os.path.dirname(checkpoint))

        infos = dict()
        infos['model_param'] = model.state_dict()
        if optimizer is not None:
            infos['opt_param'] = optimizer.state_dict()

        if best_metric is not None:
            infos['best_metric'] = best_metric

        if epoch is not None:
            infos['epoch'] = epoch

        torch.save(infos, checkpoint)
    synchronize()
26
+
27
+
28
def load_checkpoint(checkpoint, model, optimizer=None):
    """Load model (and optionally optimizer) state from a checkpoint path.

    DDP wrappers are unwrapped first; model weights load with strict=False
    so missing/extra keys are tolerated. Returns (best_metric, epoch),
    either being None when absent from the checkpoint.
    """
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model = model.module
    state = torch.load(checkpoint, map_location='cpu')

    model.load_state_dict(state['model_param'], strict=False)

    if optimizer is not None and 'opt_param' in state:
        optimizer.load_state_dict(state['opt_param'])

    return state.get('best_metric'), state.get('epoch')
libs/utils/comm.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file contains primitives for multi-gpu communication.
3
+ This is useful when doing distributed training.
4
+ """
5
+ import os
6
+ import pickle
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+
11
+
12
def distributed():
    """True when launched with more than one process (WORLD_SIZE > 1)."""
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    return world_size > 1
16
+
17
+
18
def get_world_size():
    """Number of distributed processes; 1 outside a distributed run."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1
24
+
25
+
26
def get_rank():
    """Global rank of this process; 0 outside a distributed run."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0
32
+
33
+
34
def get_local_rank():
    """Node-local rank from the launcher env; falls back to the global rank."""
    local_rank = os.environ.get('LOCAL_RANK')
    return get_rank() if local_rank is None else int(local_rank)
39
+
40
+
41
def is_main_process():
    """True only on the rank-0 (master) process."""
    return not get_rank()
43
+
44
+
45
def synchronize():
    """Barrier across all processes; a no-op outside distributed runs."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    if dist.get_world_size() > 1:
        dist.barrier()
58
+
59
+
60
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank

    NOTE(review): tensors are staged on "cuda", so this path requires a GPU
    when world_size > 1; single-process runs short-circuit before any
    device work.
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.LongTensor([tensor.numel()]).to("cuda")
    size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
    if local_size != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        # Trim each rank's payload back to its true length before unpickling.
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list
101
+
102
+
103
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.

    NOTE(review): values must be stackable tensors; non-rank-0 processes
    receive unreduced (sum-partial) values by design of dist.reduce.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.reduce(values, dst=0)
        if dist.get_rank() == 0 and average:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict
libs/utils/context_cacher.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class ContextCacher:
    """Tiny process-wide key/value store for passing context between modules."""

    def __init__(self):
        self.infos = dict()

    def reset(self):
        """Drop every cached entry."""
        self.infos.clear()

    def cache_info(self, key, info):
        """Store `info` under `key`, overwriting any previous value."""
        self.infos[key] = info

    def get_info(self, key):
        """Fetch the value cached under `key` (raises KeyError when absent)."""
        return self.infos[key]
13
+
14
+
15
+ global_context_cacher = ContextCacher()
libs/utils/counter.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from collections import defaultdict
3
+ from .comm import distributed, all_gather
4
+
5
+
6
def format_dict(res_dict):
    """Render a dict as a 'key: value, key2: value2' string."""
    return ', '.join('%s: %s' % (key, val) for key, val in res_dict.items())
11
+
12
+
13
class Counter:
    """Accumulates scalar training metrics with a bounded per-metric history.

    NOTE: the name shadows collections.Counter by design of this module;
    only defaultdict is imported from collections here.
    """

    def __init__(self, cache_nums=1000):
        # Keep at most `cache_nums` latest values per metric (None = unbounded).
        self.cache_nums = cache_nums
        self.reset()

    def update(self, metric):
        """Append one value per metric name; tensors are unwrapped to scalars."""
        for name, value in metric.items():
            if isinstance(value, torch.Tensor):
                value = value.item()
            self.metric_dict[name].append(value)
            if self.cache_nums is not None:
                # Drop the oldest entries beyond the cache window.
                self.metric_dict[name] = self.metric_dict[name][-self.cache_nums:]

    def reset(self):
        """Forget all accumulated metrics."""
        self.metric_dict = defaultdict(list)

    def _sync(self):
        """Merge metric histories gathered from every process."""
        merged = defaultdict(list)
        for metric_dict in all_gather(self.metric_dict):
            for name, values in metric_dict.items():
                merged[name].extend(values)
        return merged

    def format_mean(self, sync=True):
        """Format per-metric means; optionally synced across processes."""
        metrics = self._sync() if (sync and distributed()) else self.metric_dict
        means = {name: '%.4f' % (sum(vals) / len(vals)) for name, vals in metrics.items()}
        return format_dict(means)
libs/utils/format_translate.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import copy
3
+ import Polygon
4
+ import numpy as np
5
+ from bs4 import BeautifulSoup as bs
6
+ from .time_counter import format_table
7
+
8
+
9
def check_continuous(seq):
    """Assert that `seq` increases by exactly 1 between neighbours.

    Empty and single-element sequences pass trivially.
    """
    for prev, cur in zip(seq, seq[1:]):
        assert prev + 1 == cur
15
+
16
def table_to_latex(table):
    """Map each cell of `table` to a content-class token.

    Classes: '</none>' empty, '</bold>' the literal '<b> </b>',
    '</space>' a single space, '</line>' anything else.
    """
    def classify(transcript):
        text = ''.join(transcript)
        if text == '':
            return '</none>'
        if text == '<b> </b>':
            return '</bold>'
        if text == ' ':
            return '</space>'
        return '</line>'

    # Layout ids must be contiguous and cover every cell exactly once.
    assert table['layout'].max() + 1 == len(table['cells'])
    return [classify(cell['transcript']) for cell in table['cells']]
30
+
31
def html_to_table(html):
    """Parse a PubTabNet-style HTML token stream into the table dict format.

    Walks the structure tokens, placing each <td> into a dynamically grown
    layout grid while honouring rowspan/colspan, and records which rows fall
    inside <thead>/<tbody>. Returns dict(layout, cells, head_rows, body_rows).
    """
    tokens = html['html']['structure']['tokens']

    layout = [[]]

    def extend_table(x, y):
        # Grow the grid so (x, y) is addressable; new positions get -1.
        assert (x >= 0) and (y >= 0)
        nonlocal layout

        if x >= len(layout[0]):
            for row in layout:
                row.extend([-1] * (x - len(row) + 1))

        if y >= len(layout):
            for _ in range(y - len(layout) + 1):
                layout.append([-1] * len(layout[0]))

    def set_cell_val(x, y, val):
        assert (x >= 0) and (y >= 0)
        nonlocal layout
        extend_table(x, y)
        layout[y][x] = val

    def get_cell_val(x, y):
        assert (x >= 0) and (y >= 0)
        nonlocal layout
        extend_table(x, y)
        return layout[y][x]

    def parse_span_val(token):
        # Extract the integer between the token's first and last quote,
        # e.g. ' colspan="3"' -> 3.
        span_val = int(token[token.index('"') + 1:token.rindex('"')])
        return span_val

    def maskout_left_rows():
        # Trim speculative rows created by rowspans past the current row.
        nonlocal row_idx, layout
        layout = layout[:max(row_idx+1, 1)]

    row_idx = -1
    col_idx = -1
    line_idx = -1           # running cell index, in token order
    inside_head = False
    inside_body = False
    head_rows = list()
    body_rows = list()
    col_span = 1
    row_span = 1
    for token in tokens:
        if token == '<thead>':
            inside_head = True
            maskout_left_rows()
        elif token == '</thead>':
            inside_head = False
            maskout_left_rows()
        elif token == '<tbody>':
            inside_body = True
            maskout_left_rows()
        elif token == '</tbody>':
            inside_body = False
            maskout_left_rows()
        elif token == '<tr>':
            row_idx += 1
            col_idx = -1
            if inside_head:
                head_rows.append(row_idx)
            if inside_body:
                body_rows.append(row_idx)
        elif token in ['<td>', '<td']:
            # '<td' (no '>') means span attributes follow before '</td>'.
            line_idx += 1
            col_idx += 1
            row_span = 1
            col_span = 1
            # Skip columns already claimed by an earlier rowspan.
            while get_cell_val(col_idx, row_idx) != -1:
                col_idx += 1
        elif 'colspan' in token:
            col_span = parse_span_val(token)
        elif 'rowspan' in token:
            row_span = parse_span_val(token)
        elif token == '</td>':
            # Commit the cell over its full span.
            for cur_row_idx in range(row_idx, row_idx + row_span):
                for cur_col_idx in range(col_idx, col_idx + col_span):
                    set_cell_val(cur_col_idx, cur_row_idx, line_idx)
            col_idx += col_span - 1

    # Head/body row ranges must be contiguous and cover the whole grid.
    check_continuous(head_rows)
    check_continuous(body_rows)
    assert len(set(head_rows) | set(body_rows)) == len(layout)
    layout = np.array(layout)
    assert np.all(layout >= 0)

    cells_info = list()
    for cell_idx, cell in enumerate(html['html']['cells']):
        transcript = cell['tokens']
        cell_info = dict(
            transcript=transcript
        )
        if 'bbox' in cell:
            # Keep both the bbox and an equivalent rectangular contour.
            x1, y1, x2, y2 = cell['bbox']
            cell_info['bbox'] = [x1, y1, x2, y2]
            cell_info['segmentation'] = [[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]]
        cells_info.append(cell_info)

    table = dict(
        layout=layout,
        cells=cells_info,
        head_rows=head_rows,
        body_rows=body_rows
    )
    return table
139
+
140
+
141
def segmentation_to_bbox(segmentation):
    """Axis-aligned bounding box [x1, y1, x2, y2] over all contour points."""
    xs = [pt[0] for contour in segmentation for pt in contour]
    ys = [pt[1] for contour in segmentation for pt in contour]
    return [min(xs), min(ys), max(xs), max(ys)]
147
+
148
+
149
def table_to_html(table):
    """Serialize a table dict back into PubTabNet-style HTML structure tokens.

    Emits <thead>/<tbody> sections based on head_rows/body_rows, one <td>
    (with rowspan/colspan attribute tokens) per cell at the cell's first
    raster-scan occurrence, plus per-cell content tokens and optional bbox.
    """
    layout = table['layout']
    head_rows = table['head_rows']
    body_rows = table['body_rows']

    # Exclusive [start, end) row/col span of every cell.
    cells_span = list()
    for cell_idx in range(len(table['cells'])):
        cell_positions = np.argwhere(layout == cell_idx)
        row_span = [np.min(cell_positions[:, 0]), np.max(cell_positions[:, 0]) + 1]
        col_span = [np.min(cell_positions[:, 1]), np.max(cell_positions[:, 1]) + 1]
        assert np.all(layout[row_span[0]:row_span[1], col_span[0]:col_span[1]] == cell_idx)
        cells_span.append([row_span, col_span])

    cells = list()
    tokens = ['<thead>']
    inside_head = True
    for row_idx in range(layout.shape[0]):
        if row_idx in body_rows:
            # First body row closes the head section.
            if inside_head:
                tokens.append('</thead>')
                tokens.append('<tbody>')
                inside_head = False
        tokens.append('<tr>')
        for col_idx in range(table['layout'].shape[1]):
            cell_idx = layout[row_idx][col_idx]
            # Raster order guarantees ids appear in sequence.
            assert cell_idx <= len(cells)
            if cell_idx == len(cells):
                # First occurrence of this cell: emit its <td> tokens.
                row_span, col_span = cells_span[cell_idx]
                if (row_span[1] - row_span[0]) == 1 and (col_span[1] - col_span[0] == 1):
                    tokens.append('<td>')
                else:
                    tokens.append('<td')
                    if (row_span[1] - row_span[0]) > 1:
                        tokens.append(' rowspan="%d"' % (row_span[1] - row_span[0]))
                    if (col_span[1] - col_span[0]) > 1:
                        tokens.append(' colspan="%d"' % (col_span[1] - col_span[0]))
                    tokens.append('>')
                tokens.append('</td>')

                cell = dict()
                cell['tokens'] = table['cells'][cell_idx]['transcript']
                if 'segmentation' in table['cells'][cell_idx]:
                    cell['bbox'] = segmentation_to_bbox(table['cells'][cell_idx]['segmentation'])
                cells.append(cell)
        tokens.append('</tr>')
    # Tables with no body rows still get an (empty) tbody section.
    if inside_head:
        tokens.append('</thead>')
        tokens.append('<tbody>')
    tokens.append('</tbody>')

    html = dict(
        html=dict(
            cells=cells,
            structure=dict(
                tokens=tokens
            )
        )
    )
    return html
208
+
209
+
210
def format_html_for_vis(html):
    """Render structure tokens + cell contents into a styled, prettified HTML page.

    Cell contents are spliced between each '<td...>' and its '</td>' via regex
    match offsets, tracking the cumulative insertion offset.
    """
    html_string = '''<html>
    <head>
    <meta charset="UTF-8">
    <style>
    table, th, td {
      border: 1px solid black;
      font-size: 10px;
    }
    </style>
    </head>
    <body>
    <table frame="hsides" rules="groups" width="100%%">
    %s
    </table>
    </body>
    </html>''' % ''.join(html['html']['structure']['tokens'])
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    cells = [''.join(c['tokens']) for c in html['html']['cells']]
    offset = 0
    for n, cell in zip(cell_nodes, cells):
        # Insert content between the opening tag and '</td>'; `offset`
        # compensates for earlier insertions shifting the match positions.
        html_string = html_string[:n.end(1) + offset] + cell + html_string[n.start(2) + offset:]
        offset += len(cell)
    # prettify the html
    soup = bs(html_string)
    html_string = soup.prettify()
    return html_string
238
+
239
+
240
def format_html(html):
    """Splice cell contents into the structure tokens; return the full page."""
    html_string = '''<html><body><table>%s</table></body></html>''' % ''.join(html['html']['structure']['tokens'])
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    assert len(cell_nodes) == len(html['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    contents = (''.join(c['tokens']) for c in html['html']['cells'])
    offset = 0
    for node, content in zip(cell_nodes, contents):
        # Insert between the opening tag and '</td>'; `offset` compensates
        # for earlier insertions shifting the original match positions.
        html_string = html_string[:node.end(1) + offset] + content + html_string[node.start(2) + offset:]
        offset += len(content)
    return html_string
250
+
251
+
252
def format_table_layout(table):
    """Render a table layout as an ASCII grid of each cell's line indices."""
    layout = table['table']['layout']
    cell_lines = [cell['lines_idx'] for cell in table['table']['cells']]

    table_cells_info = list()
    for row in layout:
        row_cells_info = list()
        for cell_idx in row:
            # Each grid position shows the comma-joined line ids of its cell.
            cell_str = ','.join([str(item) for item in cell_lines[cell_idx]])
            row_cells_info.append(cell_str)
        table_cells_info.append(row_cells_info)

    return format_table(table_cells_info, padding=1)
265
+
266
+
267
def remove_blank_cell(html):
    """Delete every '<td ...></td>' whose content is empty from `html`."""
    pos = 0
    while '<td' in html[pos:]:
        cell_start = html.index('<td', pos)
        content_start = html.index('>', cell_start) + 1
        content_end = html.index('</td>', content_start)
        cell_end = content_end + len('</td>')
        if content_end == content_start:
            # Empty cell: splice it out and rescan from the same offset.
            html = html[:cell_start] + html[cell_end:]
            pos = cell_start
        else:
            pos = cell_end
    return html
libs/utils/logger.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import logging
3
+ import os
4
+ import sys
5
+
6
+ from .comm import get_rank
7
+
8
+
9
+ _default_logger = None
10
+
11
+
12
def __init_logger():
    """Install the module-wide 'default' logger on the master process.

    Attaches a single stdout StreamHandler (guarded against duplicates on
    re-import). On non-zero ranks `_default_logger` stays None, which is
    fine because info()/error() also gate on rank 0.
    """
    global _default_logger
    if get_rank() == 0:
        logger = logging.getLogger('default')
        logger.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")

        if not any([isinstance(item, logging.StreamHandler) for item in logger.handlers]):
            ch = logging.StreamHandler(stream=sys.stdout)
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            logger.addHandler(ch)
        _default_logger = logger
25
+
26
+
27
+ __init_logger()
28
+
29
+
30
def setup_logger(name, save_dir, filename="log.txt"):
    """Replace the module default logger with a named one, optionally file-backed.

    Adds a stdout handler once, removes any previous FileHandlers, and — when
    `save_dir` is truthy — attaches a fresh FileHandler at save_dir/filename
    (creating directories as needed). No-op on non-master ranks.
    """
    global _default_logger
    # don't log results for the non-master process
    if get_rank() == 0:
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")

        if not any([isinstance(item, logging.StreamHandler) for item in logger.handlers]):
            ch = logging.StreamHandler(stream=sys.stdout)
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            logger.addHandler(ch)

        # Drop stale file handlers from earlier setup calls before re-adding.
        logger.handlers = [item for item in logger.handlers if not isinstance(item, logging.FileHandler)]
        if save_dir:
            log_path = os.path.join(save_dir, filename)
            if not os.path.exists(os.path.dirname(log_path)):
                os.makedirs(os.path.dirname(log_path))
            fh = logging.FileHandler(log_path)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            logger.addHandler(fh)

        _default_logger = logger
55
+
56
+
57
def info(*args, **kwargs):
    """Log at INFO level via the default logger, on the master process only."""
    if get_rank() == 0:
        _default_logger.info(*args, **kwargs)
60
+
61
+
62
def error(*args, **kwargs):
    """Log at ERROR level via the default logger, on the master process only."""
    if get_rank() == 0:
        _default_logger.error(*args, **kwargs)
libs/utils/metric.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .utils import match_segment_spans, find_unmatch_segment_spans
3
+ from .teds import TEDS
4
+
5
+
6
class CellMergeAcc:
    """Cell-merge accuracy: a cell counts as correct only when every one of
    its (masked) merge predictions matches the label exactly."""

    def __call__(self, preds, labels, labels_mask):
        masked_preds = preds & labels_mask
        masked_labels = labels & labels_mask
        batch, cells = masked_labels.shape[0], masked_labels.shape[1]
        # A cell is correct iff all of its flattened entries agree with the label.
        agree = (masked_preds == masked_labels).reshape(batch, cells, -1).min(-1)[0]
        # Only cells with at least one positive label entry are counted.
        counted = masked_labels.reshape(batch, cells, -1).max(-1)[0]
        correct_nums = float(torch.sum(agree & counted).detach().cpu().item())
        total_nums = max(float(torch.sum(counted).detach().cpu().item()), 1e-6)
        return correct_nums, total_nums
16
+
17
+
18
class AccMetric:
    """Element-wise accuracy over positions that are masked-in and labeled."""

    def __call__(self, preds, labels, labels_mask):
        valid = (labels_mask != 0) & (labels != -1)  # -1 marks ignored labels
        hits = (preds == labels) & valid
        correct_nums = float(torch.sum(hits).detach().cpu().item())
        total_nums = max(float(torch.sum(valid).detach().cpu().item()), 1e-6)
        return correct_nums, total_nums
24
+
25
+
26
def cal_cls_acc(cls_preds, cls_labels):
    """Return (num correct, num valid) for classification, ignoring -1 labels."""
    valid = cls_labels != -1
    total_nums = float(torch.sum(valid).item())
    pred_nums = float(torch.sum((cls_preds == cls_labels) & valid).item())
    return pred_nums, total_nums
31
+
32
+
33
def cal_segment_pr(pred_segments, fg_spans, bg_spans):
    """Accumulate precision/recall counts for predicted split segments.

    Predictions that fall in neither a foreground nor a background span are
    excluded from the precision denominator.

    Returns:
        (correct predictions, counted predictions, ground-truth span count).
    """
    correct_nums, segment_nums, span_nums = 0, 0, 0
    for segments, fg, bg in zip(pred_segments, fg_spans, bg_spans):
        matched_idx, _ = match_segment_spans(segments, fg)
        orphan_idx = find_unmatch_segment_spans(segments, fg + bg)

        correct_nums += len(matched_idx)
        segment_nums += len(segments) - len(orphan_idx)
        span_nums += len(fg)

    return correct_nums, segment_nums, span_nums
46
+
47
+
48
class TEDSMetric:
    """Thin wrapper around TEDS for scoring parallel lists of HTML strings."""

    def __init__(self, num_workers=1, structure_only=False):
        self.evaluator = TEDS(n_jobs=num_workers, structure_only=structure_only)

    def __call__(self, pred_htmls, label_htmls):
        assert len(pred_htmls) == len(label_htmls)
        # batch_evaluate expects keyed dicts; use list positions as keys.
        preds = {idx: item for idx, item in enumerate(pred_htmls)}
        labels = {idx: dict(html=item) for idx, item in enumerate(label_htmls)}
        score_map = self.evaluator.batch_evaluate(preds, labels)
        return [score_map[idx] for idx in range(len(pred_htmls))]
libs/utils/model_synchronizer.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .comm import get_world_size
3
+ import torch.distributed as dist
4
+
5
+
6
class ModelSynchronizer:
    """BMUF-style (block momentum) parameter synchronizer for distributed training.

    Periodically averages model parameters/buffers across workers and applies
    a Nesterov block-momentum update towards the averaged model.
    """

    # Default block momentum keyed by world size.
    bm_map = {
        2: 0.65,
        4: 0.75,
        8: 0.875,
        12: 0.8875,
        16: 0.9,
        32: 0.9
    }

    def __init__(self, model, sync_rate, bm=None, blr=1.0, rescale_grad=1.0):
        """
        Args:
            model: model whose parameters are synchronized.
            sync_rate: number of calls between synchronizations.
            bm: block momentum; looked up in bm_map by world size when None.
            blr: block learning rate.
            rescale_grad: scale applied to local params when forming the block gradient.
        """
        if bm is None:
            self.bm = self.bm_map[get_world_size()]
        else:
            self.bm = bm
        self.blr = blr
        self.model = model
        self.sync_rate = sync_rate
        self.rescale_grad = rescale_grad
        self.count = 0

        self.param_align()

        self.momentums = dict()
        self.global_params = dict()
        for k, v in self.model.named_parameters():
            temp = torch.zeros_like(v, requires_grad=False)
            temp.copy_(v.data)
            # BUGFIX: store the detached copy, not the live parameter.
            # The original stored `v`, so the "global" model aliased the local
            # parameters and the block-momentum update degenerated.
            self.global_params[k] = temp
            self.momentums[k] = torch.zeros_like(v, requires_grad=False)

    def param_align(self):
        """Broadcast rank-0 parameters/buffers so all workers start identical."""
        for v in self.model.parameters():
            dist.broadcast_multigpu([v.data], src=0)

        for k, v in self.model.named_buffers():
            # BatchNorm step counters should not be broadcast.
            if 'num_batches_tracked' in k:
                continue
            dist.broadcast_multigpu([v.data], src=0)

    def sync_params(self):
        """All-reduce average of parameters and (non-counter) buffers."""
        size = float(get_world_size())
        for v in self.model.parameters():
            dist.all_reduce(v.data, op=dist.ReduceOp.SUM)
            v.data /= size

        for k, v in self.model.named_buffers():
            if 'num_batches_tracked' in k:
                continue
            dist.all_reduce(v.data, op=dist.ReduceOp.SUM)
            v.data /= size

    def __call__(self, final_align=False):
        """Advance one step; synchronize every sync_rate steps or on final_align."""
        self.count += 1
        if (self.count % self.sync_rate == 0) or final_align:
            with torch.no_grad():
                if final_align:
                    self.param_align()
                else:
                    self.sync_params()

                # Nesterov block-momentum update (BMUF).
                for k, v in self.model.named_parameters():
                    global_param = self.global_params[k]
                    momentum = self.momentums[k]
                    grad = v.data * self.rescale_grad - global_param
                    momentum *= self.bm
                    global_param -= momentum
                    momentum += self.blr * grad
                    global_param += (1.0 + self.bm) * momentum
                    v.detach().copy_(global_param.detach())
libs/utils/scitsr/__init__.py ADDED
File without changes
libs/utils/scitsr/eval.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2019-present, Zewen Chi
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ from typing import List
9
+
10
+ from .relation import Relation
11
+ from .table import Table, Chunk
12
+
13
+
14
+ DIR_HORIZ = 1
15
+ DIR_VERT = 2
16
+ DIR_SAME_CELL = 3
17
+
18
+
19
def normalize(s: str, rule=0):
    """Uppercase *s* with all whitespace (CR/LF/space/tab) removed; rule 0 only."""
    if rule != 0:
        raise NotImplementedError
    for ch in ("\r", "\n", " ", "\t"):
        s = s.replace(ch, "")
    return s.upper()
28
+
29
+
30
def eval_relations(gt: "List[List]", res: "List[List]", cmp_blank=True):
    """Evaluate extraction results against ground truth.

    Args:
        gt: a list of lists of Relation (ground truth per table).
        res: a list of lists of Relation (predictions per table).
        cmp_blank: forwarded to Relation.equal when comparing relations.

    Returns:
        (precision, recall) macro-averaged over tables; (0.0, 0.0) on empty input.
    """
    assert len(gt) == len(res)
    # BUGFIX: avoid ZeroDivisionError (total == 0) when there are no tables.
    if not gt:
        return 0.0, 0.0

    tot_prec = 0
    tot_recall = 0
    total = 0
    idx, t = 0, len(gt)
    for _gt, _res in zip(gt, res):
        idx += 1
        # Lightweight in-place progress line.
        print('Eval %d/%d (%d%%)' % (idx, t, idx / t * 100), ' ' * 45, end='\r')
        corr = compare_rel(_gt, _res, cmp_blank)
        precision = corr / len(_res) if len(_res) != 0 else 0
        recall = corr / len(_gt) if len(_gt) != 0 else 0
        tot_prec += precision
        tot_recall += recall
        total += 1

    precision = tot_prec / total
    recall = tot_recall / total
    return precision, recall
65
+
66
def compare_rel(gt_rel: "List[Relation]", res_rel: "List[Relation]", cmp_blank=True):
    """Count ground-truth relations matched by some predicted relation.

    Each predicted relation can be consumed by at most one ground-truth
    relation (greedy first-match in list order).

    Args:
        gt_rel: ground-truth relations.
        res_rel: predicted relations.
        cmp_blank: forwarded to Relation.equal.

    Returns:
        number of matched ground-truth relations.
    """
    count = 0
    remaining = list(res_rel)
    for gt in gt_rel:
        matched_idx = None
        for i, res in enumerate(remaining):
            if gt.equal(res, cmp_blank):
                matched_idx = i
                count += 1
                break
        # BUGFIX: remove by the recorded index rather than relying on the
        # leaked loop variable `i` after the inner loop exits.
        if matched_idx is not None:
            remaining.pop(matched_idx)

    return count
88
+
89
def Table2Relations(t: Table):
    """Convert a Table object to a List of Relation.

    For every cell, emits one horizontal relation to the nearest distinct
    cell on its right and one vertical relation to the nearest distinct cell
    below it, skipping empty (-1) grid positions; results are de-duplicated
    by (from_id, to_id) pairs.
    """
    ret = []
    cl = t.coo2cell_id  # grid of cell ids, -1 for empty positions
    # remove duplicates with pair set
    used = set()

    # look right: nearest distinct neighbour on the same row
    for r in range(t.row_n):
        for cFrom in range(t.col_n - 1):
            cTo = cFrom + 1
            loop = True
            while loop and cTo < t.col_n:
                fid, tid = cl[r][cFrom], cl[r][cTo]
                if fid != -1 and tid != -1 and fid != tid:
                    if (fid, tid) not in used:
                        ret.append(Relation(
                            from_text=t.cells[fid].text,
                            to_text=t.cells[tid].text,
                            direction=DIR_HORIZ,
                            from_id=fid,
                            to_id=tid,
                            no_blanks=cTo - cFrom - 1  # empty cells skipped over
                        ))
                        used.add((fid, tid))
                    loop = False
                else:
                    if fid != -1 and tid != -1 and fid == tid:
                        # Same spanning cell: advance the from-pointer for
                        # subsequent while iterations (the for-loop variable
                        # itself is rebound on the next outer iteration).
                        cFrom = cTo
                    cTo += 1

    # look down: nearest distinct neighbour in the same column
    for c in range(t.col_n):
        for rFrom in range(t.row_n - 1):
            rTo = rFrom + 1
            loop = True
            while loop and rTo < t.row_n:
                fid, tid = cl[rFrom][c], cl[rTo][c]
                if fid != -1 and tid != -1 and fid != tid:
                    if (fid, tid) not in used:
                        ret.append(Relation(
                            from_text=t.cells[fid].text,
                            to_text=t.cells[tid].text,
                            direction=DIR_VERT,
                            from_id=fid,
                            to_id=tid,
                            no_blanks=rTo - rFrom - 1
                        ))
                        used.add((fid, tid))
                    loop = False
                else:
                    if fid != -1 and tid != -1 and fid == tid:
                        rFrom = rTo
                    rTo += 1

    return ret
146
+
147
def json2Table(json_obj, tid="", splitted_content=False):
    """Construct a Table object from a json object.

    Args:
        json_obj: parsed json holding a "cells" list.
        tid: table id forwarded to Table.
        splitted_content: when True, cell content is a token list to join.

    Returns:
        a Table object
    """
    row_n, col_n = 0, 0
    cells = []
    for cell_obj in json_obj["cells"]:
        content = cell_obj["content"]
        if content is None:
            continue
        content = " ".join(content) if splitted_content else content.strip()
        if content == "":
            continue
        start_row, end_row = cell_obj["start_row"], cell_obj["end_row"]
        start_col, end_col = cell_obj["start_col"], cell_obj["end_col"]
        # Track the largest indices to size the grid.
        row_n = max(row_n, end_row)
        col_n = max(col_n, end_col)
        cells.append(Chunk(content, (start_row, end_row, start_col, end_col)))
    return Table(row_n + 1, col_n + 1, cells, tid)
175
+
176
def json2Relations(json_obj, splitted_content):
    """Parse a json table and convert it straight to a list of Relation."""
    table = json2Table(json_obj, "", splitted_content)
    return Table2Relations(table)
178
+
179
+
libs/utils/scitsr/relation.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2019-present, Zewen Chi
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
def normalize(s: str, rule=0):
    """Strip all whitespace characters and uppercase *s* (only rule 0 exists)."""
    if rule != 0:
        raise NotImplementedError
    stripped = "".join(ch for ch in s if ch not in "\r\n \t")
    return stripped.upper()
19
+
20
+
21
class Relation(object):
    """An adjacency relation between two table cells (horizontal or vertical)."""

    def __init__(self, from_text, to_text, direction, from_id=0, to_id=0, no_blanks=0):
        self.from_text = from_text
        self.to_text = to_text
        self.direction = direction
        self.no_blanks = no_blanks
        self.from_id = from_id
        self.to_id = to_id

    def _normalized_texts(self, rl):
        """Normalize both endpoints of self and rl, warning on empty text."""
        texts = (normalize(self.from_text), normalize(self.to_text),
                 normalize(rl.from_text), normalize(rl.to_text))
        if any(len(t) == 0 for t in texts):
            print("Warning: Text comparison of 0-length strings after normalization",
                  file=sys.stderr)
        return texts

    def __eq__(self, rl):
        this_ft, this_tt, rl_ft, rl_tt = self._normalized_texts(rl)
        return this_ft == rl_ft and this_tt == rl_tt and \
            self.direction == rl.direction and self.no_blanks == rl.no_blanks

    def equal(self, rl, cmp_blank=True):
        """Like __eq__, but the blank-count comparison is optional."""
        this_ft, this_tt, rl_ft, rl_tt = self._normalized_texts(rl)
        if this_ft != rl_ft or this_tt != rl_tt:
            return False
        if self.direction != rl.direction:
            return False
        return self.no_blanks == rl.no_blanks if cmp_blank else True

    def __str__(self):
        return "%d:%d" % (self.direction, self.no_blanks)
libs/utils/scitsr/table.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2019-present, Zewen Chi
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import json
8
+
9
+ from typing import Iterable, List, Tuple
10
+
11
+
12
def load_chunks(chunk_path):
    """Load Chunk objects from a json file, repairing swapped x coordinates.

    NOTE: zero-size/empty-text chunks are intentionally kept (the filter is
    disabled upstream).
    """
    with open(chunk_path, 'r') as f:
        raw_chunks = json.load(f)['chunks']
    loaded = []
    for raw in raw_chunks:
        if raw["pos"][1] < raw["pos"][0]:
            # Repair an inverted x-interval before constructing the Chunk.
            raw["pos"][0], raw["pos"][1] = raw["pos"][1], raw["pos"][0]
            print("Warning load illegal chunk.")
        loaded.append(Chunk.load_from_dict(raw))
    return loaded
26
+
27
+
28
class Box(object):
    """Axis-aligned box with pos = (x1, x2, y1, y2)."""

    def __init__(self, pos):
        """pos: (x1, x2, y1, y2)"""
        self.set_pos(pos)

    def set_pos(self, pos):
        """Store coordinates and derive width/height; intervals must be ordered."""
        x1, x2, y1, y2 = pos[0], pos[1], pos[2], pos[3]
        assert x1 <= x2
        assert y1 <= y2
        self.x1, self.x2 = x1, x2
        self.y1, self.y2 = y1, y2
        self.w = x2 - x1
        self.h = y2 - y1
        self.pos = pos

    def __lt__(self, other):
        # Order boxes lexicographically by their pos tuples.
        return self.pos.__lt__(other.pos)

    def __contains__(self, other):
        # True when *other* lies entirely inside this box.
        return (other.x1 >= self.x1 and other.x2 <= self.x2
                and other.y1 >= self.y1 and other.y2 <= self.y2)

    def __str__(self):
        return 'Box(%d, %d, %d, %d)' % self.pos

    def __hash__(self):
        return self.pos.__hash__()
59
+
60
+
61
class Chunk(Box):
    """A Box carrying text content, a font size, and an optional cell id."""

    def __init__(self, text: str, pos: Tuple, size: float = 0.0, cell_id=None):
        super(Chunk, self).__init__(pos)
        self.text = text
        self.size = size
        self.cell_id = cell_id

    def __str__(self):
        return 'Chunk(text="%s", pos=(%d, %d, %d, %d))' % (self.text, *self.pos)

    def __repr__(self):
        return self.__str__()

    def dump_as_json_obj(self):
        """Serialize to a plain dict (inverse of load_from_dict)."""
        return {"text": self.text, "pos": self.pos, "cell_id": self.cell_id}

    @classmethod
    def load_from_dict(cls, d):
        """Build a Chunk from a dict with "text", "pos" and optional "cell_id"."""
        assert type(d) == dict
        assert type(d["text"]) == str
        assert len(d["pos"]) == 4
        return cls(d["text"].strip(), d["pos"], cell_id=d.get("cell_id"))
85
+
86
+
87
class Table(object):
    """
    The output of table segmentation.
    With the Table object, we can get the set of cells
    and their corresponding text.
    """

    def __init__(self, row_n, col_n, cells: Iterable[Chunk] = None, tid=""):
        # NOTE the Chunk object here represents the coordinate of the cell in
        # the table: x is the row index, y the column index.
        self.tid = tid
        self.row_n = row_n
        self.col_n = col_n
        # Grid mapping (row, col) -> index into self.cells; -1 when empty.
        self.coo2cell_id = [
            [-1 for _ in range(col_n)] for _ in range(row_n)]
        self.cells: List[Chunk] = []
        for cell in cells:
            self.add_cell(cell)

    def reverse(self, is_col=True):
        """Mirror all cells along rows (is_col=True) or columns (is_col=False)."""
        cells = self.cells
        self.cells = []
        for cell in cells:
            if is_col:
                mirrored = Chunk(cell.text, (
                    self.row_n - cell.x2, self.row_n - cell.x1, cell.y1, cell.y2))
            else:
                # BUGFIX: the mirrored y-interval must be emitted low-to-high
                # ((col_n - y2, col_n - y1)); the original emitted it in the
                # reversed order, which trips Box.set_pos's ordering assertion
                # for any cell with y1 < y2.
                # NOTE(review): mirroring with n - idx (not n - 1 - idx) can
                # still yield an index == col_n/row_n — confirm intended range.
                mirrored = Chunk(cell.text, (
                    cell.x1, cell.x2, self.col_n - cell.y2, self.col_n - cell.y1))
            self.add_cell(mirrored)

    def add_cell(self, cell: Chunk):
        """Register *cell* and stamp its id over its (row, col) span."""
        # TODO Check conflicts of cells
        assert cell.y2 < self.col_n
        assert cell.x2 < self.row_n

        for x in range(cell.x1, cell.x2 + 1, 1):
            for y in range(cell.y1, cell.y2 + 1, 1):
                self.coo2cell_id[x][y] = len(self.cells)
        self.cells.append(cell)

    def __getitem__(self, id_tuple):
        """Return the cell occupying grid position (row_id, col_id)."""
        row_id, col_id = id_tuple
        assert row_id < self.row_n and col_id < self.col_n
        return self.cells[self.coo2cell_id[row_id][col_id]]
libs/utils/teds.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 IBM
2
+ # Author: peter.zhong@au1.ibm.com
3
+ #
4
+ # This is free software; you can redistribute it and/or modify
5
+ # it under the terms of the Apache 2.0 License.
6
+ #
7
+ # This software is distributed in the hope that it will be useful,
8
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ # Apache 2.0 License for more details.
11
+
12
+ import distance
13
+ from apted import APTED, Config
14
+ from apted.helpers import Tree
15
+ from lxml import etree, html
16
+ from collections import deque
17
+ from tqdm import tqdm
18
+ from concurrent.futures import ProcessPoolExecutor, as_completed
19
+
20
+
21
def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0):
    """
    A parallel version of the map function with a progress bar.

    Args:
        array (array-like): An array to iterate over.
        function (function): A python function to apply to the elements of array
        n_jobs (int, default=16): The number of cores to use
        use_kwargs (boolean, default=False): Whether to consider the elements of
            array as dictionaries of keyword arguments to function
        front_num (int, default=0): The number of iterations to run serially
            before kicking off the parallel job. Useful for catching bugs
    Returns:
        [function(array[0]), function(array[1]), ...]
    """
    apply = (lambda a: function(**a)) if use_kwargs else function
    # Run the first few iterations serially to catch bugs early.
    front = [apply(a) for a in array[:front_num]] if front_num > 0 else []
    rest = array[front_num:]
    # With a single job just run a plain (progress-wrapped) comprehension.
    if n_jobs == 1:
        return front + [apply(a) for a in tqdm(rest)]
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in rest]
        else:
            futures = [pool.submit(function, a) for a in rest]
        progress_opts = {
            'total': len(futures),
            'unit': 'it',
            'unit_scale': True,
            'leave': True
        }
        # Drain as_completed purely to drive the progress bar.
        for _ in tqdm(as_completed(futures), **progress_opts):
            pass
    results = []
    # Collect results in submission order, storing raised exceptions in-place.
    for _, future in tqdm(enumerate(futures)):
        try:
            results.append(future.result())
        except Exception as exc:
            results.append(exc)
    return front + results
68
+
69
+
70
class TableTree(Tree):
    """APTED tree node describing one HTML tag (with span/content for td)."""

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        self.children = list(children)

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag == 'td':
            node_repr = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
                (self.tag, self.colspan, self.rowspan, self.content)
        else:
            node_repr = '"tag": %s' % self.tag
        node_repr += ''.join(child.bracket() for child in self.children)
        return "{{{}}}".format(node_repr)
88
+
89
+
90
class CustomConfig(Config):
    """APTED cost config comparing tags, spans, and normalized cell text."""

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value"""
        return max(len(seq) for seq in sequences)

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1"""
        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Compares attributes of trees"""
        attrs_differ = (node1.tag != node2.tag
                        or node1.colspan != node2.colspan
                        or node1.rowspan != node2.rowspan)
        if attrs_differ:
            return 1.
        # For matching <td> nodes, charge the text edit distance.
        if node1.tag == 'td' and (node1.content or node2.content):
            return self.normalized_distance(node1.content, node2.content)
        return 0.
110
+
111
+
112
class TEDS(object):
    ''' Tree Edit Distance basead Similarity
    '''
    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
        # structure_only: compare tag structure only, ignoring cell text.
        # n_jobs: number of processes used by batch_evaluate.
        # ignore_nodes: iterable of tag names stripped before comparison.
        assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1'
        self.structure_only = structure_only
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes
        self.__tokens__ = []

    def tokenize(self, node):
        ''' Tokenizes table cells

        Appends the open tag, the text characters, recursed children, the
        close tag (except for 'unk') and tail text into self.__tokens__.
        '''
        self.__tokens__.append('<%s>' % node.tag)
        if node.text is not None:
            self.__tokens__ += list(node.text)
        for n in node.getchildren():
            self.tokenize(n)
        if node.tag != 'unk':
            self.__tokens__.append('</%s>' % node.tag)
        if node.tag != 'td' and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def load_html_tree(self, node, parent=None):
        ''' Converts HTML tree to the format required by apted
        '''
        global __tokens__
        if node.tag == 'td':
            if self.structure_only:
                cell = []
            else:
                # Tokenize the cell, dropping the outer <td>/</td> tokens.
                self.__tokens__ = []
                self.tokenize(node)
                cell = self.__tokens__[1:-1].copy()
            new_node = TableTree(node.tag,
                                 int(node.attrib.get('colspan', '1')),
                                 int(node.attrib.get('rowspan', '1')),
                                 cell, *deque())
        else:
            new_node = TableTree(node.tag, None, None, None, *deque())
        if parent is not None:
            parent.children.append(new_node)
        if node.tag != 'td':
            # td is a leaf of the comparison tree; do not recurse into it here.
            for n in node.getchildren():
                self.load_html_tree(n, new_node)
        if parent is None:
            # Only the root call returns the built tree.
            return new_node

    def evaluate(self, pred, true):
        ''' Computes TEDS score between the prediction and the ground truth of a
            given sample

        Returns 1 - edit_distance / max_tree_size, or 0.0 when either input
        is empty or lacks a body/table element.
        '''
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
        pred = html.fromstring(pred, parser=parser)
        true = html.fromstring(true, parser=parser)
        if pred.xpath('body/table') and true.xpath('body/table'):
            pred = pred.xpath('body/table')[0]
            true = true.xpath('body/table')[0]
            if self.ignore_nodes:
                etree.strip_tags(pred, *self.ignore_nodes)
                etree.strip_tags(true, *self.ignore_nodes)
            # Normalize the edit distance by the larger tree's node count.
            n_nodes_pred = len(pred.xpath(".//*"))
            n_nodes_true = len(true.xpath(".//*"))
            n_nodes = max(n_nodes_pred, n_nodes_true)
            tree_pred = self.load_html_tree(pred)
            tree_true = self.load_html_tree(true)
            distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
            return 1.0 - (float(distance) / n_nodes)
        else:
            return 0.0

    def batch_evaluate(self, pred_json, true_json):
        ''' Computes TEDS score between the prediction and the ground truth of
            a batch of samples
            @params pred_json: {'FILENAME': 'HTML CODE', ...}
            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
            @output: {'FILENAME': 'TEDS SCORE', ...}
        '''
        samples = true_json.keys()
        if self.n_jobs == 1:
            scores = [self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
        else:
            # Missing predictions default to '' and score 0.0.
            inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
            scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
        scores = dict(zip(samples, scores))
        return scores
200
+
201
+
202
if __name__ == '__main__':
    # Smoke test: score sample predictions against ground truth and print.
    import json
    import pprint
    with open('sample_pred.json') as fp:
        pred_json = json.load(fp)
    with open('sample_gt.json') as fp:
        true_json = json.load(fp)
    teds = TEDS(n_jobs=4)
    scores = teds.batch_evaluate(pred_json, true_json)
    pprint.PrettyPrinter().pprint(scores)
libs/utils/teds_multiprocess.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import tqdm
3
+ from libs.utils.teds import TEDS
4
+ from collections import defaultdict
5
+
6
+
7
+ # def parse_args():
8
+ # import argparse
9
+ # parser = argparse.ArgumentParser()
10
+ # parser.add_argument('pred_path', type=str, default=None)
11
+ # parser.add_argument('label_path', type=str, default=None)
12
+ # parser.add_argument('-s', '--structure_only', action='store_true')
13
+ # parser.add_argument('-n', '--num_workers', type=int, default=1)
14
+ # args = parser.parse_args()
15
+ # return args
16
+
17
+
18
def is_simple(data):
    """Return True when the html string contains no colspan/rowspan markup."""
    return ('colspan' not in data) and ('rowspan' not in data)
23
+
24
+
25
def judge_type(data):
    """Classify a table's html as 'Simple' or 'Complex' (contains spans)."""
    return 'Simple' if is_simple(data) else 'Complex'
30
+
31
+
32
def single_process(pred_htmls, label_htmls, structure_only=False):
    """Score every labeled sample with TEDS in the current process."""
    evaluator = TEDS(structure_only=structure_only)
    scores = dict()
    for key in tqdm.tqdm(label_htmls.keys()):
        # Missing predictions default to '' and score 0.0.
        scores[key] = evaluator.evaluate(pred_htmls.get(key, ''), label_htmls[key]['html'])
    return scores
41
+
42
+
43
def _worker(pred_htmls, label_htmls, keys, result_queue, structure_only=False):
    """Worker process: score the assigned keys and push (key, score) pairs."""
    evaluator = TEDS(structure_only=structure_only)
    for key in keys:
        score = evaluator.evaluate(pred_htmls.get(key, ''), label_htmls[key]['html'])
        result_queue.put((key, score))
50
+
51
+
52
def multi_process(pred_htmls, label_htmls, num_workers, structure_only=False):
    """Score samples with TEDS across multiple worker processes.

    Args:
        pred_htmls: {key: html} predictions.
        label_htmls: {key: {'html': html}} ground truth.
        num_workers: number of worker processes to spawn.
        structure_only: forwarded to the TEDS evaluator in each worker.

    Returns:
        {key: score} for every key in label_htmls.
    """
    import multiprocessing
    manager = multiprocessing.Manager()
    result_queue = manager.Queue()
    keys = list(label_htmls.keys())
    workers = list()
    for worker_idx in range(num_workers):
        worker = multiprocessing.Process(
            target=_worker,
            args=(
                pred_htmls,
                label_htmls,
                keys[worker_idx::num_workers],  # round-robin key split
                result_queue,
                # BUGFIX: structure_only was previously not forwarded, so
                # workers always evaluated with content comparison enabled.
                structure_only
            )
        )
        worker.daemon = True
        worker.start()
        workers.append(worker)
    scores = dict()
    tq = tqdm.tqdm(total=len(keys))
    for _ in range(len(keys)):
        key, val = result_queue.get()
        scores[key] = val
        # Show the running mean TEDS in the progress bar.
        teds = sum(scores.values()) / len(scores)
        tq.set_description('Teds: %s' % teds, False)
        tq.update()
    tq.close()
    return scores
81
+
82
+
83
def evaluate(pred_htmls, label_htmls, num_workers, structure_only=False):
    """Compute overall TEDS plus per-type (Simple/Complex) averages."""
    if num_workers <= 1:
        scores = single_process(pred_htmls, label_htmls, structure_only)
    else:
        scores = multi_process(pred_htmls, label_htmls, num_workers, structure_only)
    teds = sum(scores.values()) / len(scores)

    # Bucket scores by table complexity, then average each bucket.
    typed_scores = defaultdict(list)
    for key, score in scores.items():
        typed_scores[judge_type(label_htmls[key]['html'])].append(score)

    typed_teds = {kind: sum(vals) / len(vals) for kind, vals in typed_scores.items()}
    return teds, typed_teds
97
+
98
+
99
+ # def main():
100
+ # args = parse_args()
101
+ # pred_data = json.load(open(args.pred_path))
102
+ # label_data = json.load(open(args.label_path))
103
+
104
+ # teds, typed_teds = evaluate(pred_data, label_data, args.num_workers, args.structure_only)
105
+ # print('Teds: %s' % teds)
106
+ # for key, val in typed_teds.items():
107
+ # print(' %s Teds: %s' % (key, val))
108
+
109
+
110
+ # if __name__ == '__main__':
111
+ # main()
libs/utils/time_counter.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import time
3
+ import datetime
4
+
5
+
6
class TimeCounter:
    """Estimates remaining training time from elapsed batches."""

    def __init__(self, start_epoch, num_epochs, epoch_iters):
        self.start_epoch = start_epoch
        self.num_epochs = num_epochs
        self.epoch_iters = epoch_iters
        self.start_time = None

    def reset(self):
        """Start (or restart) the wall-clock reference point."""
        self.start_time = time.time()

    def step(self, epoch, batch):
        """Return the estimated remaining time as an H:MM:SS-style string."""
        elapsed = time.time() - self.start_time
        done_batches = (epoch - self.start_epoch) * self.epoch_iters + batch
        per_batch = elapsed / done_batches
        projected_total = (self.num_epochs - self.start_epoch) * self.epoch_iters * per_batch
        remaining = projected_total - elapsed
        return str(datetime.timedelta(seconds=remaining))
23
+
24
+
25
def format_table(table, padding=1):
    """Render a 2D list as a text table with Unicode box-drawing borders.

    Args:
        table: list of rows; every item is str()-ed. Rows may be ragged;
            missing cells render empty.
        padding: spaces added on each side of every cell.

    Returns:
        the rendered multi-line string (trailing newline included).

    NOTE: the border characters in the checked-in file were mojibake
    (U+FFFD replacement pairs); they are restored here as the standard
    box-drawing set.
    """
    table = [[str(subitem) for subitem in item] for item in table]
    num_cols = max(len(item) for item in table)
    cols_width = [0] * num_cols

    # Column width = widest cell in that column.
    for row in table:
        for col_idx, cell in enumerate(row):
            cols_width[col_idx] = max(cols_width[col_idx], len(cell))

    # Top border.
    string = '┌'
    for col_idx in range(num_cols):
        string += '─' * (padding * 2 + cols_width[col_idx])
        string += '┐' if col_idx == num_cols - 1 else '┬'
    string += '\n'

    for row_idx, row in enumerate(table):
        # Content line, each cell centered inside its column.
        string += '│'
        for col_idx in range(num_cols):
            word = row[col_idx] if col_idx < len(row) else ''
            col_width = cols_width[col_idx]
            left_pad = (col_width - len(word)) // 2
            right_pad = col_width - len(word) - left_pad
            string += ' ' * (padding + left_pad)
            string += word
            string += ' ' * (padding + right_pad)
            string += '│'
        string += '\n'

        # Separator after every row; the last one is the bottom border.
        last_row = row_idx == len(table) - 1
        string += '└' if last_row else '├'
        for col_idx in range(num_cols):
            string += '─' * (padding * 2 + cols_width[col_idx])
            if col_idx == num_cols - 1:
                string += '┘' if last_row else '┤'
            else:
                string += '┴' if last_row else '┼'
        string += '\n'
    return string
79
+
80
+
81
class TicTocCounter:
    """Accumulates named tic/toc wall-clock intervals for lightweight profiling."""

    def __init__(self):
        self.tics = dict()             # name -> timestamp of last tic()
        self.seps = defaultdict(list)  # name -> list of measured durations

    def tic(self, name):
        """Mark the start time for *name*."""
        self.tics[name] = time.time()

    def toc(self, name):
        """Record elapsed time since the last tic(name); no-op without a tic."""
        now = time.time()
        if name in self.tics:
            self.seps[name].append(now - self.tics[name])

    def __repr__(self):
        rows = [['Name', 'Mean Time', 'Total Time']]
        for name, durations in self.seps.items():
            total = sum(durations)
            rows.append([name, '%0.4f' % (total / len(durations)), '%0.4f' % total])
        return 'TicTocCount Result:\n' + format_table(rows)

    def reset(self):
        """Forget all pending tics and recorded durations."""
        self.tics.clear()
        self.seps.clear()


# Shared module-level counter instance.
global_tictoc_counter = TicTocCounter()
libs/utils/utils.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import copy
3
+ import Polygon
4
+ import numpy as np
5
+
6
+
7
def cal_mean_lr(optimizer):
    """Return the average learning rate across all optimizer param groups."""
    total = 0.0
    count = 0
    for group in optimizer.param_groups:
        total += group['lr']
        count += 1
    return total / count
10
+
11
+
12
def cal_pr_f1(pr_info):
    """Compute precision/recall/F1 from counts.

    Args:
        pr_info: sequence (num_correct, num_predicted, num_labeled).

    Returns:
        (precision, recall, f1). Zero denominators yield 0.0 instead of
        raising ZeroDivisionError (e.g. no predictions or no labels yet).
    """
    precision = pr_info[0] / pr_info[1] if pr_info[1] else 0.0
    recall = pr_info[0] / pr_info[2] if pr_info[2] else 0.0
    if precision + recall == 0:
        return precision, recall, 0.0
    f1 = 2*precision*recall/(precision+recall)
    return precision, recall, f1
17
+
18
+
19
def match_segment_spans(segments, spans):
    """Greedily pair segment positions with half-open spans.

    A segment at position p matches span s when s[0] <= p < s[1]. Each span
    is consumed at most once; a segment is recorded once per span it claims
    (the scan does not stop at the first match).

    Returns:
        (matched_segment_indices, matched_span_indices), aligned pairwise.
    """
    matched_segments, matched_spans = [], []
    for seg_idx, position in enumerate(segments):
        for span_idx, span in enumerate(spans):
            if span_idx in matched_spans:
                continue
            if span[0] <= position < span[1]:
                matched_segments.append(seg_idx)
                matched_spans.append(span_idx)
    return matched_segments, matched_spans
31
+
32
+
33
def find_unmatch_segment_spans(segments, spans):
    """Return indices of segments falling inside none of the half-open spans."""
    return [
        seg_idx for seg_idx, position in enumerate(segments)
        if not any(span[0] <= position < span[1] for span in spans)
    ]
45
+
46
+
47
def parse_layout(spans, num_rows, num_cols):
    """Build a (num_rows, num_cols) grid of cell ids from span rectangles.

    Args:
        spans: iterable of (x1, y1, x2, y2) grid coordinates, inclusive.
        num_rows, num_cols: grid dimensions.

    Returns:
        np.ndarray of ints where each entry is a cell id, renumbered in
        row-major first-appearance order (positions not covered by any span
        start as -1 and are renumbered too).
    """
    # Bug fix: np.int was removed in NumPy 1.24 (requirements pin
    # numpy==1.26.4), so dtype=np.int raised AttributeError. The builtin
    # int maps to NumPy's default integer type.
    layout = np.full([num_rows, num_cols], -1, dtype=int)
    cell_count = 0
    for x1, y1, x2, y2 in spans:
        layout[y1:y2+1, x1:x2+1] = cell_count
        cell_count += 1

    # renumber ids to row-major first-appearance order; each position is
    # read exactly once, so overwriting in place is safe
    cells_id = list()
    for row_idx in range(num_rows):
        for col_idx in range(num_cols):
            cell_id = layout[row_idx, col_idx]
            if cell_id in cells_id:
                layout[row_idx, col_idx] = cells_id.index(cell_id)
            else:
                layout[row_idx, col_idx] = len(cells_id)
                cells_id.append(cell_id)
    return layout
64
+
65
+
66
def parse_cells(layout, spans, row_segments, col_segments):
    """Build cell dicts (rectangular segmentations in pixel coords) from a layout grid.

    Each cell id's grid bounding box is mapped to pixel coordinates through the
    row/col segment boundary lists; cells originating from an explicit span get
    a placeholder transcript.
    """
    cells = []
    for cell_id in range(int(np.max(layout)) + 1):
        rows, cols = np.nonzero(layout == cell_id)
        top, bottom = np.min(rows), np.max(rows)
        left, right = np.min(cols), np.max(cols)
        # the occupied region must be a solid rectangle of this cell id
        assert np.all(layout[top:bottom, left:right] == cell_id)
        px1 = col_segments[left]
        px2 = col_segments[right + 1]
        py1 = row_segments[top]
        py2 = row_segments[bottom + 1]
        cells.append(dict(
            segmentation=[[[px1, py1], [px2, py1], [px2, py2], [px1, py2]]]
        ))
    for span in spans:
        # span = (x1, y1, ...): mark the cell anchored at that grid position
        cells[layout[span[1], span[0]]]['transcript'] = 'None'
    return cells
88
+
89
+
90
def segmentation_to_bbox(segmentation):
    """Return [x1, y1, x2, y2] enclosing all contour points of *segmentation*."""
    xs = [pt[0] for contour in segmentation for pt in contour]
    ys = [pt[1] for contour in segmentation for pt in contour]
    return [min(xs), min(ys), max(xs), max(ys)]
96
+
97
+
98
def extend_cell_lines(cells, lines):
    """Attach to each cell (in place, key 'lines_idx') the text lines it contains.

    Every line is assigned to the single cell whose polygon overlaps it most,
    measured relative to the line's own area; zero-area lines are skipped.
    Within a cell, line indices are ordered top-to-bottom by bbox y1.
    """
    def to_polygon(segmentation):
        poly = Polygon.Polygon()
        for contour in segmentation:
            poly = poly + Polygon.Polygon(contour)
        return poly

    lines = copy.deepcopy(lines)

    cell_polys = [to_polygon(cell['segmentation']) for cell in cells]
    line_polys = [to_polygon(line['segmentation']) for line in lines]

    assigned = [[] for _ in cells]
    for line_idx, line_poly in enumerate(line_polys):
        if line_poly.area() == 0:
            continue
        line_area = line_poly.area()
        best_overlap = 0
        best_cell = None
        for cell_idx, cell_poly in enumerate(cell_polys):
            overlap = (cell_poly & line_poly).area() / line_area
            if overlap > best_overlap:
                best_cell = cell_idx
                best_overlap = overlap
        if best_overlap > 0:
            assigned[best_cell].append(line_idx)

    tops = [segmentation_to_bbox(line['segmentation'])[1] for line in lines]
    for cell, line_ids in zip(cells, assigned):
        cell['lines_idx'] = sorted(line_ids, key=lambda idx: tops[idx])
131
+
132
def rerange_layout(table):
    """Renumber ``table['layout']`` cell ids (in place) to row-major
    first-appearance order and reorder ``table['cells']`` to match,
    dropping cells no longer referenced by the layout."""
    layout = table['layout']
    old_to_new = {}
    kept_old_ids = []
    for row_idx in range(layout.shape[0]):
        for col_idx in range(layout.shape[1]):
            old_id = layout[row_idx, col_idx]
            if old_id not in old_to_new:
                old_to_new[old_id] = len(kept_old_ids)
                kept_old_ids.append(old_id)
            # each position is visited exactly once, so in-place rewrite is safe
            layout[row_idx, col_idx] = old_to_new[old_id]
    table['layout'] = layout
    table['cells'] = [table['cells'][old_id] for old_id in kept_old_ids]
145
+
146
def cal_cell_spans(table):
    """Return the inclusive grid span [x1, y1, x2, y2] for every cell id."""
    layout = table['layout']
    spans = []
    for cell_id in range(len(table['cells'])):
        rows, cols = np.nonzero(layout == cell_id)
        top, bottom = np.min(rows), np.max(rows)
        left, right = np.min(cols), np.max(cols)
        # sanity check: the occupied area forms a solid rectangle
        assert np.all(layout[top:bottom, left:right] == cell_id)
        spans.append([left, top, right, bottom])
    return spans
159
+
160
+
161
def remove_repeat_rcs(table):
    """Collapse duplicated rows/columns of ``table['layout']`` in place.

    Repeats until a fixed point, each pass dropping:
      * body rows whose entries all belong to a single cell,
      * columns whose entries all belong to a single cell,
      * any row/column whose cell-id pattern duplicates an earlier one.
    ``head_rows``/``body_rows`` are remapped to the kept row indices each
    pass, and cell ids are re-ranged at the end via ``rerange_layout``.
    """
    layout = table['layout']
    head_rows = table['head_rows']
    body_rows = table['body_rows']
    while True:
        num_rows = layout.shape[0]
        num_cols = layout.shape[1]
        valid_rows_idx = list()
        valid_rows_key = list()

        for row_idx in range(num_rows):
            row = layout[row_idx, :]
            if len(np.unique(row)) == 1 and row_idx in body_rows: # remove repeated row
                continue
            # string key identifies rows with an identical cell-id pattern
            row_key = ','.join([str(item) for item in row])
            if row_key not in valid_rows_key:
                valid_rows_idx.append(row_idx)
                valid_rows_key.append(row_key)

        valid_cols_idx = list()
        valid_cols_key = list()
        for col_idx in range(num_cols):
            col = layout[:, col_idx]
            if len(np.unique(col)) == 1: # remove repeated col
                continue
            col_key = ','.join([str(item) for item in col])
            if col_key not in valid_cols_key:
                valid_cols_idx.append(col_idx)
                valid_cols_key.append(col_key)
        # fixed point: nothing was removed this pass
        if (len(valid_rows_idx) == num_rows) and (len(valid_cols_idx) == num_cols):
            break
        layout = layout[valid_rows_idx][:, valid_cols_idx]
        # remap head/body row indices onto the kept (re-sliced) layout
        head_rows = [n_idx for n_idx, o_idx in enumerate(valid_rows_idx) if o_idx in head_rows]
        body_rows = [n_idx for n_idx, o_idx in enumerate(valid_rows_idx) if o_idx in body_rows]

    table['layout'] = layout
    table['head_rows'] = head_rows
    table['body_rows'] = body_rows
    rerange_layout(table)
200
+
201
+
202
def pred_result_to_table(pred_result):
    """Assemble a table dict from the raw prediction tuple.

    Args:
        pred_result: (row_segments, col_segments, divide, spans) where
            ``divide`` is the row index separating header from body.

    Returns:
        dict with keys layout, head_rows, body_rows, cells.
    """
    row_segments, col_segments, divide, spans = pred_result
    num_rows = len(row_segments) - 1
    num_cols = len(col_segments) - 1

    layout = parse_layout(spans, num_rows, num_cols)
    table = dict(
        layout=layout,
        head_rows=list(range(0, divide)),          # rows above the divide
        body_rows=list(range(divide, num_rows)),   # remaining rows
        cells=parse_cells(layout, spans, row_segments, col_segments),
    )

    # remove_repeat_rcs(table)  # kept disabled, as in the original

    return table
222
+
223
+
224
def is_simple_table(table):
    """True when every grid position is its own cell (no merged cells)."""
    num_rows, num_cols = table['layout'].shape
    return num_rows * num_cols == len(table['cells'])
231
+
232
+
233
def tensor_to_image(tensor):
    """Convert a CHW (or HW) tensor to a uint8 numpy image for visualization.

    Multi-channel tensors that are not 1- or 3-channel are collapsed to one
    channel via the per-pixel L2 norm. Values are min-max scaled to [0, 255].

    Fix: a constant tensor now maps to all zeros instead of dividing by zero
    (which produced NaN and an undefined uint8 cast).
    """
    image = tensor.detach().cpu().numpy()
    if (len(image.shape) == 3) and (image.shape[0] != 3) and (image.shape[0] != 1):
        image = np.sqrt(np.sum(np.power(image, 2), axis=0, keepdims=True))
    value_range = np.max(image) - np.min(image)
    if value_range == 0:
        # constant input: avoid 0/0 -> NaN before the uint8 cast
        image = np.zeros_like(image)
    else:
        image = 255 * (image - np.min(image)) / value_range
    image = image.astype(np.uint8)
    if len(image.shape) == 3:
        # CHW -> HWC for image libraries; squeeze a single channel
        image = np.transpose(image, (1, 2, 0)).copy()
        if image.shape[2] == 1:
            image = image[:, :, 0]
    return image
244
+
245
+
246
def visualize_layout(image, table):
    """Draw every cell's segmentation polygon onto *image* in color (255, 0, 0)."""
    def draw(img, segmentation, color):
        for contour in segmentation:
            pts = np.array(contour, dtype=np.int32)
            img = cv2.polylines(img, [pts], True, color)
        return img

    for cell in table['cells']:
        if 'segmentation' in cell:
            image = draw(image, cell['segmentation'], (255, 0, 0))
    return image
256
+
257
# Markup tokens that carry no visible text of their own; is_blank/filt_content
# use this list to decide whether a transcript is effectively empty.
virtual_chars = ["<b>", "</b>", "<i>", "</i>", "<sup>", "</sup>", "<sub>", "</sub>", "<overline>", "</overline>", "<underline>", "</underline>", "<strike>", "</strike>"]
258
+
259
+
260
def is_blank(content):
    """True when *content* holds only whitespace and/or virtual markup tokens."""
    stripped = content
    for token in virtual_chars:
        stripped = stripped.replace(token, '')
    return stripped.strip() == ''
267
+
268
+
269
def filt_content(content, filt_blank=False, filt_virtual=False, filt_pad=False):
    """Normalize a cell transcript.

    Args:
        content: transcript string, possibly containing markup tokens.
        filt_blank: replace content by '' when it is blank aside from
            virtual markup tokens (see ``is_blank``).
        filt_virtual: strip the markup tokens listed in ``virtual_chars``.
        filt_pad: strip surrounding whitespace.

    Returns:
        The filtered content string.
    """
    global virtual_chars
    if filt_blank:
        if is_blank(content):
            content = ''

    if filt_virtual:
        # Bug fix: the original looped ``for item in content`` — over the
        # characters of the content itself — which replaced every character
        # with '' and erased the whole string. Strip the markup tokens instead.
        for item in virtual_chars:
            content = content.replace(item, '')

    if filt_pad:
        content = content.strip()

    return content
283
+
284
+
285
def filt_transcript(html, filt_blank=False, filt_virtual=False, filt_pad=False):
    """Apply ``filt_content`` to the text of every <td>...</td> cell in *html*.

    Cells are scanned left to right; after rewriting a cell's content the
    scan resumes just past that cell, compensating for any length change.
    """
    start_idx = 0
    while '<td' in html[start_idx:]:
        start_idx = html[start_idx:].index('<td') + start_idx
        # first '>' after '<td' closes the (possibly attributed) opening tag
        content_start_idx = html[start_idx:].index('>') + 1 + start_idx
        content_end_idx = html[content_start_idx:].index('</td>') + content_start_idx
        end_idx = content_end_idx + len('</td>')

        content = html[content_start_idx:content_end_idx]
        content = filt_content(content, filt_blank, filt_virtual, filt_pad)
        html = html[:content_start_idx] + content + html[content_end_idx:]
        # shift the resume point back by however much the cell shrank
        start_idx = end_idx - (content_end_idx-content_start_idx - len(content))
    return html
libs/utils/vocab.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class Vocab:
    """Tiny fixed vocabulary mapping structural tokens to integer ids."""

    # token order defines the id assignment
    key_words = [
        '</line>',
        '</none>', # []
        '</bold>', # ['<b>', ' ', '</b>']
        '</space>' # [' ']
    ]

    def __init__(self):
        self._words_ids_map = {word: idx for idx, word in enumerate(self.key_words)}
        self._ids_words_map = {idx: word for idx, word in enumerate(self.key_words)}

        self.line_id = self._words_ids_map['</line>']
        self.none_id = self._words_ids_map['</none>']
        self.bold_id = self._words_ids_map['</bold>']
        self.space_id = self._words_ids_map['</space>']
        # ids whose tokens render as (effectively) blank content
        self.blank_ids = [self.none_id, self.bold_id, self.space_id]

    def __len__(self):
        return len(self._words_ids_map)

    def word_to_id(self, word):
        """Return the id of *word*; raises KeyError for unknown tokens."""
        return self._words_ids_map[word]

    def words_to_ids(self, words):
        """Map a sequence of tokens to their ids."""
        return [self.word_to_id(word) for word in words]

    def id_to_word(self, word_id):
        """Return the token for *word_id*; raises KeyError for unknown ids."""
        return self._ids_words_map[word_id]

    def ids_to_words(self, words_id):
        """Map a sequence of ids back to their tokens."""
        return [self.id_to_word(word_id) for word_id in words_id]
+ return [self.id_to_word(word_id) for word_id in words_id]
requirements.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ addict==2.4.0
2
+ aliyun-python-sdk-core==2.16.0
3
+ aliyun-python-sdk-kms==2.16.5
4
+ anyio==4.10.0
5
+ apted==1.0.3
6
+ beautifulsoup4==4.12.2
7
+ blinker==1.4
8
+ certifi==2025.8.3
9
+ charset-normalizer==3.4.3
10
+ click==8.1.8
11
+ colorama==0.4.6
12
+ contourpy==1.3.0
13
+ crcmod==1.7
14
+ cryptography==3.4.8
15
+ cycler==0.12.1
16
+ dbus-python==1.2.18
17
+ Distance==0.1.3
18
+ distro==1.7.0
19
+ exceptiongroup==1.3.0
20
+ filelock==3.14.0
21
+ fonttools==4.59.2
22
+ h11==0.16.0
23
+ httpcore==1.0.9
24
+ httplib2==0.20.2
25
+ httpx==0.28.1
26
+ idna==3.10
27
+ importlib-metadata==4.6.4
28
+ importlib_resources==6.5.2
29
+ jeepney==0.7.1
30
+ jmespath==0.10.0
31
+ keyring==23.5.0
32
+ kiwisolver==1.4.7
33
+ launchpadlib==1.10.16
34
+ lazr.restfulclient==0.14.4
35
+ lazr.uri==1.0.6
36
+ lxml==4.9.2
37
+ Mako==1.1.3
38
+ Markdown==3.3.6
39
+ markdown-it-py==3.0.0
40
+ MarkupSafe==2.0.1
41
+ matplotlib==3.7.1
42
+ mdurl==0.1.2
43
+ mmcv-full==1.6.2
44
+ mmdet==2.28.2
45
+ model-index==0.1.11
46
+ more-itertools==8.10.0
47
+ numpy==1.26.4
48
+ oauthlib==3.2.0
49
+ opencv-python==4.7.0.72
50
+ opendatalab==0.0.10
51
+ openmim==0.3.9
52
+ openxlab==0.1.2
53
+ ordered-set==4.1.0
54
+ oss2==2.17.0
55
+ packaging==24.2
56
+ pandas==2.0.2
57
+ Pillow==10.0.0
58
+ platformdirs==4.4.0
59
+ polygon==1.1.0
60
+ Polygon3==3.0.9.1
61
+ pycocotools==2.0.10
62
+ pycryptodome==3.23.0
63
+ Pygments==2.19.2
64
+ PyGObject==3.42.1
65
+ PyJWT==2.3.0
66
+ pyparsing==3.2.3
67
+ python-apt==2.4.0+ubuntu4
68
+ python-dateutil==2.9.0.post0
69
+ pytz==2023.4
70
+ PyYAML==6.0.2
71
+ requests==2.28.2
72
+ rich==13.4.2
73
+ scipy==1.13.1
74
+ seaborn==0.12.2
75
+ SecretStorage==3.3.1
76
+ shapely==2.0.1
77
+ six==1.17.0
78
+ sniffio==1.3.1
79
+ soupsieve==2.8
80
+ tabulate==0.9.0
81
+ terminaltables==3.1.10
82
+ tomli==2.2.1
83
+ torch==1.12.0
84
+ torchvision==0.13.0
85
+ tqdm==4.65.0
86
+ typing_extensions==4.15.0
87
+ tzdata==2025.2
88
+ urllib3==1.26.20
89
+ wadllib==1.3.6
90
+ websocket-client==1.8.0
91
+ websockets==15.0.1
92
+ yapf==0.43.0
93
+ zipp==3.23.0
runner/train.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import torch
3
+ import tqdm
4
+ import json
5
+ import os
6
+ import sys
7
+ sys.path.append('./')
8
+ sys.path.append('../')
9
+ import numpy as np
10
+ from torch.optim.lr_scheduler import CosineAnnealingLR
11
+ from collections import defaultdict
12
+ from libs.utils.cal_f1 import pred_result_to_table, table_to_relations, evaluate_f1
13
+ from libs.utils.comm import distributed, synchronize
14
+ from libs.utils.checkpoint import load_checkpoint, save_checkpoint
15
+ from libs.data import create_train_dataloader, create_valid_dataloader
16
+ from libs.utils.model_synchronizer import ModelSynchronizer
17
+ from libs.utils.time_counter import TimeCounter
18
+ from libs.utils.utils import is_simple_table
19
+ from libs.utils.utils import cal_mean_lr
20
+ from libs.utils.counter import Counter
21
+ from libs.utils import logger
22
+ from libs.model import build_model
23
+ from libs.configs import cfg, setup_config
24
+
25
+
26
+ metrics_name = ['f1']
27
+ best_metrics = [0.0]
28
+
29
+
30
+ def init():
31
+ import argparse
32
+ parser = argparse.ArgumentParser()
33
+ parser.add_argument('--cfg', type=str, default='debug')
34
+ parser.add_argument('--local_rank', type=int, default=0)
35
+ args = parser.parse_args()
36
+ setup_config(args.cfg)
37
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
38
+ num_gpus = int(os.environ['MORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
39
+ distributed = num_gpus > 1
40
+ if distributed:
41
+ torch.cuda.set_device(args.local_rank)
42
+ torch.distributed.init_process_group(backend='nccl', init_method='env://')
43
+ synchronize()
44
+ logger.setup_logger('Line Detect Model', cfg.work_dir, 'train.log')
45
+ logger.info('Use config:%s' % args.cfg)
46
+
47
+
48
+ def train(cfg, epoch, dataloader, model, optimizer, scheduler, time_counter, synchronizer=None):
49
+ model.train()
50
+ counter = Counter(cache_nums=1000)
51
+ for it, data_batch in enumerate(dataloader):
52
+ ids = data_batch['ids']
53
+ images_size = data_batch['images_size']
54
+ images = data_batch['images'].to(cfg.device)
55
+ cls_labels = data_batch['cls_labels'].to(cfg.device)
56
+ labels_mask = data_batch['labels_mask'].to(cfg.device)
57
+ rows_fg_spans = data_batch['rows_fg_spans']
58
+ rows_bg_spans = data_batch['rows_bg_spans']
59
+ cols_fg_spans = data_batch['cols_fg_spans']
60
+ cols_bg_spans = data_batch['cols_bg_spans']
61
+ cells_spans = data_batch['cells_spans']
62
+ divide_labels = data_batch['divide_labels'].to(cfg.device)
63
+ layouts = data_batch['layouts'].to(cfg.device)
64
+
65
+ try:
66
+ optimizer.zero_grad()
67
+ pred_result, result_info = model(
68
+ images, images_size,
69
+ cls_labels, labels_mask, layouts,
70
+ rows_fg_spans, rows_bg_spans,
71
+ cols_fg_spans, cols_bg_spans,
72
+ cells_spans, divide_labels,
73
+ )
74
+ loss = sum([val for key, val in result_info.items() if 'loss' in key])
75
+ loss.backward()
76
+ optimizer.step()
77
+ scheduler.step()
78
+ counter.update(result_info)
79
+ except:
80
+ logger.info('CUDA Out Of Memory')
81
+
82
+ if it % cfg.log_sep == 0:
83
+ logger.info(
84
+ '[Train][Epoch %03d Iter %04d][Memory: %.0f ][Mean LR: %f ][Left: %s] %s' %
85
+ (
86
+ epoch,
87
+ it,
88
+ torch.cuda.max_memory_allocated()/1024/1024,
89
+ cal_mean_lr(optimizer),
90
+ time_counter.step(epoch, it + 1),
91
+ counter.format_mean(sync=False)
92
+ )
93
+ )
94
+
95
+ if synchronizer is not None:
96
+ synchronizer()
97
+ if synchronizer is not None:
98
+ synchronizer(final_align=True)
99
+
100
+
101
+ def valid(cfg, dataloader, model):
102
+ model.eval()
103
+ total_label_relations = list()
104
+ total_pred_relations = list()
105
+ total_relations_metric = list()
106
+
107
+ for it, data_batch in enumerate(tqdm.tqdm(dataloader)):
108
+ ids = data_batch['ids']
109
+ images_size = data_batch['images_size']
110
+ images = data_batch['images'].to(cfg.device)
111
+ tables = data_batch['tables']
112
+ pred_result, _ = model(images, images_size)
113
+ pred_tables = [
114
+ pred_result_to_table(tables[batch_idx],
115
+ (pred_result[0][batch_idx], pred_result[1][batch_idx],
116
+ pred_result[2][batch_idx], pred_result[3][batch_idx])
117
+ )
118
+ for batch_idx in range(len(ids))
119
+ ]
120
+ pred_relations = [table_to_relations(table) for table in pred_tables]
121
+ total_pred_relations.extend(pred_relations)
122
+ # label
123
+ label_relations = []
124
+ for table in tables:
125
+ label_path = os.path.join(cfg.valid_data_dir, table['label_path'])
126
+ with open(table['label_path'], 'r') as f:
127
+ label_relations.append(json.load(f))
128
+ total_label_relations.extend(label_relations)
129
+
130
+ # cal P, R, F1
131
+ total_relations_metric = evaluate_f1(total_label_relations, total_pred_relations, num_workers=40)
132
+ P, R, F1 = np.array(total_relations_metric).mean(0).tolist()
133
+ F1 = 2 * P * R / (P + R)
134
+ logger.info('[Valid] Total Type Mertric: Precision: %s, Recall: %s, F1-Score: %s' % (P, R, F1))
135
+ return (F1,)
136
+
137
+
138
+ def build_optimizer(cfg, model):
139
+ params = list()
140
+ for _, value in model.named_parameters():
141
+ if not value.requires_grad:
142
+ continue
143
+ lr = cfg.base_lr
144
+ weight_decay = cfg.weight_decay
145
+ params += [{'params': [value], 'lr': lr, 'weight_decay': weight_decay}]
146
+ optimizer = torch.optim.Adam(params, cfg.base_lr)
147
+ return optimizer
148
+
149
+
150
+ def build_scheduler(cfg, optimizer, epoch_iters, start_epoch=0):
151
+ scheduler = CosineAnnealingLR(
152
+ optimizer=optimizer,
153
+ T_max=cfg.num_epochs * epoch_iters,
154
+ eta_min=cfg.min_lr,
155
+ last_epoch=-1 if start_epoch == 0 else start_epoch * epoch_iters
156
+ )
157
+ return scheduler
158
+
159
+
160
+ def main():
161
+ init()
162
+
163
+ train_dataloader = create_train_dataloader(
164
+ cfg.vocab,
165
+ cfg.train_lrcs_path,
166
+ cfg.train_num_workers,
167
+ cfg.train_max_batch_size,
168
+ cfg.train_max_pixel_nums,
169
+ cfg.train_bucket_seps,
170
+ cfg.train_data_dir
171
+ )
172
+
173
+ logger.info(
174
+ 'Train dataset have %d samples, %d batchs' %
175
+ (
176
+ len(train_dataloader.dataset),
177
+ len(train_dataloader.batch_sampler)
178
+ )
179
+ )
180
+
181
+ valid_dataloader = create_valid_dataloader(
182
+ cfg.vocab,
183
+ cfg.valid_lrc_path,
184
+ cfg.valid_num_workers,
185
+ cfg.valid_batch_size,
186
+ cfg.valid_data_dir
187
+ )
188
+
189
+ logger.info(
190
+ 'Valid dataset have %d samples, %d batchs with batch_size=%d' %
191
+ (
192
+ len(valid_dataloader.dataset),
193
+ len(valid_dataloader.batch_sampler),
194
+ valid_dataloader.batch_size
195
+ )
196
+ )
197
+
198
+ model = build_model(cfg)
199
+ model.cuda()
200
+
201
+ if distributed():
202
+ synchronizer = ModelSynchronizer(model, cfg.sync_rate)
203
+ else:
204
+ synchronizer = None
205
+
206
+ epoch_iters = len(train_dataloader.batch_sampler)
207
+ optimizer = build_optimizer(cfg, model)
208
+
209
+ global metrics_name
210
+ global best_metrics
211
+ start_epoch = 0
212
+
213
+ resume_path = os.path.join(cfg.work_dir, 'latest_model.pth')
214
+ if os.path.exists(resume_path):
215
+ best_metrics, start_epoch = load_checkpoint(resume_path, model, optimizer)
216
+ start_epoch += 1
217
+ logger.info('resume from: %s' % resume_path)
218
+ elif cfg.train_checkpoint is not None:
219
+ load_checkpoint(cfg.train_checkpoint, model)
220
+ logger.info('load checkpoint from: %s' % cfg.train_checkpoint)
221
+
222
+ scheduler = build_scheduler(cfg, optimizer, epoch_iters, start_epoch)
223
+
224
+ time_counter = TimeCounter(start_epoch, cfg.num_epochs, epoch_iters)
225
+ time_counter.reset()
226
+
227
+ for epoch in range(start_epoch, cfg.num_epochs):
228
+ if hasattr(train_dataloader.sampler, 'set_epoch'):
229
+ train_dataloader.sampler.set_epoch(epoch)
230
+ train(cfg, epoch, train_dataloader, model, optimizer, scheduler, time_counter, synchronizer)
231
+
232
+ with torch.no_grad():
233
+ metrics = valid(cfg, valid_dataloader, model)
234
+
235
+ for metric_idx in range(len(metrics_name)):
236
+ if metrics[metric_idx] > best_metrics[metric_idx]:
237
+ best_metrics[metric_idx] = metrics[metric_idx]
238
+ save_checkpoint(os.path.join(cfg.work_dir, 'best_%s_model.pth' % metrics_name[metric_idx]), model, optimizer, best_metrics, epoch)
239
+ logger.info('Save current model as best_%s_model' % metrics_name[metric_idx])
240
+
241
+ save_checkpoint(os.path.join(cfg.work_dir, 'latest_model.pth'), model, optimizer, best_metrics, epoch)
242
+
243
+
244
+ if __name__ == '__main__':
245
+ main()
runner/valid.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import json
3
+ sys.path.append('./')
4
+ sys.path.append('../')
5
+ import os
6
+ import tqdm
7
+ import torch
8
+ import numpy as np
9
+ from libs.configs import cfg, setup_config
10
+ from libs.model import build_model
11
+ from libs.data import create_valid_dataloader
12
+ from libs.utils import logger
13
+ from libs.utils.cal_f1 import pred_result_to_table, table_to_relations, evaluate_f1
14
+ from libs.utils.checkpoint import load_checkpoint
15
+ from libs.utils.comm import synchronize, all_gather
16
+
17
+
18
def init():
    """Parse CLI args, load the named config (optionally overriding the lrc
    dataset path), initialize distributed state when WORLD_SIZE > 1, and set
    up the validation logger."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--lrc", type=str, default=None)
    parser.add_argument("--cfg", type=str, default='default')
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    setup_config(args.cfg)
    if args.lrc is not None:
        # command-line dataset path overrides the config's default
        cfg.valid_lrc_path = args.lrc

    os.environ['LOCAL_RANK'] = str(args.local_rank)

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    # NOTE(review): this local flag shadows libs.utils.comm.distributed within
    # this function — intentional here, but easy to trip over.
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    logger.setup_logger('Line Detect Model', cfg.work_dir, 'valid.log')
    logger.info('Use config: %s' % args.cfg)
    logger.info('Evaluate Dataset: %s' % cfg.valid_lrc_path)
45
+
46
+
47
def valid(cfg, dataloader, model):
    """Evaluate *model* on *dataloader* and return (F1,).

    Predictions are converted to table structures and compared against the
    ground-truth relation JSON files; precision/recall are macro-averaged
    over samples and F1 is derived from those averages.
    """
    model.eval()
    total_label_relations = list()
    total_pred_relations = list()
    total_relations_metric = list()

    for it, data_batch in enumerate(tqdm.tqdm(dataloader)):
        ids = data_batch['ids']
        images_size = data_batch['images_size']
        images = data_batch['images'].to(cfg.device)
        tables = data_batch['tables']

        pred_result, _ = model(images, images_size)

        # pred
        pred_tables = [
            pred_result_to_table(tables[batch_idx],
                (pred_result[0][batch_idx], pred_result[1][batch_idx], \
                pred_result[2][batch_idx], pred_result[3][batch_idx])
            ) \
            for batch_idx in range(len(ids))
        ]
        pred_relations = [table_to_relations(table) for table in pred_tables]
        total_pred_relations.extend(pred_relations)
        # label
        # NOTE(review): label_path is opened as-is here, while runner/train.py
        # joins it with cfg.valid_data_dir — confirm both call sites receive
        # the same path layout.
        label_relations = []
        for table in tables:
            with open(table['label_path'], 'r') as f:
                label_relations.append(json.load(f))
        total_label_relations.extend(label_relations)

    # cal P, R, F1
    total_relations_metric = evaluate_f1(total_label_relations, total_pred_relations, num_workers=40)
    P, R, F1 = np.array(total_relations_metric).mean(0).tolist()
    # F1 recomputed from macro-averaged P and R, not the mean of per-sample F1s
    F1 = 2 * P * R / (P + R)
    logger.info('[Valid] Total Type Mertric: Precision: %s, Recall: %s, F1-Score: %s' % (P, R, F1))

    return (F1, )
85
+
86
+
87
def main():
    """Validation entry point: parse args/config, build the valid dataloader
    and model, load ``cfg.eval_checkpoint``, and run one evaluation pass."""
    init()

    valid_dataloader = create_valid_dataloader(
        cfg.vocab,
        cfg.valid_lrc_path,
        cfg.valid_num_workers,
        cfg.valid_batch_size
    )
    logger.info(
        'Valid dataset have %d samples, %d batchs with batch_size=%d' % \
        (
            len(valid_dataloader.dataset),
            len(valid_dataloader.batch_sampler),
            valid_dataloader.batch_size
        )
    )

    model = build_model(cfg)
    model.cuda()

    load_checkpoint(cfg.eval_checkpoint, model)
    logger.info('Load checkpoint from: %s' % cfg.eval_checkpoint)

    # inference only: no gradients needed
    with torch.no_grad():
        valid(cfg, valid_dataloader, model)


if __name__ == '__main__':
    main()