# File: math2tex/ScanSSD/gtdb/split_annotations_per_page.py
# Repository page metadata: duycse1603, "[Add] source", 6163604
# Author: Parag Mali
# This script generates page level annotations from the PDF level annotations
# provided in the dataset
import sys
import os
from multiprocessing import Pool
import csv
import cv2
def split(args):
    """Split one PDF-level annotation file into per-page annotation files.

    Args:
        args: A ``(gt_dir, pdf_name, out_dir, ext)`` tuple, packed into a
            single argument so the function can be passed to ``Pool.map``.
            The input is read from ``<gt_dir>/<pdf_name>.<ext>``.
            ``ext`` selects the format: ``"math"`` or ``"char"``; any other
            value results in no work being done.

    The first comma-separated field of every input row is the page key.
    Rows sharing a key are written, in input order, to
    ``<out_dir>/<pdf_name>/<key + 1>.p<ext>``. The directory
    ``<out_dir>/<pdf_name>`` must already exist.

    For ``"math"`` rows the page key is dropped and the next four fields
    (xmin, ymin, xmax, ymax) are converted to floats; any further fields are
    copied unchanged. For ``"char"`` rows the whole row, including the page
    key, is copied unchanged.

    Raises:
        ValueError: If a page key or a math box coordinate is not numeric.
        IndexError: If a math row has fewer than four coordinates.
    """
    gt_dir, pdf_name, out_dir, ext = args
    file_path = os.path.join(gt_dir, pdf_name + "." + ext)
    page_dir = os.path.join(out_dir, pdf_name)

    def page_file(key):
        # Go through float() so keys such as "3.0" are accepted for both
        # formats; output files are named key + 1.
        return os.path.join(page_dir, str(int(float(key)) + 1)) + ".p" + ext

    # page key -> list of rows belonging to that page (insertion ordered)
    rows_by_page = {}

    if ext == "math":
        with open(file_path, "r") as file_ip:
            for line in file_ip:
                line = line.strip()
                if not line:
                    # tolerate blank / trailing empty lines
                    continue
                entries = line.split(",")
                rows_by_page.setdefault(entries[0], []).append(entries[1:])

        # NOTE: coordinates are written unscaled. Earlier revisions loaded the
        # page image from a hard-coded directory to compute scale ratios, but
        # both ratios were fixed at 1, so the image is no longer read.
        for key, boxes in rows_by_page.items():
            with open(page_file(key), "w") as file_op:
                for box in boxes:
                    # xmin, ymin, xmax, ymax
                    for i in range(4):
                        box[i] = float(box[i])
                    file_op.write(",".join(str(e) for e in box) + "\n")

    elif ext == "char":
        # newline="" is what the csv module expects for both reading and
        # writing, so that line endings are handled by csv itself.
        with open(file_path, "r", newline="") as csvfile:
            for row in csv.reader(csvfile, delimiter=","):
                if not row:
                    # csv.reader yields [] for blank lines
                    continue
                rows_by_page.setdefault(row[0], []).append(row)

        for key, rows in rows_by_page.items():
            with open(page_file(key), "w", newline="") as csvfile:
                csv.writer(csvfile, delimiter=",").writerows(rows)
def test():
    """Command-line driver: split annotations for every listed PDF in parallel.

    Reads four positional arguments from ``sys.argv``:

    1. path to a text file with one PDF name per line,
    2. output directory,
    3. ground-truth directory holding ``<pdf_name>.<ext>`` files,
    4. annotation file extension passed through to ``split``.

    A ``<out_dir>/<pdf_name>`` directory is created for every non-blank name,
    then ``split`` is run for each name in a process pool.
    """
    if len(sys.argv) < 5:
        sys.exit(
            "usage: " + sys.argv[0] + " <pdf_names_file> <out_dir> <gt_dir> <ext>"
        )

    filename = sys.argv[1]  # file names to be processed
    out_dir = sys.argv[2]   # output dir
    gt_dir = sys.argv[3]    # gt dir
    ext = sys.argv[4]       # file extension

    pdf_names_list = []
    with open(filename, "r") as pdf_names:
        for pdf_name in pdf_names:
            pdf_name = pdf_name.strip()
            if not pdf_name:
                # skip blank lines before touching the file system
                continue
            # makedirs also creates out_dir itself and tolerates existing dirs
            os.makedirs(os.path.join(out_dir, pdf_name), exist_ok=True)
            pdf_names_list.append((gt_dir, pdf_name, out_dir, ext))

    if not pdf_names_list:
        return

    # Never start more workers than there are files to process (cap of 32).
    with Pool(processes=min(32, len(pdf_names_list))) as pool:
        pool.map(split, pdf_names_list)
        pool.close()
        pool.join()
# Run the command-line driver only when executed as a script, not on import.
if __name__ == "__main__":
    test()