Spaces:

duycse1603
/

math2tex

Runtime error

App Files Files Community

math2tex / ScanSSD /gtdb /split_annotations_per_page.py

duycse1603

[Add] source

6163604 over 2 years ago

raw

history blame contribute delete

2.98 kB

	# Author: Parag Mali
	# This script generates page level annotations from the PDF level annotations
	# provided in the dataset

	import sys
	import os
	from multiprocessing import Pool
	import csv
	import cv2

	def split(args):

	gt_dir, pdf_name, out_dir, ext = args

	file_path = os.path.join(gt_dir, pdf_name + "." + ext)
	img_dir = '/home/psm2208/data/GTDB/images/'

	# create a map of page to list of math boxes
	map = {}

	if ext == "math":

	file_ip = open(file_path, "r")
	for line in file_ip:
	entries = line.strip().split(",")

	# if entry is not in map
	if entries[0] not in map:
	map[entries[0]] = []

	map[entries[0]].append(entries[1:])

	for key in map:

	boxes = map[key]
	key = float(key)
	img_file = os.path.join(img_dir, pdf_name, str(int(key) + 1) + ".png")
	img = cv2.imread(img_file)

	height, width, channels = img.shape

	#width_ratio = 512 / width
	#height_ratio = 512 / height

	width_ratio = 1
	height_ratio = 1

	# create processed math file
	file_op = open(os.path.join(out_dir, pdf_name, str(int(key) + 1)) + ".p" + ext, "w")

	for box in boxes:
	# xmin, ymin, xmax, ymax

	box[0] = float(box[0]) * width_ratio
	box[1] = float(box[1]) * height_ratio
	box[2] = float(box[2]) * width_ratio
	box[3] = float(box[3]) * height_ratio

	file_op.write(','.join(str(e) for e in box) + "\n")

	file_op.close()
	file_ip.close()

	elif ext == "char":
	with open(file_path, 'r') as csvfile:
	reader = csv.reader(csvfile, delimiter=',')
	for row in reader:
	# if entry is not in map
	if row[0] not in map:
	map[row[0]] = []

	map[row[0]].append(row)

	for key in map:

	boxes = map[key]

	with open(os.path.join(out_dir, pdf_name, str(int(key) + 1)) + ".p" + ext, "w") as csvfile:
	writer = csv.writer(csvfile, delimiter=',')

	for box in boxes:
	writer.writerow(box)

	def test():

	filename = sys.argv[1] # file names to be processed
	out_dir = sys.argv[2] # output dir
	gt_dir = sys.argv[3] # gt dir
	ext = sys.argv[4] # file extension

	pdf_names_list = []
	pdf_names = open(filename, 'r')

	for pdf_name in pdf_names:
	pdf_name = pdf_name.strip()

	if not os.path.exists(os.path.join(out_dir, pdf_name)):
	os.mkdir(os.path.join(out_dir, pdf_name))

	if pdf_name != '':
	pdf_names_list.append((gt_dir, pdf_name, out_dir, ext))

	pdf_names.close()

	pool = Pool(processes=32)
	pool.map(split, pdf_names_list)
	pool.close()
	pool.join()


	if __name__ == "__main__":
	test()