File size: 2,980 Bytes
6163604
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Author: Parag Mali
# This script generates page level annotations from the PDF level annotations
# provided in the dataset

import sys
import os
from multiprocessing import Pool
import csv
import cv2

def _page_file(out_dir, pdf_name, page_key, ext):
    """Return the per-page output path for ``page_key`` of ``pdf_name``.

    ``page_key`` is the page identifier taken from the first column of the
    PDF-level annotation file. The output file is named after that value
    plus one and gets the extension ``.p<ext>``.
    """
    # int(float(...)) accepts both "3" and "3.0" style identifiers.
    page_number = int(float(page_key)) + 1
    return os.path.join(out_dir, pdf_name, str(page_number)) + ".p" + ext


def split(args):
    """Split one PDF-level annotation file into per-page annotation files.

    Args:
        args: A ``(gt_dir, pdf_name, out_dir, ext)`` tuple, packed into one
            argument so the function can be handed to ``Pool.map``. The
            input is read from ``<gt_dir>/<pdf_name>.<ext>``.

    For ``ext == "math"`` each input line is ``page,xmin,ymin,xmax,ymax[,...]``;
    the page column is dropped and the four coordinates are written as floats.
    For ``ext == "char"`` the input is CSV and every row is copied unchanged
    (page column included). Any other ``ext`` does nothing.

    Output goes to ``<out_dir>/<pdf_name>/<page + 1>.p<ext>``; that directory
    must already exist. Blank input lines are ignored.

    Raises:
        OSError: If the input file or an output file cannot be opened.
        ValueError: If a page identifier or a math coordinate is not numeric.
        IndexError: If a math row has fewer than four coordinates.
    """
    gt_dir, pdf_name, out_dir, ext = args
    file_path = os.path.join(gt_dir, pdf_name + "." + ext)

    # Maps page identifier -> rows for that page, in input order.
    pages = {}

    if ext == "math":
        with open(file_path, "r") as file_ip:
            for line in file_ip:
                line = line.strip()
                if not line:
                    continue
                entries = line.split(",")
                pages.setdefault(entries[0], []).append(entries[1:])

        # Coordinates are written unscaled, so the page images are not needed.
        for page_key, boxes in pages.items():
            with open(_page_file(out_dir, pdf_name, page_key, ext), "w") as file_op:
                for box in boxes:
                    # xmin, ymin, xmax, ymax; any extra columns pass through as-is.
                    for i in range(4):
                        box[i] = float(box[i])
                    file_op.write(",".join(str(e) for e in box) + "\n")

    elif ext == "char":
        # newline="" lets the csv module handle line endings itself.
        with open(file_path, "r", newline="") as csvfile:
            for row in csv.reader(csvfile, delimiter=","):
                if not row:
                    continue
                pages.setdefault(row[0], []).append(row)

        for page_key, rows in pages.items():
            with open(_page_file(out_dir, pdf_name, page_key, ext), "w", newline="") as csvfile:
                csv.writer(csvfile, delimiter=",").writerows(rows)

def test():
    """Split the annotations of every listed PDF into per-page files.

    Positional command-line arguments:
        1. Path to a text file with one PDF name per line.
        2. Output directory; a sub-directory per PDF is created inside it.
        3. Directory holding the PDF-level annotation files.
        4. Annotation file extension, passed through to ``split``.

    Each non-blank PDF name becomes one ``split`` job; jobs run in a process
    pool of at most 32 workers.
    """
    if len(sys.argv) < 5:
        sys.exit("usage: " + sys.argv[0] + " <pdf_names_file> <out_dir> <gt_dir> <ext>")

    filename, out_dir, gt_dir, ext = sys.argv[1:5]

    pdf_names_list = []
    with open(filename, 'r') as pdf_names:
        for pdf_name in pdf_names:
            pdf_name = pdf_name.strip()
            if not pdf_name:
                continue

            # makedirs also creates out_dir itself and tolerates existing dirs.
            os.makedirs(os.path.join(out_dir, pdf_name), exist_ok=True)
            pdf_names_list.append((gt_dir, pdf_name, out_dir, ext))

    # Nothing to do: avoid spawning worker processes for an empty job list.
    if not pdf_names_list:
        return

    pool = Pool(processes=min(32, len(pdf_names_list)))
    try:
        pool.map(split, pdf_names_list)
    finally:
        # Always release the workers, even if a job raised.
        pool.close()
        pool.join()


# Script entry point: run the split only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    test()