# Author: Parag Mali # This file contains functions to remove rectangles # that are inside other rectangles import sys sys.path.extend(['/home/psm2208/code', '/home/psm2208/code']) import cv2 import os import csv import numpy as np import utils.visualize as visualize from multiprocessing import Pool from cv2.dnn import NMSBoxes from scipy.ndimage.measurements import label import scipy.ndimage as ndimage import copy import shutil from gtdb import box_utils def remove(args): try: output_dir, pdf_name, page_num, page_math = args valid = [True] * page_math.shape[0] for i, m1 in enumerate(page_math): for j, m2 in enumerate(page_math): if i!=j and box_utils.check_inside(m1, m2): valid[i] = False break final_math = page_math[valid] math_file = open(os.path.join(output_dir, pdf_name + ".csv"), 'a') writer = csv.writer(math_file, delimiter=",") for math_region in final_math: math_region = math_region.tolist() math_region.insert(0, page_num) writer.writerow(math_region) print("Saved ", os.path.join(output_dir, pdf_name + ".csv"), " > ", page_num) print('Before ', len(page_math), '--> after ', len(final_math)) except: print("Exception while processing ", pdf_name, " ", page_num, " ", sys.exc_info()[0]) def remove_rect(filename, math_dir, output_dir): if os.path.exists(output_dir): shutil.rmtree(output_dir) if not os.path.exists(output_dir): os.mkdir(output_dir) pages_list = [] pdf_names = open(filename, 'r') for pdf_name in pdf_names: print('Processing-1', pdf_name) pdf_name = pdf_name.strip() if pdf_name != '': math_file = os.path.join(math_dir, pdf_name + ".csv") math_regions = np.genfromtxt(math_file, delimiter=',', dtype=float) pages = np.unique(math_regions[:, 0]) for page_num in pages: page_math = math_regions[np.where(math_regions[:,0]==page_num)] page_math = page_math[:,1:] pages_list.append([output_dir, pdf_name, page_num, page_math]) pdf_names.close() pool = Pool(processes=4) pool.map(remove, pages_list) pool.close() pool.join() if __name__ == "__main__": home_data = "/home/psm2208/data/GTDB/" home_eval = "/home/psm2208/code/eval/" home_images = "/home/psm2208/data/GTDB/images/" home_anno = "/home/psm2208/data/GTDB/annotations/" math_dir = "/home/psm2208/code/eval/tt_samsung" #"/home/psm2208/data/GTDB/relations_train_adjust_csv" output_dir = "/home/psm2208/code/eval/tt_samsung_removed" #"/home/psm2208/data/GTDB/relations_train_adjust_csv_removed" type = sys.argv[1] remove_rect(home_data + type, math_dir, output_dir)