Kushalguptaiitb committed on
Commit
0538136
·
verified ·
1 Parent(s): dedcc8c

Upload post_processing_v2 (1).py

Browse files
Files changed (1) hide show
  1. post_processing_v2 (1).py +410 -0
post_processing_v2 (1).py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+
4
def read_json(json_file):
    """Open ``json_file`` as UTF-8 text and return its parsed JSON content."""
    with open(json_file, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
7
+
8
def adjust_page_dimensions_and_bbox(modified_model_output_json, pdfminer_json):
    """Rescale every model block to the page size reported by pdfminer.

    For each page key of ``modified_model_output_json`` that also exists in
    ``pdfminer_json`` with at least one entry, the first pdfminer entry's
    ``page_width`` / ``page_height`` become the block's ``page_img_width`` /
    ``page_img_height`` and the first four ``bbox`` values are scaled
    accordingly.  Pages missing from ``pdfminer_json`` are left untouched;
    pages present but empty are reported on stdout.

    The blocks are modified in place and the same mapping is returned.
    """
    for page_key, page_blocks in modified_model_output_json.items():
        if page_key not in pdfminer_json:
            continue

        miner_entries = pdfminer_json[page_key]
        if not miner_entries:
            print(f"Page {page_key} is empty.")
            continue

        first_entry = miner_entries[0]
        target_width = first_entry['page_width']
        target_height = first_entry['page_height']

        for blk in page_blocks:
            # Ratios between the pdfminer page size and the model image size.
            x_ratio = target_width / blk['page_img_width']
            y_ratio = target_height / blk['page_img_height']
            blk['page_img_width'] = target_width
            blk['page_img_height'] = target_height
            old_box = blk['bbox']
            blk['bbox'] = [
                old_box[0] * x_ratio,
                old_box[1] * y_ratio,
                old_box[2] * x_ratio,
                old_box[3] * y_ratio,
            ]
    return modified_model_output_json
34
+
35
def convert_to_dataframe(extracted_df):
    """Coerce ``extracted_df`` into a pandas DataFrame.

    * A DataFrame is returned unchanged (same object).
    * A dict whose values are all lists is treated as column data; any other
      dict becomes a single-row frame.
    * A list made only of dicts becomes one row per dict; any other list is
      placed in a single ``'Value'`` column.
    * Anything else becomes a one-row frame with a ``'Value'`` column.
    """
    if isinstance(extracted_df, pd.DataFrame):
        return extracted_df

    if isinstance(extracted_df, dict):
        columnar = all(isinstance(column, list) for column in extracted_df.values())
        return pd.DataFrame(extracted_df if columnar else [extracted_df])

    if isinstance(extracted_df, list):
        if any(not isinstance(row, dict) for row in extracted_df):
            return pd.DataFrame(extracted_df, columns=['Value'])
        return pd.DataFrame(extracted_df)

    return pd.DataFrame([extracted_df], columns=['Value'])
53
+
54
def calculate_centroid(bbox):
    """Return the ``(x, y)`` midpoint of a four-value ``(x1, y1, x2, y2)`` box."""
    left, top, right, bottom = bbox
    return ((left + right) / 2, (top + bottom) / 2)
59
+
60
def is_within_radius(text_block_bbox, header_bbox, radius=50):
    """Return True when the two boxes overlap with a strictly positive area.

    Both boxes are ``(xmin, ymin, xmax, ymax)`` sequences.  Despite the
    function name, ``radius`` is accepted but never consulted: only box
    intersection is tested, and boxes that merely touch do not count.
    """
    t_left, t_top, t_right, t_bottom = text_block_bbox
    h_left, h_top, h_right, h_bottom = header_bbox

    # Extent of the intersection along each axis (non-positive => no overlap).
    shared_width = min(t_right, h_right) - max(t_left, h_left)
    shared_height = min(t_bottom, h_bottom) - max(t_top, h_top)

    return bool(shared_width > 0 and shared_height > 0)
73
+
74
def is_overlapped(text_block_bbox, header_bbox, threshold=0.20):
    """Return True when two boxes overlap by more than ``threshold``.

    The overlap ratio is the intersection area divided by the area of the
    smaller of the two boxes.

    Args:
        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
        header_bbox: ``(xmin, ymin, xmax, ymax)`` of the header block.
        threshold: Ratio that must be strictly exceeded.

    Returns:
        bool: True if the ratio is strictly greater than ``threshold``.
        False is also returned when the smaller box has no positive area
        (zero width/height or inverted coordinates), since a ratio cannot
        be computed for it.
    """
    text_xmin, text_ymin, text_xmax, text_ymax = text_block_bbox
    header_xmin, header_ymin, header_xmax, header_ymax = header_bbox

    # Intersection extent along each axis, clamped at zero for disjoint boxes.
    overlap_x = max(0, min(text_xmax, header_xmax) - max(text_xmin, header_xmin))
    overlap_y = max(0, min(text_ymax, header_ymax) - max(text_ymin, header_ymin))
    overlap_area = overlap_x * overlap_y

    text_area = (text_xmax - text_xmin) * (text_ymax - text_ymin)
    header_area = (header_xmax - header_xmin) * (header_ymax - header_ymin)
    smaller_area = min(text_area, header_area)

    # Degenerate boxes previously triggered ZeroDivisionError below.
    if smaller_area <= 0:
        return False

    return overlap_area / smaller_area > threshold
99
+
100
def detect_header(text_block_bbox, adjusted_model_output_json, page_number, next_header_index_in_model_udop):
    """Check whether a text block overlaps a specific header block.

    Args:
        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
        adjusted_model_output_json: Mapping of page-number strings to lists
            of model blocks, each carrying a ``'bbox'``.
        page_number: Page holding the header; converted with ``str`` for
            the lookup.
        next_header_index_in_model_udop: Position of the header within that
            page's block list, or ``None`` when there is no header to test.

    Returns:
        bool: Result of ``is_overlapped`` for the text block and the header
        block; False when the page is absent or the index is ``None``.

    Raises:
        IndexError: If the index is outside the page's block list.
    """
    page_key = str(page_number)
    if page_key not in adjusted_model_output_json:
        return False
    if next_header_index_in_model_udop is None:
        return False

    header_index = int(next_header_index_in_model_udop)
    header_block = adjusted_model_output_json[page_key][header_index]
    return is_overlapped(text_block_bbox, header_block['bbox'])
109
+
110
def remove_header_from_start(first_row_text: str, first_row_header_text: str) -> str:
    """Drop as many leading characters as the header has, then strip whitespace.

    The cut is purely length-based: the text is not checked for actually
    beginning with ``first_row_header_text``.
    """
    prefix_length = len(first_row_header_text)
    remainder = first_row_text[prefix_length:]
    return remainder.strip()
113
+
114
def extract_last_header_index(all_blocks_with_indices):
    """Return the position of the last header block in the list, or -1.

    A block counts as a header when its ``'label_name'`` is
    ``'Page-header'`` or ``'Section-header'``.  The list is scanned from the
    end so the search stops at the first hit.
    """
    header_labels = ['Page-header', 'Section-header']
    position = len(all_blocks_with_indices) - 1
    while position >= 0:
        if all_blocks_with_indices[position]['label_name'] in header_labels:
            return position
        position -= 1
    return -1
127
+
128
def _build_header_records(header_block, header_text, content_parts,
                          table_content, tree_content, source_pages):
    """Build the flat and the tree-format output records for one header block."""
    joined_content = " ".join(content_parts)
    flat_record = {
        "page_number": header_block['pdf_page_id'],
        "pdf_name": header_block['pdf_name'],
        "header": header_text,
        "label_name": header_block['label_name'],
        "content": joined_content,
        "table_content": table_content,
        "all_source_pages": source_pages,
    }
    tree_record = {
        "header_page_number": header_block['pdf_page_id'],
        "label_name": header_block['label_name'],
        'page_block_id': header_block['page_block_id'],
        "header_bbox": header_block['bbox'],
        "header_page_width": header_block['page_img_width'],
        "header_page_height": header_block['page_img_height'],
        "header": header_text,
        "content": joined_content,
        'tree_table_content': tree_content,
    }
    return flat_record, tree_record


def _find_overlapping_text_block(text_blocks, header_bbox):
    """Return the index of the first text block overlapping ``header_bbox``, or None."""
    for index, text_block in enumerate(text_blocks):
        if is_overlapped(text_block['bbox'], header_bbox):
            return index
    return None


def match_headers_with_text(adjusted_model_json, pdfminer_json):
    """Pair every header block with the pdfminer text and tables that follow it.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened diff view; confirm it against the original file.

    Args:
        adjusted_model_json: Mapping of page-number strings to lists of
            model blocks.  The keys read here are ``label_name``, ``bbox``,
            ``pdf_page_id``, ``extracted_text``, ``page_img_width``,
            ``page_img_height``, ``page_block_id``, ``pdf_name`` and, for
            table blocks, ``associated_table_header_info``.  Each header or
            table block gains a ``used_model_index`` key (its index within
            its page).
        pdfminer_json: Mapping of page-number strings to lists of text
            blocks with ``bbox``, ``text``, ``element_id`` and
            ``text_block_id``.  The last key is taken as the last page.

    Returns:
        tuple: ``(matched_data, tree_format_matched_data)``, two parallel
        lists of dicts.  Headers whose extracted text is empty produce no
        record.

    Fixes relative to the previous version: the end-of-page flush referenced
    the undefined name ``currentHeaderType`` (NameError) and the first flush
    reset a misspelled ``current_table_content`` variable.
    """
    header_labels = ('Page-header', 'Section-header')
    table_labels = ('Table', 'Portfolio-Company-Table')

    matched_data = []
    tree_format_matched_data = []

    # Flatten header/table blocks in page order, remembering each block's
    # index inside its own page so it can be looked up again later.
    all_blocks_with_indices = []
    for _, blocks in sorted(adjusted_model_json.items(), key=lambda item: int(item[0])):
        for index, block in enumerate(blocks):
            if block['label_name'] in header_labels + table_labels:
                block['used_model_index'] = index
                all_blocks_with_indices.append(block)

    for block_pos, block in enumerate(all_blocks_with_indices):
        if block['label_name'] not in header_labels:
            continue

        current_header_text = block['extracted_text'][0] if block['extracted_text'] else ""
        if not current_header_text:
            continue

        current_header_bbox = block['bbox']
        current_header_page_number = block['pdf_page_id']
        following_blocks = all_blocks_with_indices[block_pos + 1:]

        # The next header (if any) marks where this header's content ends.
        next_header = next(
            (candidate for candidate in following_blocks
             if candidate['label_name'] in header_labels),
            None,
        )
        if next_header is None:
            next_header_page_number = None
            next_header_index_in_model_udop = None
        else:
            next_header_page_number = next_header['pdf_page_id']
            next_header_index_in_model_udop = next_header['used_model_index']

        current_content = []
        current_header_table_content = []
        current_header_tree_structure = []
        content_source_pages = []  # Pages from which content was collected

        # Collect the tables that sit between this header and the next one.
        for following in following_blocks:
            if following['label_name'] in header_labels:
                break
            if following['label_name'] in table_labels:
                extracted_df = following['extracted_text'][0]
                table_header_info = following["associated_table_header_info"]
                if table_header_info is not None:
                    extracted_df_table_header = table_header_info['extracted_text'][0]
                else:
                    extracted_df_table_header = None

                extracted_df_new = convert_to_dataframe(extracted_df)
                current_header_table_content.append({
                    'label_name': following['label_name'],
                    'table_header': extracted_df_table_header,
                    'table_column_header': extracted_df_new.columns.tolist(),
                    'table_info': extracted_df_new,
                    'metadata': {
                        'pdf_name': following['pdf_name'],
                        'table_page_id': following['pdf_page_id'],
                        'table_page_id_width': following['page_img_width'],
                        'table_page_id_height': following['page_img_height'],
                        'table_bbox': following['bbox'],
                    },
                })
                current_header_tree_structure.append(following)

        last_pdf_page = int(list(pdfminer_json.keys())[-1])
        header_anchor = {"element_id": None, "text_block_id": None}
        first_append_flag = False
        next_header_detect_flag = False

        for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
            text_blocks = pdfminer_json.get(str(pdf_page_num), [])
            start_index = 0
            page_content_added = False

            # Until the header's own text block is located, look for it on
            # every page visited and start collecting from that block.
            if header_anchor["element_id"] is None and header_anchor["text_block_id"] is None:
                anchor_index = _find_overlapping_text_block(text_blocks, current_header_bbox)
                if anchor_index is not None:
                    header_anchor["element_id"] = text_blocks[anchor_index]["element_id"]
                    header_anchor["text_block_id"] = text_blocks[anchor_index]["text_block_id"]
                    start_index = anchor_index
                    first_append_flag = True

            for text_index, text_block in enumerate(text_blocks[start_index:], start=start_index):
                if first_append_flag:
                    # The anchor block contains the header itself; keep only
                    # what follows the header text.
                    current_content.append(
                        remove_header_from_start(text_block['text'], current_header_text)
                    )
                    page_content_added = True
                    first_append_flag = False
                    continue

                if next_header is not None and pdf_page_num == int(next_header_page_number):
                    reached_next_header = detect_header(
                        text_block['bbox'],
                        adjusted_model_json,
                        next_header_page_number,
                        next_header_index_in_model_udop,
                    )
                    # Also stop at the last text block of the next header's
                    # page when the header itself was never matched.
                    page_exhausted = text_index == len(text_blocks) - 1 and bool(text_block)
                    if reached_next_header or page_exhausted:
                        flat_record, tree_record = _build_header_records(
                            block,
                            current_header_text,
                            current_content,
                            current_header_table_content,
                            current_header_tree_structure,
                            content_source_pages,
                        )
                        matched_data.append(flat_record)
                        tree_format_matched_data.append(tree_record)
                        current_content = []
                        current_header_table_content = []
                        current_header_tree_structure = []
                        next_header_detect_flag = True
                        break

                current_content.append(text_block['text'])
                page_content_added = True

            # The flat record shares this list object, so a page appended
            # here still shows up in a record emitted just above.
            if page_content_added and pdf_page_num not in content_source_pages:
                content_source_pages.append(pdf_page_num)

            if next_header_detect_flag:
                break

        # NOTE(review): when a next header exists but is never reached by the
        # loop above, no record is emitted for this header - confirm intent.

        if next_header is None:
            # Last header of the document: the loop above already gathered
            # all text up to the final page; emit the records here.
            # NOTE(review): this second pass appends the header row's
            # remainder a second time (behaviour preserved from the previous
            # version) - confirm whether the duplicate is intended.
            header_anchor = {"element_id": None, "text_block_id": None}
            for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
                text_blocks = pdfminer_json.get(str(pdf_page_num), [])
                page_content_added = False
                if header_anchor["element_id"] is None and header_anchor["text_block_id"] is None:
                    anchor_index = _find_overlapping_text_block(text_blocks, current_header_bbox)
                    if anchor_index is not None:
                        anchor_block = text_blocks[anchor_index]
                        header_anchor["element_id"] = anchor_block["element_id"]
                        header_anchor["text_block_id"] = anchor_block["text_block_id"]
                        current_content.append(
                            remove_header_from_start(anchor_block['text'], current_header_text)
                        )
                        page_content_added = True

                if page_content_added and pdf_page_num not in content_source_pages:
                    content_source_pages.append(pdf_page_num)

            flat_record, tree_record = _build_header_records(
                block,
                current_header_text,
                current_content,
                current_header_table_content,
                current_header_tree_structure,
                content_source_pages,
            )
            matched_data.append(flat_record)
            tree_format_matched_data.append(tree_record)

    return matched_data, tree_format_matched_data
403
+
404
def main_header_pipeline(modified_udop_json, pdfminer_json):
    """Run the header post-processing pipeline end to end.

    The model blocks are first rescaled with
    ``adjust_page_dimensions_and_bbox`` and then matched against the
    pdfminer text with ``match_headers_with_text``.

    Returns a tuple of the flat matches as a DataFrame and the tree-format
    matches as a list.
    """
    rescaled_json = adjust_page_dimensions_and_bbox(modified_udop_json, pdfminer_json)
    flat_records, tree_records = match_headers_with_text(rescaled_json, pdfminer_json)
    return pd.DataFrame(flat_records), tree_records
409
+
410
+