# File size: 22,643 Bytes
# Revision: 0538136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
import json
import pandas as pd

def read_json(json_file):
    """Load and return the parsed contents of a UTF-8 encoded JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON document.
    """
    with open(json_file, encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def adjust_page_dimensions_and_bbox(modified_model_output_json, pdfminer_json):
    """Rescale model blocks in place to the page size reported by pdfminer.

    For every page key present in both mappings, the first pdfminer entry of
    that page supplies ``page_width`` / ``page_height``. Each model block on
    the page has its ``bbox`` scaled by the ratio between those values and
    the block's own ``page_img_width`` / ``page_img_height``, which are then
    overwritten with the pdfminer dimensions. Pages missing from
    ``pdfminer_json`` are left untouched; pages whose pdfminer entry is
    empty are reported on stdout and left untouched.

    Args:
        modified_model_output_json: Mapping of page key -> list of block dicts.
        pdfminer_json: Mapping of page key -> list of pdfminer entries.

    Returns:
        The same ``modified_model_output_json`` object, mutated in place.
    """
    for page_number, blocks in modified_model_output_json.items():
        if page_number not in pdfminer_json:
            continue

        page_entries = pdfminer_json[page_number]
        if not page_entries:
            print(f"Page {page_number} is empty.")
            continue

        reference_entry = page_entries[0]
        target_width = reference_entry['page_width']
        target_height = reference_entry['page_height']

        for block in blocks:
            # Both ratios are computed before any field is overwritten.
            x_ratio = target_width / block['page_img_width']
            y_ratio = target_height / block['page_img_height']
            block['page_img_width'] = target_width
            block['page_img_height'] = target_height
            box = block['bbox']
            block['bbox'] = [
                box[0] * x_ratio,
                box[1] * y_ratio,
                box[2] * x_ratio,
                box[3] * y_ratio,
            ]

    return modified_model_output_json

def convert_to_dataframe(extracted_df):
    """Coerce an extracted table payload into a pandas DataFrame.

    Conversion rules, checked in this order:
      * a DataFrame is returned unchanged (same object);
      * a dict whose values are all lists is treated as column -> values;
      * any other dict becomes a single-row frame;
      * a list made only of dicts becomes one row per dict;
      * any other list becomes a single ``Value`` column;
      * anything else becomes a one-row, single ``Value`` column frame.

    Args:
        extracted_df: The payload to convert.

    Returns:
        A ``pandas.DataFrame`` built according to the rules above.
    """
    if isinstance(extracted_df, pd.DataFrame):
        return extracted_df

    if isinstance(extracted_df, dict):
        is_columnar = all(isinstance(column, list) for column in extracted_df.values())
        frame_source = extracted_df if is_columnar else [extracted_df]
        return pd.DataFrame(frame_source)

    if isinstance(extracted_df, list):
        rows_are_dicts = all(isinstance(row, dict) for row in extracted_df)
        if rows_are_dicts:
            return pd.DataFrame(extracted_df)
        return pd.DataFrame(extracted_df, columns=['Value'])

    return pd.DataFrame([extracted_df], columns=['Value'])

def calculate_centroid(bbox):
    """Return the centre point of a four-value bounding box.

    Args:
        bbox: Four values unpacked as ``(x1, y1, x2, y2)``.

    Returns:
        A tuple ``(x_center, y_center)``, the midpoints of the x and y pairs.
    """
    left, top, right, bottom = bbox
    return ((left + right) / 2, (top + bottom) / 2)

def is_within_radius(text_block_bbox, header_bbox, radius=50):
    """Report whether two bounding boxes share a region of positive area.

    Args:
        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
        header_bbox: ``(xmin, ymin, xmax, ymax)`` of the header.
        radius: Accepted for interface compatibility; the body never reads it.

    Returns:
        True when the boxes overlap by more than zero on both axes (boxes
        that merely touch along an edge do not count), otherwise False.
    """
    t_left, t_top, t_right, t_bottom = text_block_bbox
    h_left, h_top, h_right, h_bottom = header_bbox

    # Signed extent of the shared span on each axis; positive means overlap.
    shared_width = min(t_right, h_right) - max(t_left, h_left)
    shared_height = min(t_bottom, h_bottom) - max(t_top, h_top)

    return bool(shared_width > 0 and shared_height > 0)

def is_overlapped(text_block_bbox, header_bbox, threshold=0.20):
    """Report whether two boxes overlap by more than ``threshold``.

    The overlap ratio is the intersection area divided by the area of the
    smaller of the two boxes.

    Args:
        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
        header_bbox: ``(xmin, ymin, xmax, ymax)`` of the header.
        threshold: Ratio that must be strictly exceeded for a match.

    Returns:
        True when the overlap ratio is strictly greater than ``threshold``.
        False otherwise, including when either box has no positive area.
    """
    text_xmin, text_ymin, text_xmax, text_ymax = text_block_bbox
    header_xmin, header_ymin, header_xmax, header_ymax = header_bbox

    # Extent of the intersection on each axis, clamped at zero when disjoint.
    overlap_x = max(0, min(text_xmax, header_xmax) - max(text_xmin, header_xmin))
    overlap_y = max(0, min(text_ymax, header_ymax) - max(text_ymin, header_ymin))
    overlap_area = overlap_x * overlap_y

    text_area = (text_xmax - text_xmin) * (text_ymax - text_ymin)
    header_area = (header_xmax - header_xmin) * (header_ymax - header_ymin)
    smaller_area = min(text_area, header_area)

    # A degenerate (zero-area) box cannot meaningfully overlap anything and
    # would otherwise trigger a ZeroDivisionError in the ratio below.
    if smaller_area <= 0:
        return False

    return overlap_area / smaller_area > threshold

def detect_header(text_block_bbox, adjusted_model_output_json, page_number ,next_header_index_in_model_udop):
    """Check whether a text block overlaps the expected next header block.

    Args:
        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
        adjusted_model_output_json: Mapping of page-number string -> list of
            model block dicts, each carrying a ``bbox``.
        page_number: Page of the expected header; looked up as ``str(page_number)``.
        next_header_index_in_model_udop: Position of the expected header in
            that page's block list (converted with ``int``), or None when
            there is no next header.

    Returns:
        True when the page exists, an index was supplied, and
        ``is_overlapped`` reports a match against that block's ``bbox``;
        False otherwise.
    """
    page_key = str(page_number)
    if page_key not in adjusted_model_output_json:
        return False
    if next_header_index_in_model_udop is None:
        return False

    header_block = adjusted_model_output_json[page_key][int(next_header_index_in_model_udop)]
    return bool(is_overlapped(text_block_bbox, header_block['bbox']))

def remove_header_from_start(first_row_text: str, first_row_header_text: str) -> str:
    """Drop as many leading characters as the header is long, then trim.

    The cut is purely length-based: the text is not checked to actually
    begin with ``first_row_header_text``.

    Args:
        first_row_text: Text of the block that contains the header.
        first_row_header_text: Header text whose length sets the cut point.

    Returns:
        The remainder of ``first_row_text`` with surrounding whitespace removed.
    """
    cut_point = len(first_row_header_text)
    remainder = first_row_text[cut_point:]
    return remainder.strip()
    
def extract_last_header_index(all_blocks_with_indices):
    """Find the position of the last header block in a block list.

    Args:
        all_blocks_with_indices: Sequence of block dicts with a ``label_name``.

    Returns:
        Index of the last block labelled ``Page-header`` or
        ``Section-header``, or -1 when the list holds no such block.
    """
    header_labels = ['Page-header', 'Section-header']

    # Walk backwards so the first hit is the last header.
    position = len(all_blocks_with_indices) - 1
    while position >= 0:
        if all_blocks_with_indices[position]['label_name'] in header_labels:
            return position
        position -= 1

    return -1

def _append_section_records(matched_data, tree_format_matched_data, header_block,
                            header_text, content_parts, table_content,
                            tree_content, source_pages):
    """Append one finished header section to both result lists.

    ``table_content``, ``tree_content`` and ``source_pages`` are stored by
    reference, so in-place appends made by the caller afterwards remain
    visible in the stored records.
    """
    content = " ".join(content_parts)
    matched_data.append({
        "page_number": header_block['pdf_page_id'],
        "pdf_name": header_block['pdf_name'],
        "header": header_text,
        "label_name": header_block['label_name'],
        "content": content,
        "table_content": table_content,
        "all_source_pages": source_pages,
    })
    tree_format_matched_data.append({
        "header_page_number": header_block['pdf_page_id'],
        "label_name": header_block['label_name'],
        'page_block_id': header_block['page_block_id'],
        "header_bbox": header_block['bbox'],
        "header_page_width": header_block['page_img_width'],
        "header_page_height": header_block['page_img_height'],
        "header": header_text,
        "content": content,
        'tree_table_content': tree_content,
    })


def _claim_header_text_block(current_header, text_blocks, header_bbox):
    """Locate the first text block overlapping ``header_bbox``.

    On a match, the block's ``element_id`` and ``text_block_id`` are copied
    into ``current_header`` and the block's index is returned; otherwise
    None is returned and ``current_header`` is left unchanged.
    """
    for index, text_block in enumerate(text_blocks):
        if is_overlapped(text_block['bbox'], header_bbox):
            current_header["element_id"] = text_block["element_id"]
            current_header["text_block_id"] = text_block["text_block_id"]
            return index
    return None


def match_headers_with_text(adjusted_model_json, pdfminer_json):
    """Pair every header block with the text and tables that follow it.

    Header blocks (``Page-header`` / ``Section-header``) and table blocks
    (``Table`` / ``Portfolio-Company-Table``) are taken from
    ``adjusted_model_json`` in ascending numeric page order. For each header
    with non-empty extracted text, the table blocks located before the next
    header are collected, and pdfminer text blocks are accumulated from the
    header's page onward until either a text block overlapping the next
    header is found or the last text block on the next header's page is
    reached. The final header accumulates text through the last page.

    Side effect: every header/table block gains a ``used_model_index`` key
    holding its position within its page's block list.

    Args:
        adjusted_model_json: Mapping of page-number string -> list of block
            dicts. Blocks are read for ``label_name``, ``bbox``,
            ``pdf_page_id``, ``extracted_text``, ``pdf_name``,
            ``page_img_width``, ``page_img_height``, ``page_block_id`` and,
            for tables, ``associated_table_header_info``.
        pdfminer_json: Mapping of page-number string -> list of text block
            dicts with ``bbox``, ``text``, ``element_id`` and
            ``text_block_id``. The last key in iteration order is taken as
            the last page.

    Returns:
        A tuple ``(matched_data, tree_format_matched_data)`` of two lists of
        dicts, one entry per emitted header section in each list.
    """
    header_labels = ('Page-header', 'Section-header')
    table_labels = ('Table', "Portfolio-Company-Table")

    matched_data = []
    tree_format_matched_data = []

    sorted_pages = sorted(adjusted_model_json.items(), key=lambda x: int(x[0]))

    all_blocks_with_indices = []
    for _page_key, blocks in sorted_pages:
        for index, block in enumerate(blocks):
            if block['label_name'] in header_labels or block['label_name'] in table_labels:
                block['used_model_index'] = index
                all_blocks_with_indices.append(block)

    if not all_blocks_with_indices:
        return matched_data, tree_format_matched_data

    # Loop-invariant: the block list is not modified below.
    last_header_index = extract_last_header_index(all_blocks_with_indices)

    for block_idx, block in enumerate(all_blocks_with_indices):
        if block['label_name'] not in header_labels:
            continue

        next_header_detect_flag = False
        current_header_bbox = block['bbox']
        current_header_page_number = block['pdf_page_id']
        current_header_text = block['extracted_text'][0] if block['extracted_text'] else ""
        content_source_pages = []  # Track pages where content is collected
        following_blocks = all_blocks_with_indices[block_idx + 1:]

        # Describe the next header, if any. The last header has none.
        next_header_page_number = None
        next_header_index_in_model_udop = None
        next_header_text = None
        if block_idx != last_header_index:
            for candidate in following_blocks:
                if candidate['label_name'] in header_labels:
                    next_header_index_in_model_udop = candidate['used_model_index']
                    next_header_page_number = candidate["pdf_page_id"]
                    next_header_text = candidate['extracted_text'][0] if candidate['extracted_text'] else ""
                    break

        # Headers without text produce no section.
        if not current_header_text:
            continue

        # Fresh accumulators per header; nothing carries over between sections.
        current_content = []
        current_header_table_content = []
        current_header_tree_structure = []

        current_header = {
            "page_number": current_header_page_number,
            "header_text": current_header_text,
            "element_id": None,
            "text_block_id": None
        }

        # Collect the tables that sit between this header and the next one.
        for following in following_blocks:
            if following['label_name'] in header_labels:
                break

            if following['label_name'] in table_labels:
                extracted_df = following['extracted_text'][0]
                if following["associated_table_header_info"] is not None:
                    extracted_df_table_header = following["associated_table_header_info"]['extracted_text'][0]
                else:
                    extracted_df_table_header = None

                extracted_df_new = convert_to_dataframe(extracted_df)

                table_metadata = {
                    'pdf_name': following['pdf_name'],
                    'table_page_id': following['pdf_page_id'],
                    'table_page_id_width': following['page_img_width'],
                    'table_page_id_height': following['page_img_height'],
                    'table_bbox': following['bbox']
                }

                table_header_pair = {
                    'label_name': following['label_name'],
                    'table_header': extracted_df_table_header,
                    'table_column_header': extracted_df_new.columns.tolist(),
                    'table_info': extracted_df_new,
                    'metadata': table_metadata
                }

                current_header_table_content.append(table_header_pair)
                current_header_tree_structure.append(following)

        last_pdf_page = int(list(pdfminer_json.keys())[-1])
        first_append_flag = False
        for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
            text_blocks = pdfminer_json.get(str(pdf_page_num), [])
            start_index = 0
            page_content_added = False  # Track if content was added from this page

            # Until the header's own text block has been located, look for it
            # and start reading this page from that block.
            if current_header["element_id"] is None and current_header["text_block_id"] is None:
                header_index = _claim_header_text_block(current_header, text_blocks, current_header_bbox)
                if header_index is not None:
                    start_index = header_index
                    first_append_flag = True

            for next_header_index, text_block in enumerate(text_blocks[start_index:], start=start_index):
                if first_append_flag:
                    # The header's own block: keep only what follows the header text.
                    current_content.append(
                        remove_header_from_start(text_block['text'], current_header_text)
                    )
                    page_content_added = True
                    first_append_flag = False
                    continue

                if next_header_text is not None and pdf_page_num == int(next_header_page_number):
                    # Close the section when the next header's block is hit,
                    # or when the end of its page is reached without finding it
                    # (that last text block is not added to the content).
                    # The second case previously raised NameError because it
                    # referenced an undefined ``currentHeaderType``.
                    reached_next_header = detect_header(
                        text_block['bbox'],
                        adjusted_model_json,
                        next_header_page_number,
                        next_header_index_in_model_udop,
                    )
                    if reached_next_header or next_header_index == len(text_blocks) - 1:
                        _append_section_records(
                            matched_data,
                            tree_format_matched_data,
                            block,
                            current_header["header_text"],
                            current_content,
                            current_header_table_content,
                            current_header_tree_structure,
                            content_source_pages,
                        )
                        next_header_detect_flag = True
                        break

                current_content.append(text_block['text'])
                page_content_added = True

            # Add page number to source pages if content was added from this page.
            # This may run after the record was stored; the stored record shares
            # this list, so it sees the update.
            if page_content_added and pdf_page_num not in content_source_pages:
                content_source_pages.append(pdf_page_num)

            if next_header_detect_flag:
                break

        if next_header_text is None and next_header_page_number is None:
            current_header = {
                "page_number": current_header_page_number,
                "header_text": current_header_text,
                "element_id": None,
                "text_block_id": None
            }

            # NOTE(review): the pass above has already gathered this header's
            # text through the last page; this second pass re-locates the
            # header block and appends its trimmed text once more, so the
            # final section's content ends with a repeat of that text.
            # Behaviour is kept as-is; confirm whether the repeat is intended.
            for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
                text_blocks = pdfminer_json.get(str(pdf_page_num), [])
                start_index = 0
                page_content_added = False  # Track if content was added from this page
                if current_header["element_id"] is None and current_header["text_block_id"] is None:
                    header_index = _claim_header_text_block(current_header, text_blocks, current_header_bbox)
                    if header_index is not None:
                        start_index = header_index
                        first_append_flag = True

                for text_block in text_blocks[start_index:]:
                    if first_append_flag:
                        current_content.append(
                            remove_header_from_start(text_block['text'], current_header_text)
                        )
                        page_content_added = True
                        first_append_flag = False
                    # Only the header's own block contributes in this pass.
                    break

                # Add page number to source pages if content was added from this page
                if page_content_added and pdf_page_num not in content_source_pages:
                    content_source_pages.append(pdf_page_num)

            _append_section_records(
                matched_data,
                tree_format_matched_data,
                block,
                current_header["header_text"],
                current_content,
                current_header_table_content,
                current_header_tree_structure,
                content_source_pages,
            )

    return matched_data, tree_format_matched_data

def main_header_pipeline(modified_udop_json, pdfminer_json):
    """Run the header-matching pipeline end to end.

    The model output is first rescaled to pdfminer page dimensions, then
    headers are matched with their text and tables.

    Args:
        modified_udop_json: Mapping of page key -> list of model block dicts.
        pdfminer_json: Mapping of page key -> list of pdfminer text blocks.

    Returns:
        A tuple ``(df, tree_format_matched_data)``: a DataFrame built from
        the flat matched records, and the tree-format record list.
    """
    rescaled_json = adjust_page_dimensions_and_bbox(modified_udop_json, pdfminer_json)
    flat_records, tree_records = match_headers_with_text(rescaled_json, pdfminer_json)
    return pd.DataFrame(flat_records), tree_records