Spaces:
Runtime error
Runtime error
Update InitialMarkups.py
Browse files- InitialMarkups.py +9 -9
InitialMarkups.py
CHANGED
|
@@ -2377,7 +2377,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2377 |
|
| 2378 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 2379 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 2380 |
-
print('listofHeaderstoMarkup',listofHeaderstoMarkup)
|
| 2381 |
# Precompute all children headers once
|
| 2382 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 2383 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
@@ -2393,7 +2393,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2393 |
subHeaderFontSize= top_3_font_sizes[1]
|
| 2394 |
subsubheaderFontSize= top_3_font_sizes[1]
|
| 2395 |
|
| 2396 |
-
print("π Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
|
| 2397 |
|
| 2398 |
# Preload all pages to avoid repeated loading
|
| 2399 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
|
@@ -2402,7 +2402,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2402 |
heading_to_search = heading_to_searchDict['text']
|
| 2403 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 2404 |
|
| 2405 |
-
print('headertosearch', heading_to_search)
|
| 2406 |
|
| 2407 |
# Initialize variables
|
| 2408 |
headertoContinue1 = False
|
|
@@ -2508,7 +2508,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2508 |
Alltext_tobebilled+=combined_line_norm
|
| 2509 |
collecting = True
|
| 2510 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 2511 |
-
print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
|
| 2512 |
|
| 2513 |
collected_lines.append(line_text)
|
| 2514 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
@@ -2580,7 +2580,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2580 |
# Convert list to JSON
|
| 2581 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 2582 |
|
| 2583 |
-
print("Final URL:", final_url)
|
| 2584 |
i += 2
|
| 2585 |
continue
|
| 2586 |
else:
|
|
@@ -2605,8 +2605,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2605 |
Alltext_tobebilled+=combined_line_norm
|
| 2606 |
collecting = True
|
| 2607 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 2608 |
-
print(f"📥 Start collecting after header: {combined_line_norm} "
|
| 2609 |
-
f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
|
| 2610 |
|
| 2611 |
collected_lines.append(line_text)
|
| 2612 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
@@ -2702,7 +2702,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2702 |
norm_line != heading_norm and
|
| 2703 |
is_probably_real_header):
|
| 2704 |
if line_text not in heading_norm:
|
| 2705 |
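-
print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")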
|
| 2706 |
collecting = False
|
| 2707 |
done = True
|
| 2708 |
headertoContinue1 = False
|
|
@@ -2756,7 +2756,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 2756 |
|
| 2757 |
pdf_bytes = BytesIO()
|
| 2758 |
docHighlights.save(pdf_bytes)
|
| 2759 |
-
print('JSONN',json_output)
|
| 2760 |
return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
|
| 2761 |
|
| 2762 |
|
|
|
|
| 2377 |
|
| 2378 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 2379 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 2380 |
+
# print('listofHeaderstoMarkup',listofHeaderstoMarkup)
|
| 2381 |
# Precompute all children headers once
|
| 2382 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 2383 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
|
|
| 2393 |
subHeaderFontSize= top_3_font_sizes[1]
|
| 2394 |
subsubheaderFontSize= top_3_font_sizes[1]
|
| 2395 |
|
| 2396 |
+
# print("π Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
|
| 2397 |
|
| 2398 |
# Preload all pages to avoid repeated loading
|
| 2399 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
|
|
|
| 2402 |
heading_to_search = heading_to_searchDict['text']
|
| 2403 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 2404 |
|
| 2405 |
+
# print('headertosearch', heading_to_search)
|
| 2406 |
|
| 2407 |
# Initialize variables
|
| 2408 |
headertoContinue1 = False
|
|
|
|
| 2508 |
Alltext_tobebilled+=combined_line_norm
|
| 2509 |
collecting = True
|
| 2510 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 2511 |
+
# print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
|
| 2512 |
|
| 2513 |
collected_lines.append(line_text)
|
| 2514 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
|
|
| 2580 |
# Convert list to JSON
|
| 2581 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 2582 |
|
| 2583 |
+
# print("Final URL:", final_url)
|
| 2584 |
i += 2
|
| 2585 |
continue
|
| 2586 |
else:
|
|
|
|
| 2605 |
Alltext_tobebilled+=combined_line_norm
|
| 2606 |
collecting = True
|
| 2607 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 2608 |
+
# print(f"📥 Start collecting after header: {combined_line_norm} "
|
| 2609 |
+
# f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
|
| 2610 |
|
| 2611 |
collected_lines.append(line_text)
|
| 2612 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
|
|
| 2702 |
norm_line != heading_norm and
|
| 2703 |
is_probably_real_header):
|
| 2704 |
if line_text not in heading_norm:
|
| 2705 |
+
# print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
|
| 2706 |
collecting = False
|
| 2707 |
done = True
|
| 2708 |
headertoContinue1 = False
|
|
|
|
| 2756 |
|
| 2757 |
pdf_bytes = BytesIO()
|
| 2758 |
docHighlights.save(pdf_bytes)
|
| 2759 |
+
# print('JSONN',json_output)
|
| 2760 |
return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
|
| 2761 |
|
| 2762 |
|