Spaces:
Sleeping
Sleeping
Update InitialMarkups.py
Browse files - InitialMarkups.py +10 -10
InitialMarkups.py
CHANGED
|
@@ -1109,7 +1109,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1109 |
|
| 1110 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 1111 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 1112 |
-
print('listofHeaderstoMarkup',listofHeaderstoMarkup)
|
| 1113 |
# Precompute all children headers once
|
| 1114 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 1115 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
@@ -1125,7 +1125,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1125 |
subHeaderFontSize= top_3_font_sizes[1]
|
| 1126 |
subsubheaderFontSize= top_3_font_sizes[1]
|
| 1127 |
|
| 1128 |
-
print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
|
| 1129 |
|
| 1130 |
# Preload all pages to avoid repeated loading
|
| 1131 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
|
@@ -1134,7 +1134,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1134 |
heading_to_search = heading_to_searchDict['text']
|
| 1135 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 1136 |
|
| 1137 |
-
print('headertosearch', heading_to_search)
|
| 1138 |
|
| 1139 |
# Initialize variables
|
| 1140 |
headertoContinue1 = False
|
|
@@ -1240,7 +1240,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1240 |
Alltext_Tobebilled+=combined_line_norm
|
| 1241 |
collecting = True
|
| 1242 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1243 |
-
print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
|
| 1244 |
|
| 1245 |
collected_lines.append(line_text)
|
| 1246 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
@@ -1312,7 +1312,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1312 |
# Convert list to JSON
|
| 1313 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 1314 |
|
| 1315 |
-
print("Final URL:", final_url)
|
| 1316 |
i += 2
|
| 1317 |
continue
|
| 1318 |
else:
|
|
@@ -1337,8 +1337,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1337 |
Alltext_Tobebilled+=combined_line_norm
|
| 1338 |
collecting = True
|
| 1339 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1340 |
-
print(f"📥 Start collecting after header: {combined_line_norm} "
|
| 1341 |
-
f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
|
| 1342 |
|
| 1343 |
collected_lines.append(line_text)
|
| 1344 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
@@ -1410,7 +1410,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1410 |
# Convert list to JSON
|
| 1411 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 1412 |
|
| 1413 |
-
print("Final URL:", final_url)
|
| 1414 |
i += 2
|
| 1415 |
continue
|
| 1416 |
if collecting:
|
|
@@ -1434,7 +1434,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1434 |
norm_line != heading_norm and
|
| 1435 |
is_probably_real_header):
|
| 1436 |
if line_text not in heading_norm:
|
| 1437 |
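-
print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")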
|
| 1438 |
collecting = False
|
| 1439 |
done = True
|
| 1440 |
headertoContinue1 = False
|
|
@@ -1488,7 +1488,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1488 |
|
| 1489 |
pdf_bytes = BytesIO()
|
| 1490 |
docHighlights.save(pdf_bytes)
|
| 1491 |
-
print('JSONN',json_output)
|
| 1492 |
return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
|
| 1493 |
|
| 1494 |
|
|
|
|
| 1109 |
|
| 1110 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 1111 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 1112 |
+
# print('listofHeaderstoMarkup',listofHeaderstoMarkup)
|
| 1113 |
# Precompute all children headers once
|
| 1114 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 1115 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
|
|
| 1125 |
subHeaderFontSize= top_3_font_sizes[1]
|
| 1126 |
subsubheaderFontSize= top_3_font_sizes[1]
|
| 1127 |
|
| 1128 |
+
# print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
|
| 1129 |
|
| 1130 |
# Preload all pages to avoid repeated loading
|
| 1131 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
|
|
|
| 1134 |
heading_to_search = heading_to_searchDict['text']
|
| 1135 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 1136 |
|
| 1137 |
+
# print('headertosearch', heading_to_search)
|
| 1138 |
|
| 1139 |
# Initialize variables
|
| 1140 |
headertoContinue1 = False
|
|
|
|
| 1240 |
Alltext_Tobebilled+=combined_line_norm
|
| 1241 |
collecting = True
|
| 1242 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1243 |
+
# print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
|
| 1244 |
|
| 1245 |
collected_lines.append(line_text)
|
| 1246 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
|
|
| 1312 |
# Convert list to JSON
|
| 1313 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 1314 |
|
| 1315 |
+
# print("Final URL:", final_url)
|
| 1316 |
i += 2
|
| 1317 |
continue
|
| 1318 |
else:
|
|
|
|
| 1337 |
Alltext_Tobebilled+=combined_line_norm
|
| 1338 |
collecting = True
|
| 1339 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1340 |
+
# print(f"📥 Start collecting after header: {combined_line_norm} "
|
| 1341 |
+
# f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
|
| 1342 |
|
| 1343 |
collected_lines.append(line_text)
|
| 1344 |
valid_spans = [span for span in spans if span.get("bbox")]
|
|
|
|
| 1410 |
# Convert list to JSON
|
| 1411 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 1412 |
|
| 1413 |
+
# print("Final URL:", final_url)
|
| 1414 |
i += 2
|
| 1415 |
continue
|
| 1416 |
if collecting:
|
|
|
|
| 1434 |
norm_line != heading_norm and
|
| 1435 |
is_probably_real_header):
|
| 1436 |
if line_text not in heading_norm:
|
| 1437 |
+
# print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
|
| 1438 |
collecting = False
|
| 1439 |
done = True
|
| 1440 |
headertoContinue1 = False
|
|
|
|
| 1488 |
|
| 1489 |
pdf_bytes = BytesIO()
|
| 1490 |
docHighlights.save(pdf_bytes)
|
| 1491 |
+
# print('JSONN',json_output)
|
| 1492 |
return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
|
| 1493 |
|
| 1494 |
|