Marthee committed on
Commit
a46a5a0
·
verified ·
1 Parent(s): 0496117

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +10 -10
InitialMarkups.py CHANGED
@@ -1109,7 +1109,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1109
 
1110
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1111
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1112
- print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1113
  # Precompute all children headers once
1114
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1115
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -1125,7 +1125,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1125
  subHeaderFontSize= top_3_font_sizes[1]
1126
  subsubheaderFontSize= top_3_font_sizes[1]
1127
 
1128
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1129
 
1130
  # Preload all pages to avoid repeated loading
1131
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -1134,7 +1134,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1134
  heading_to_search = heading_to_searchDict['text']
1135
  heading_to_searchPageNum = heading_to_searchDict['page']
1136
 
1137
- print('headertosearch', heading_to_search)
1138
 
1139
  # Initialize variables
1140
  headertoContinue1 = False
@@ -1240,7 +1240,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1240
  Alltext_Tobebilled+=combined_line_norm
1241
  collecting = True
1242
  matched_header_font_size = max(span["size"] for span in header_spans)
1243
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
1244
 
1245
  collected_lines.append(line_text)
1246
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -1312,7 +1312,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1312
  # Convert list to JSON
1313
  json_output = json.dumps(data_list_JSON, indent=4)
1314
 
1315
- print("Final URL:", final_url)
1316
  i += 2
1317
  continue
1318
  else:
@@ -1337,8 +1337,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1337
  Alltext_Tobebilled+=combined_line_norm
1338
  collecting = True
1339
  matched_header_font_size = max(span["size"] for span in header_spans)
1340
- print(f"📥 Start collecting after header: {combined_line_norm} "
1341
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
1342
 
1343
  collected_lines.append(line_text)
1344
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -1410,7 +1410,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1410
  # Convert list to JSON
1411
  json_output = json.dumps(data_list_JSON, indent=4)
1412
 
1413
- print("Final URL:", final_url)
1414
  i += 2
1415
  continue
1416
  if collecting:
@@ -1434,7 +1434,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1434
  norm_line != heading_norm and
1435
  is_probably_real_header):
1436
  if line_text not in heading_norm:
1437
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
1438
  collecting = False
1439
  done = True
1440
  headertoContinue1 = False
@@ -1488,7 +1488,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1488
 
1489
  pdf_bytes = BytesIO()
1490
  docHighlights.save(pdf_bytes)
1491
- print('JSONN',json_output)
1492
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
1493
 
1494
 
 
1109
 
1110
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1111
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1112
+ # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1113
  # Precompute all children headers once
1114
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1115
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
1125
  subHeaderFontSize= top_3_font_sizes[1]
1126
  subsubheaderFontSize= top_3_font_sizes[1]
1127
 
1128
+ # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1129
 
1130
  # Preload all pages to avoid repeated loading
1131
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
 
1134
  heading_to_search = heading_to_searchDict['text']
1135
  heading_to_searchPageNum = heading_to_searchDict['page']
1136
 
1137
+ # print('headertosearch', heading_to_search)
1138
 
1139
  # Initialize variables
1140
  headertoContinue1 = False
 
1240
  Alltext_Tobebilled+=combined_line_norm
1241
  collecting = True
1242
  matched_header_font_size = max(span["size"] for span in header_spans)
1243
+ # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
1244
 
1245
  collected_lines.append(line_text)
1246
  valid_spans = [span for span in spans if span.get("bbox")]
 
1312
  # Convert list to JSON
1313
  json_output = json.dumps(data_list_JSON, indent=4)
1314
 
1315
+ # print("Final URL:", final_url)
1316
  i += 2
1317
  continue
1318
  else:
 
1337
  Alltext_Tobebilled+=combined_line_norm
1338
  collecting = True
1339
  matched_header_font_size = max(span["size"] for span in header_spans)
1340
+ # print(f"📥 Start collecting after header: {combined_line_norm} "
1341
+ # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
1342
 
1343
  collected_lines.append(line_text)
1344
  valid_spans = [span for span in spans if span.get("bbox")]
 
1410
  # Convert list to JSON
1411
  json_output = json.dumps(data_list_JSON, indent=4)
1412
 
1413
+ # print("Final URL:", final_url)
1414
  i += 2
1415
  continue
1416
  if collecting:
 
1434
  norm_line != heading_norm and
1435
  is_probably_real_header):
1436
  if line_text not in heading_norm:
1437
+ # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
1438
  collecting = False
1439
  done = True
1440
  headertoContinue1 = False
 
1488
 
1489
  pdf_bytes = BytesIO()
1490
  docHighlights.save(pdf_bytes)
1491
+ # print('JSONN',json_output)
1492
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
1493
 
1494