Marthee committed on
Commit
14a2e4b
·
verified ·
1 Parent(s): dca8e8f

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +9 -9
InitialMarkups.py CHANGED
@@ -2377,7 +2377,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2377
 
2378
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2379
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2380
- print('listofHeaderstoMarkup',listofHeaderstoMarkup)
2381
  # Precompute all children headers once
2382
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2383
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -2393,7 +2393,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2393
  subHeaderFontSize= top_3_font_sizes[1]
2394
  subsubheaderFontSize= top_3_font_sizes[1]
2395
 
2396
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
2397
 
2398
  # Preload all pages to avoid repeated loading
2399
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -2402,7 +2402,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2402
  heading_to_search = heading_to_searchDict['text']
2403
  heading_to_searchPageNum = heading_to_searchDict['page']
2404
 
2405
- print('headertosearch', heading_to_search)
2406
 
2407
  # Initialize variables
2408
  headertoContinue1 = False
@@ -2508,7 +2508,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2508
  Alltext_tobebilled+=combined_line_norm
2509
  collecting = True
2510
  matched_header_font_size = max(span["size"] for span in header_spans)
2511
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2512
 
2513
  collected_lines.append(line_text)
2514
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -2580,7 +2580,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2580
  # Convert list to JSON
2581
  json_output = json.dumps(data_list_JSON, indent=4)
2582
 
2583
- print("Final URL:", final_url)
2584
  i += 2
2585
  continue
2586
  else:
@@ -2605,8 +2605,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2605
  Alltext_tobebilled+=combined_line_norm
2606
  collecting = True
2607
  matched_header_font_size = max(span["size"] for span in header_spans)
2608
- print(f"📥 Start collecting after header: {combined_line_norm} "
2609
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2610
 
2611
  collected_lines.append(line_text)
2612
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -2702,7 +2702,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2702
  norm_line != heading_norm and
2703
  is_probably_real_header):
2704
  if line_text not in heading_norm:
2705
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2706
  collecting = False
2707
  done = True
2708
  headertoContinue1 = False
@@ -2756,7 +2756,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2756
 
2757
  pdf_bytes = BytesIO()
2758
  docHighlights.save(pdf_bytes)
2759
- print('JSONN',json_output)
2760
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
2761
 
2762
 
 
2377
 
2378
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2379
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2380
+ # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
2381
  # Precompute all children headers once
2382
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2383
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
2393
  subHeaderFontSize= top_3_font_sizes[1]
2394
  subsubheaderFontSize= top_3_font_sizes[1]
2395
 
2396
+ # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
2397
 
2398
  # Preload all pages to avoid repeated loading
2399
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
 
2402
  heading_to_search = heading_to_searchDict['text']
2403
  heading_to_searchPageNum = heading_to_searchDict['page']
2404
 
2405
+ # print('headertosearch', heading_to_search)
2406
 
2407
  # Initialize variables
2408
  headertoContinue1 = False
 
2508
  Alltext_tobebilled+=combined_line_norm
2509
  collecting = True
2510
  matched_header_font_size = max(span["size"] for span in header_spans)
2511
+ # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2512
 
2513
  collected_lines.append(line_text)
2514
  valid_spans = [span for span in spans if span.get("bbox")]
 
2580
  # Convert list to JSON
2581
  json_output = json.dumps(data_list_JSON, indent=4)
2582
 
2583
+ # print("Final URL:", final_url)
2584
  i += 2
2585
  continue
2586
  else:
 
2605
  Alltext_tobebilled+=combined_line_norm
2606
  collecting = True
2607
  matched_header_font_size = max(span["size"] for span in header_spans)
2608
+ # print(f"📥 Start collecting after header: {combined_line_norm} "
2609
+ # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2610
 
2611
  collected_lines.append(line_text)
2612
  valid_spans = [span for span in spans if span.get("bbox")]
 
2702
  norm_line != heading_norm and
2703
  is_probably_real_header):
2704
  if line_text not in heading_norm:
2705
+ # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2706
  collecting = False
2707
  done = True
2708
  headertoContinue1 = False
 
2756
 
2757
  pdf_bytes = BytesIO()
2758
  docHighlights.save(pdf_bytes)
2759
+ # print('JSONN',json_output)
2760
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
2761
 
2762