InitialMarkups2

Sleeping

App Files Community

Marthee committed on Jul 8, 2025

Commit

a46a5a0

verified ·

1 Parent(s): 0496117

Update InitialMarkups.py

Browse files

Files changed (1) hide show

InitialMarkups.py +10 -10

InitialMarkups.py CHANGED Viewed

@@ -1109,7 +1109,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
     hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
     listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
-    print('listofHeaderstoMarkup',listofHeaderstoMarkup)
     # Precompute all children headers once
     allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
     allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups
@@ -1125,7 +1125,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
         subHeaderFontSize= top_3_font_sizes[1]
         subsubheaderFontSize= top_3_font_sizes[1]
-    print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
     # Preload all pages to avoid repeated loading
     # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -1134,7 +1134,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
         heading_to_search = heading_to_searchDict['text']
         heading_to_searchPageNum = heading_to_searchDict['page']
-        print('headertosearch', heading_to_search)
         # Initialize variables
         headertoContinue1 = False
@@ -1240,7 +1240,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                             Alltext_Tobebilled+=combined_line_norm
                             collecting = True
                             matched_header_font_size = max(span["size"] for span in header_spans)
-                            print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
                             collected_lines.append(line_text)
                             valid_spans = [span for span in spans if span.get("bbox")]
@@ -1312,7 +1312,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                 # Convert list to JSON
                                 json_output = json.dumps(data_list_JSON, indent=4)
-                                print("Final URL:", final_url)
                                 i += 2
                                 continue
                     else:
@@ -1337,8 +1337,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                 Alltext_Tobebilled+=combined_line_norm
                                 collecting = True
                                 matched_header_font_size = max(span["size"] for span in header_spans)
-                                print(f"📥 Start collecting after header: {combined_line_norm} "
-                                      f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
                                 collected_lines.append(line_text)
                                 valid_spans = [span for span in spans if span.get("bbox")]
@@ -1410,7 +1410,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                     # Convert list to JSON
                                     json_output = json.dumps(data_list_JSON, indent=4)
-                                    print("Final URL:", final_url)
                                     i += 2
                                     continue
                     if collecting:
@@ -1434,7 +1434,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                 norm_line != heading_norm and
                                 is_probably_real_header):
                                 if line_text not in heading_norm:
-                                  print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                   collecting = False
                                   done = True
                                   headertoContinue1 = False
@@ -1488,7 +1488,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
     pdf_bytes = BytesIO()
     docHighlights.save(pdf_bytes)
-    print('JSONN',json_output)
     return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled

     hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
     listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
+    # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
     # Precompute all children headers once
     allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
     allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups
         subHeaderFontSize= top_3_font_sizes[1]
         subsubheaderFontSize= top_3_font_sizes[1]
+    # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
     # Preload all pages to avoid repeated loading
     # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
         heading_to_search = heading_to_searchDict['text']
         heading_to_searchPageNum = heading_to_searchDict['page']
+        # print('headertosearch', heading_to_search)
         # Initialize variables
         headertoContinue1 = False
                             Alltext_Tobebilled+=combined_line_norm
                             collecting = True
                             matched_header_font_size = max(span["size"] for span in header_spans)
+                            # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
                             collected_lines.append(line_text)
                             valid_spans = [span for span in spans if span.get("bbox")]
                                 # Convert list to JSON
                                 json_output = json.dumps(data_list_JSON, indent=4)
+                                # print("Final URL:", final_url)
                                 i += 2
                                 continue
                     else:
                                 Alltext_Tobebilled+=combined_line_norm
                                 collecting = True
                                 matched_header_font_size = max(span["size"] for span in header_spans)
+                                # print(f"📥 Start collecting after header: {combined_line_norm} "
+                                #       f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
                                 collected_lines.append(line_text)
                                 valid_spans = [span for span in spans if span.get("bbox")]
                                     # Convert list to JSON
                                     json_output = json.dumps(data_list_JSON, indent=4)
+                                    # print("Final URL:", final_url)
                                     i += 2
                                     continue
                     if collecting:
                                 norm_line != heading_norm and
                                 is_probably_real_header):
                                 if line_text not in heading_norm:
+                                #   print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                   collecting = False
                                   done = True
                                   headertoContinue1 = False
     pdf_bytes = BytesIO()
     docHighlights.save(pdf_bytes)
+    # print('JSONN',json_output)
     return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled