InitialMarkups

Runtime error

App Files Community

Marthee committed on Nov 24, 2025

Commit

afa2728

verified ·

1 Parent(s): 1e83b54

Update InitialMarkups.py

Browse files

Files changed (1) hide show

InitialMarkups.py +29 -7

InitialMarkups.py CHANGED Viewed

@@ -145,18 +145,18 @@ def normalize_text(text):
 def get_spaced_text_from_spans(spans):
     return normalize_text(" ".join(span["text"].strip() for span in spans))
 def is_header(span, most_common_font_size, most_common_color, most_common_font):
     fontname = span.get("font", "").lower()
     # is_italic = "italic" in fontname or "oblique" in fontname
     is_bold = "bold" in fontname or span.get("bold", False)
     return (
         (
-            span["size"] > most_common_font_size or
-            span["font"].lower() != most_common_font.lower() or
-            (is_bold and span["size"] > most_common_font_size )
         )
     )
 def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
     for (p, y) in grouped_dict:
         if pageNum is not None and p != pageNum:
@@ -653,6 +653,20 @@ def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500):  # Set you
                 )
                 annot1.update()
 def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
     if path is None:
         path = []
@@ -729,8 +743,10 @@ def extract_section_under_header(multiplePDF_Paths):
         )
         hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
         listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
         # Precompute all children headers once
         allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
         allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups
@@ -752,9 +768,12 @@ def extract_section_under_header(multiplePDF_Paths):
         # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
         for heading_to_searchDict, paths in listofHeaderstoMarkup:
             heading_to_search = heading_to_searchDict['text']
             heading_to_searchPageNum = heading_to_searchDict['page']
             # Initialize variables
             headertoContinue1 = False
             headertoContinue2 = False
@@ -2457,4 +2476,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
     combined_json_str = json.dumps(jsonCombined, indent=1)
     print(combined_json_str)
     return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames

 def get_spaced_text_from_spans(spans):
     return normalize_text(" ".join(span["text"].strip() for span in spans))
 def is_header(span, most_common_font_size, most_common_color, most_common_font):
     fontname = span.get("font", "").lower()
     # is_italic = "italic" in fontname or "oblique" in fontname
     is_bold = "bold" in fontname or span.get("bold", False)
     return (
         (
+           ( span["size"] > most_common_font_size or
+            span["font"].lower() != most_common_font.lower()) and
+            is_bold
         )
     )
 def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
     for (p, y) in grouped_dict:
         if pageNum is not None and p != pageNum:
                 )
                 annot1.update()
+# def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
+#     if path is None:
+#         path = []
+#     if output is None:
+#         output = []
+#     for header in listtoloop:
+#         current_path = path + [header['text']]
+#         if not header['children']:
+#             if header['level'] != 0 and header['level'] != 1:
+#                 output.append((header, current_path))
+#         else:
+#             get_leaf_headers_with_paths(header['children'], current_path, output)
+#     return output
 def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
     if path is None:
         path = []
         )
         hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
+        print(hierarchy)
         listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
+        print(len(listofHeaderstoMarkup))
         # Precompute all children headers once
         allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
         allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups
         # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
         for heading_to_searchDict, paths in listofHeaderstoMarkup:
             heading_to_search = heading_to_searchDict['text']
             heading_to_searchPageNum = heading_to_searchDict['page']
+            if len(heading_to_searchDict['children'])==0:
+                continue
+            print(paths,heading_to_search)
             # Initialize variables
             headertoContinue1 = False
             headertoContinue2 = False
     combined_json_str = json.dumps(jsonCombined, indent=1)
     print(combined_json_str)
     return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
+extract_section_under_header('https://www.dropbox.com/scl/fi/vrqetlyh7a18a7a327nng/4460-NBS-Weybridge-Point-2025-08-21.pdf?rlkey=ocrll9lnbbnbrqc2l4lkrwb89&st=4zm04cyk&dl=0')