Marthee committed on
Commit
7031d55
·
verified ·
1 Parent(s): 6acfc68

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +78 -52
InitialMarkups.py CHANGED
@@ -145,18 +145,18 @@ def normalize_text(text):
145
  def get_spaced_text_from_spans(spans):
146
  return normalize_text(" ".join(span["text"].strip() for span in spans))
147
 
148
-
149
  def is_header(span, most_common_font_size, most_common_color, most_common_font):
150
  fontname = span.get("font", "").lower()
151
  # is_italic = "italic" in fontname or "oblique" in fontname
152
  is_bold = "bold" in fontname or span.get("bold", False)
153
  return (
154
  (
155
- ( span["size"] > most_common_font_size or
156
- span["font"].lower() != most_common_font.lower()) and
157
- is_bold
158
  )
159
  )
 
160
  def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
161
  for (p, y) in grouped_dict:
162
  if pageNum is not None and p != pageNum:
@@ -293,6 +293,7 @@ def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, mo
293
 
294
  return headers, top_3_font_sizes, smallest_font_size, spans
295
 
 
296
  def is_numbered(text):
297
  return bool(re.match(r'^\d', text.strip()))
298
 
@@ -310,7 +311,35 @@ def clean_toc_entry(toc_text):
310
  # Remove everything after last sequence of dots/whitespace followed by digits
311
  return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
312
 
313
- def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=85):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  # Extract headers with margin handling
315
  headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
316
  doc,
@@ -319,7 +348,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
319
  most_common_color=most_common_color,
320
  most_common_font=most_common_font,
321
  top_margin=top_margin,
322
- bottom_margin=bottom_margin
323
  )
324
 
325
  # Step 1: Collect and filter potential headers
@@ -329,14 +358,15 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
329
  # First extract TOC entries to get exact level 0 header texts
330
  toc_entries = {}
331
  for pno in toc_pages:
332
- page = doc.load_page(pno)
 
333
  toc_text = page.get_text()
334
  for line in toc_text.split('\n'):
335
  clean_line = line.strip()
336
  if clean_line:
337
  norm_line = normalize(clean_line)
338
  toc_entries[norm_line] = clean_line # Store original text
339
-
340
  for h in headers_list:
341
  text, size, pageNum, y = h[:4]
342
  page = doc.load_page(pageNum)
@@ -393,8 +423,9 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
393
  i += 1
394
  # Step 2: Identify level 0 headers (largest and in TOC)
395
  # max_size = max(h['size'] for h in headers) if headers else 0
 
396
  max_size,subheaderSize,nbsheadersize=top_3_font_sizes
397
-
398
  toc_text_match=[]
399
  # Improved TOC matching with exact and substring matching
400
  toc_matches = []
@@ -423,6 +454,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
423
  toc_matches.append(h)
424
  toc_text_match.append(h['text'])
425
  elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
 
426
  headers.remove(h)
427
  continue
428
 
@@ -440,7 +472,8 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
440
  # Update the header text with cleaned version
441
  h['text'] = cleaned_text
442
  unique_level0.append(h)
443
-
 
444
  # Step 3: Process headers under each level 0 to identify level 1 format
445
 
446
  # First, group headers by their level 0 parent
@@ -576,7 +609,8 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
576
 
577
  enforce_nesting(root)
578
  root = [h for h in root if not (h['level'] == 0 and not h['children'])]
579
- return root
 
580
 
581
  def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
582
  def normalize(text):
@@ -613,6 +647,16 @@ def print_tree_with_numbers(headers, indent=0):
613
  f"(Level {header['level']}, p:{header['page']+1}, {size_info})")
614
  print_tree_with_numbers(header["children"], indent + 1)
615
 
 
 
 
 
 
 
 
 
 
 
616
 
617
  def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
618
  for page_num, bbox in highlights.items():
@@ -653,20 +697,6 @@ def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set you
653
  )
654
  annot1.update()
655
 
656
- # def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
657
- # if path is None:
658
- # path = []
659
- # if output is None:
660
- # output = []
661
- # for header in listtoloop:
662
- # current_path = path + [header['text']]
663
- # if not header['children']:
664
- # if header['level'] != 0 and header['level'] != 1:
665
- # output.append((header, current_path))
666
- # else:
667
- # get_leaf_headers_with_paths(header['children'], current_path, output)
668
- # return output
669
-
670
  def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
671
  if path is None:
672
  path = []
@@ -680,7 +710,6 @@ def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
680
  else:
681
  get_leaf_headers_with_paths(header['children'], current_path, output)
682
  return output
683
-
684
  # Add this helper function at the top of your code
685
  def words_match_ratio(text1, text2):
686
  words1 = set(text1.split())
@@ -743,10 +772,8 @@ def extract_section_under_header(multiplePDF_Paths):
743
  )
744
 
745
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
746
- print(hierarchy)
747
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
748
- print(len(listofHeaderstoMarkup))
749
-
750
  # Precompute all children headers once
751
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
752
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -768,12 +795,9 @@ def extract_section_under_header(multiplePDF_Paths):
768
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
769
 
770
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
771
-
772
  heading_to_search = heading_to_searchDict['text']
773
  heading_to_searchPageNum = heading_to_searchDict['page']
774
- if len(heading_to_searchDict['children'])==0:
775
- continue
776
- print(paths,heading_to_search)
777
  # Initialize variables
778
  headertoContinue1 = False
779
  headertoContinue2 = False
@@ -833,13 +857,12 @@ def extract_section_under_header(multiplePDF_Paths):
833
  combined_line_norm = line_text_norm
834
 
835
  # Check if we should continue processing
836
- # if combined_line_norm and combined_line_norm in paths[0]:
837
 
838
- # headertoContinue1 = combined_line_norm
839
- # if combined_line_norm and combined_line_norm in paths[-2]:
840
 
841
- # headertoContinue2 = combined_line_norm
842
- print('paths[-2].lower()',paths[-2].lower())
843
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
844
  stringtowrite='Not to be billed'
845
  else:
@@ -1267,13 +1290,13 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1267
  else:
1268
  combined_line_norm = line_text_norm
1269
 
1270
- # # Check if we should continue processing
1271
- # if combined_line_norm and combined_line_norm in paths[0]:
1272
 
1273
- # headertoContinue1 = combined_line_norm
1274
- # if combined_line_norm and combined_line_norm in paths[-2]:
1275
 
1276
- # headertoContinue2 = combined_line_norm
1277
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1278
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
1279
  stringtowrite='Not to be billed'
@@ -1701,12 +1724,12 @@ def extract_section_under_header_tobebilled2(pdf_path):
1701
  combined_line_norm = line_text_norm
1702
 
1703
  # Check if we should continue processing
1704
- # if combined_line_norm and combined_line_norm in paths[0]:
1705
 
1706
- # headertoContinue1 = combined_line_norm
1707
- # if combined_line_norm and combined_line_norm in paths[-2]:
1708
 
1709
- # headertoContinue2 = combined_line_norm
1710
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1711
  last_path = paths[-2].lower()
1712
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
@@ -2154,12 +2177,12 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2154
  combined_line_norm = line_text_norm
2155
 
2156
  # Check if we should continue processing
2157
- # if combined_line_norm and combined_line_norm in paths[0]:
2158
 
2159
- # headertoContinue1 = combined_line_norm
2160
- # if combined_line_norm and combined_line_norm in paths[-2]:
2161
 
2162
- # headertoContinue2 = combined_line_norm
2163
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2164
  last_path = paths[-2].lower()
2165
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
@@ -2476,4 +2499,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2476
  combined_json_str = json.dumps(jsonCombined, indent=1)
2477
  print(combined_json_str)
2478
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2479
-
 
 
 
 
145
def get_spaced_text_from_spans(spans):
    """Join the stripped text of each span with single spaces and normalize.

    Relies on the module-level ``normalize_text`` helper for the final
    whitespace normalization pass.
    """
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
147
 
 
148
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Decide whether a PDF text span looks like a header.

    A span counts as a header when it is larger than the document's body
    text or is set in a different font family than the body text.

    Parameters
    ----------
    span : dict
        Span dict with at least ``"size"`` and ``"font"`` keys.
    most_common_font_size : float
        The dominant (body-text) font size in the document.
    most_common_color : Any
        Unused; kept so existing call sites keep working.
    most_common_font : str
        The dominant (body-text) font name; compared case-insensitively.

    Returns
    -------
    bool
        True when the span differs from body text in size or font.

    Note: the original expression also OR-ed in
    ``(is_bold and span["size"] > most_common_font_size)``, which is
    subsumed by the first disjunct, so it and the dead ``is_bold`` /
    ``fontname`` locals were removed — behavior is unchanged.
    """
    return (
        span["size"] > most_common_font_size
        or span["font"].lower() != most_common_font.lower()
    )
159
+
160
  def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
161
  for (p, y) in grouped_dict:
162
  if pageNum is not None and p != pageNum:
 
293
 
294
  return headers, top_3_font_sizes, smallest_font_size, spans
295
 
296
+
297
def is_numbered(text):
    """Return True when *text*, after stripping whitespace, starts with a digit."""
    stripped = text.strip()
    return re.match(r'^\d', stripped) is not None
299
 
 
311
  # Remove everything after last sequence of dots/whitespace followed by digits
312
  return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
313
 
314
+
315
+
316
+
317
+
318
def enforce_level_hierarchy(headers):
    """
    Ensure level 2 headers only exist under level 1 headers
    and clean up any orphaned headers.

    Mutates the tree in place and returns it. Children of a removed
    orphan are discarded along with it.
    """
    def prune(nodes, parent_level=-1):
        # Keep a node unless it is a level-2 header whose parent is not
        # level 1; slice-assign so the caller's list object is updated.
        nodes[:] = [n for n in nodes if n['level'] != 2 or parent_level == 1]
        # Descend only into the survivors.
        for child in nodes:
            prune(child['children'], child['level'])

    prune(headers)
    return headers
339
+
340
+
341
+
342
+ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
343
  # Extract headers with margin handling
344
  headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
345
  doc,
 
348
  most_common_color=most_common_color,
349
  most_common_font=most_common_font,
350
  top_margin=top_margin,
351
+ bottom_margin=50
352
  )
353
 
354
  # Step 1: Collect and filter potential headers
 
358
  # First extract TOC entries to get exact level 0 header texts
359
  toc_entries = {}
360
  for pno in toc_pages:
361
+ print(pno)
362
+ page = doc[pno]
363
  toc_text = page.get_text()
364
  for line in toc_text.split('\n'):
365
  clean_line = line.strip()
366
  if clean_line:
367
  norm_line = normalize(clean_line)
368
  toc_entries[norm_line] = clean_line # Store original text
369
+ print(toc_pages)
370
  for h in headers_list:
371
  text, size, pageNum, y = h[:4]
372
  page = doc.load_page(pageNum)
 
423
  i += 1
424
  # Step 2: Identify level 0 headers (largest and in TOC)
425
  # max_size = max(h['size'] for h in headers) if headers else 0
426
+ print(top_3_font_sizes)
427
  max_size,subheaderSize,nbsheadersize=top_3_font_sizes
428
+ print(max_size)
429
  toc_text_match=[]
430
  # Improved TOC matching with exact and substring matching
431
  toc_matches = []
 
454
  toc_matches.append(h)
455
  toc_text_match.append(h['text'])
456
  elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
457
+ print(h['text'],matching_toc_texts)
458
  headers.remove(h)
459
  continue
460
 
 
472
  # Update the header text with cleaned version
473
  h['text'] = cleaned_text
474
  unique_level0.append(h)
475
+ print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")
476
+
477
  # Step 3: Process headers under each level 0 to identify level 1 format
478
 
479
  # First, group headers by their level 0 parent
 
609
 
610
  enforce_nesting(root)
611
  root = [h for h in root if not (h['level'] == 0 and not h['children'])]
612
+ header_tree = enforce_level_hierarchy(root)
613
+ return header_tree
614
 
615
  def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
616
  def normalize(text):
 
647
  f"(Level {header['level']}, p:{header['page']+1}, {size_info})")
648
  print_tree_with_numbers(header["children"], indent + 1)
649
 
650
def process_document_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=50):
    """Build, adjust, number and print the document's header tree.

    Thin orchestration wrapper: builds the hierarchy, re-levels headers
    when the level-0 entries are missing from the TOC, assigns section
    numbers, prints the resulting tree, and returns it.
    """
    print(f"Processing with margins - top:{top_margin}pt, bottom:{bottom_margin}pt")
    tree = build_header_hierarchy(
        doc, toc_pages, most_common_font_size, most_common_color,
        most_common_font, top_margin, bottom_margin,
    )
    adjust_levels_if_level0_not_in_toc(doc, toc_pages, tree)
    print("Assigning numbers...")
    assign_numbers_to_headers(tree)
    print("Document structure (excluding margins):")
    print_tree_with_numbers(tree)
    return tree
659
+
660
 
661
  def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
662
  for page_num, bbox in highlights.items():
 
697
  )
698
  annot1.update()
699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
701
  if path is None:
702
  path = []
 
710
  else:
711
  get_leaf_headers_with_paths(header['children'], current_path, output)
712
  return output
 
713
  # Add this helper function at the top of your code
714
  def words_match_ratio(text1, text2):
715
  words1 = set(text1.split())
 
772
  )
773
 
774
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
 
775
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
776
+
 
777
  # Precompute all children headers once
778
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
779
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
795
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
796
 
797
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
 
798
  heading_to_search = heading_to_searchDict['text']
799
  heading_to_searchPageNum = heading_to_searchDict['page']
800
+
 
 
801
  # Initialize variables
802
  headertoContinue1 = False
803
  headertoContinue2 = False
 
857
  combined_line_norm = line_text_norm
858
 
859
  # Check if we should continue processing
860
+ if combined_line_norm and combined_line_norm in paths[0]:
861
 
862
+ headertoContinue1 = combined_line_norm
863
+ if combined_line_norm and combined_line_norm in paths[-2]:
864
 
865
+ headertoContinue2 = combined_line_norm
 
866
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
867
  stringtowrite='Not to be billed'
868
  else:
 
1290
  else:
1291
  combined_line_norm = line_text_norm
1292
 
1293
+ # Check if we should continue processing
1294
+ if combined_line_norm and combined_line_norm in paths[0]:
1295
 
1296
+ headertoContinue1 = combined_line_norm
1297
+ if combined_line_norm and combined_line_norm in paths[-2]:
1298
 
1299
+ headertoContinue2 = combined_line_norm
1300
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1301
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
1302
  stringtowrite='Not to be billed'
 
1724
  combined_line_norm = line_text_norm
1725
 
1726
  # Check if we should continue processing
1727
+ if combined_line_norm and combined_line_norm in paths[0]:
1728
 
1729
+ headertoContinue1 = combined_line_norm
1730
+ if combined_line_norm and combined_line_norm in paths[-2]:
1731
 
1732
+ headertoContinue2 = combined_line_norm
1733
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1734
  last_path = paths[-2].lower()
1735
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
 
2177
  combined_line_norm = line_text_norm
2178
 
2179
  # Check if we should continue processing
2180
+ if combined_line_norm and combined_line_norm in paths[0]:
2181
 
2182
+ headertoContinue1 = combined_line_norm
2183
+ if combined_line_norm and combined_line_norm in paths[-2]:
2184
 
2185
+ headertoContinue2 = combined_line_norm
2186
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2187
  last_path = paths[-2].lower()
2188
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
 
2499
  combined_json_str = json.dumps(jsonCombined, indent=1)
2500
  print(combined_json_str)
2501
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2502
+
2503
+
2504
+
2505
+