InitialMarkups2

Sleeping

App Files Community

Marthee committed on Jul 8, 2025

Commit

fcf7255

verified ·

1 Parent(s): 2aa8c4b

Update InitialMarkups.py

Browse files

Files changed (1) hide show

InitialMarkups.py +17 -11

InitialMarkups.py CHANGED Viewed

@@ -1044,7 +1044,7 @@ def extract_section_under_header(pdf_path):
 def extract_section_under_header_tobebilledOnly(pdf_path):
-    Alltext_Tobebilled=''
     top_margin = 70
     bottom_margin = 50
     headertoContinue1 = False
@@ -1098,6 +1098,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
     hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
     listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
     # Precompute all children headers once
     allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
     allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups
@@ -1113,6 +1114,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
         subHeaderFontSize= top_3_font_sizes[1]
         subsubheaderFontSize= top_3_font_sizes[1]
     # Preload all pages to avoid repeated loading
     # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -1120,7 +1123,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
         heading_to_search = heading_to_searchDict['text']
         heading_to_searchPageNum = heading_to_searchDict['page']
         # Initialize variables
         headertoContinue1 = False
         headertoContinue2 = False
@@ -1181,8 +1183,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                     # Check if we should continue processing
                     if combined_line_norm and combined_line_norm in paths[0]:
                         headertoContinue1 = combined_line_norm
                     if combined_line_norm and combined_line_norm in paths[-2]:
                         headertoContinue2 = combined_line_norm
                     if  'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
                       stringtowrite='Not to be billed'
@@ -1220,13 +1224,12 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                 and span['size'] < mainHeaderFontSize)
                         ]
                         if header_spans and stringtowrite.startswith('To'):
-                            Alltext_Tobebilled+=combined_line_norm
                             collecting = True
                             matched_header_font_size = max(span["size"] for span in header_spans)
                             collected_lines.append(line_text)
                             valid_spans = [span for span in spans if span.get("bbox")]
                             if valid_spans:
                                 x0s = [span["bbox"][0] for span in valid_spans]
                                 x1s = [span["bbox"][2] for span in valid_spans]
@@ -1267,7 +1270,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                 encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
                                 # Correctly construct the final URL with page and zoom
-                                final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                 # Get current date and time
                                 now = datetime.now()
@@ -1314,11 +1317,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                     and span['size'] < mainHeaderFontSize)
                             ]
-                            if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
-                                Alltext_Tobebilled+=combined_line_norm
                                 collecting = True
                                 matched_header_font_size = max(span["size"] for span in header_spans)
                                 collected_lines.append(line_text)
                                 valid_spans = [span for span in spans if span.get("bbox")]
@@ -1362,7 +1364,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                     encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
                                     # Correctly construct the final URL with page and zoom
-                                    final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                     # Get current date and time
                                     now = datetime.now()
@@ -1389,6 +1391,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
                                     # Convert list to JSON
                                     json_output = json.dumps(data_list_JSON, indent=4)
                                     i += 2
                                     continue
                     if collecting:
@@ -1465,7 +1468,10 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
     pdf_bytes = BytesIO()
     docHighlights.save(pdf_bytes)
-    return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled

 def extract_section_under_header_tobebilledOnly(pdf_path):
+    Alltexttobebilled=''
     top_margin = 70
     bottom_margin = 50
     headertoContinue1 = False
     hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
     listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
     # Precompute all children headers once
     allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
     allchildrenheaders_set = set(allchildrenheaders)  # For faster lookups
         subHeaderFontSize= top_3_font_sizes[1]
         subsubheaderFontSize= top_3_font_sizes[1]
     # Preload all pages to avoid repeated loading
     # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
         heading_to_search = heading_to_searchDict['text']
         heading_to_searchPageNum = heading_to_searchDict['page']
         # Initialize variables
         headertoContinue1 = False
         headertoContinue2 = False
                     # Check if we should continue processing
                     if combined_line_norm and combined_line_norm in paths[0]:
                         headertoContinue1 = combined_line_norm
                     if combined_line_norm and combined_line_norm in paths[-2]:
                         headertoContinue2 = combined_line_norm
                     if  'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
                       stringtowrite='Not to be billed'
                                 and span['size'] < mainHeaderFontSize)
                         ]
                         if header_spans and stringtowrite.startswith('To'):
                             collecting = True
                             matched_header_font_size = max(span["size"] for span in header_spans)
+                            Alltexttobebilled+= ' '+ combined_line_norm
                             collected_lines.append(line_text)
                             valid_spans = [span for span in spans if span.get("bbox")]
                             if valid_spans:
                                 x0s = [span["bbox"][0] for span in valid_spans]
                                 x1s = [span["bbox"][2] for span in valid_spans]
                                 encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
                                 # Correctly construct the final URL with page and zoom
+                                final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                 # Get current date and time
                                 now = datetime.now()
                                     and span['size'] < mainHeaderFontSize)
                             ]
+                            if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
                                 collecting = True
                                 matched_header_font_size = max(span["size"] for span in header_spans)
+                                Alltexttobebilled+= ' '+ combined_line_norm
                                 collected_lines.append(line_text)
                                 valid_spans = [span for span in spans if span.get("bbox")]
                                     encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
                                     # Correctly construct the final URL with page and zoom
+                                    final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                     # Get current date and time
                                     now = datetime.now()
                                     # Convert list to JSON
                                     json_output = json.dumps(data_list_JSON, indent=4)
                                     i += 2
                                     continue
                     if collecting:
     pdf_bytes = BytesIO()
     docHighlights.save(pdf_bytes)
+    return pdf_bytes.getvalue(), docHighlights , json_output