InitialMarkups

Runtime error

App Files Community

Marthee committed on Jul 21, 2025

Commit

dca685a

verified ·

1 Parent(s): 3d03a5a

Update InitialMarkups.py

Browse files

Files changed (1) hide show

InitialMarkups.py +16 -8

InitialMarkups.py CHANGED Viewed

@@ -1570,13 +1570,14 @@ def extract_section_under_header_tobebilled2(pdf_path):
         break_collecting = False
         heading_norm = normalize_text(heading_to_search)
         paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
         for page_num in range(heading_to_searchPageNum,len(doc)):
             print(heading_to_search)
             if paths[0].strip().lower() != currentgroupname.strip().lower():
-                Alltexttobebilled+=' \n'+ paths[0] +'\n'
                 currentgroupname=paths[0]
                 print(paths[0])
             if page_num in toc_pages:
               continue
             if break_collecting:
@@ -1628,9 +1629,11 @@ def extract_section_under_header_tobebilled2(pdf_path):
                       stringtowrite='Not to be billed'
                     else:
                       stringtowrite='To be billed'
-                    if stringtowrite!='To be billed':
-                        Alltexttobebilled+= combined_line_norm #################################################
                     # Optimized header matching
                     existsfull = (
                         ( combined_line_norm in allchildrenheaders_set or
@@ -1664,7 +1667,8 @@ def extract_section_under_header_tobebilled2(pdf_path):
                         ]
                         if header_spans:
                             collecting = True
-                            Alltexttobebilled+= ' '+ combined_line_norm
                             matched_header_font_size = max(span["size"] for span in header_spans)
                             collected_lines.append(line_text)
@@ -1759,7 +1763,10 @@ def extract_section_under_header_tobebilled2(pdf_path):
                             if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
                                 collecting = True
-                                Alltexttobebilled+= ' '+ combined_line_norm
                                 matched_header_font_size = max(span["size"] for span in header_spans)
                                 collected_lines.append(line_text)
@@ -1910,6 +1917,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
     pdf_bytes = BytesIO()
     docHighlights.save(pdf_bytes)
-    return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled

         break_collecting = False
         heading_norm = normalize_text(heading_to_search)
         paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
         for page_num in range(heading_to_searchPageNum,len(doc)):
             print(heading_to_search)
             if paths[0].strip().lower() != currentgroupname.strip().lower():
+                Alltexttobebilled+= paths[0] +'\n'
                 currentgroupname=paths[0]
                 print(paths[0])
             if page_num in toc_pages:
               continue
             if break_collecting:
                       stringtowrite='Not to be billed'
                     else:
                       stringtowrite='To be billed'
+                    if stringtowrite=='To be billed':
+                        # Alltexttobebilled+= combined_line_norm  #################################################
+                        if matched_header_line_norm in combined_line_norm:
+                            Alltexttobebilled+='\n'
+                        Alltexttobebilled+= ' '+combined_line_norm
                     # Optimized header matching
                     existsfull = (
                         ( combined_line_norm in allchildrenheaders_set or
                         ]
                         if header_spans:
                             collecting = True
+                            # if stringtowrite=='To be billed':
+                            #     Alltexttobebilled+='\n'
                             matched_header_font_size = max(span["size"] for span in header_spans)
                             collected_lines.append(line_text)
                             if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
                                 collecting = True
+                                if stringtowrite=='To be billed':
+                                    Alltexttobebilled+='\n'
+                                # if stringtowrite=='To be billed':
+                                #     Alltexttobebilled+= ' '+ combined_line_norm
                                 matched_header_font_size = max(span["size"] for span in header_spans)
                                 collected_lines.append(line_text)
     pdf_bytes = BytesIO()
     docHighlights.save(pdf_bytes)
+    return pdf_bytes.getvalue(), docHighlights , json_output, Alltexttobebilled