Marthee committed on
Commit
06527d8
·
verified ·
1 Parent(s): 166e454

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +120 -0
InitialMarkups.py CHANGED
@@ -1044,6 +1044,126 @@ def extract_section_under_header(pdf_path):
1044
 
1045
 
1046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
  def extract_section_under_header_tobebilledOnly(pdf_path):
1048
  Alltexttobebilled=''
1049
  alltextWithoutNotbilled=''
 
1044
 
1045
 
1046
 
1047
def extract_section_under_header_withoutNot(pdf_path):
    """Collect normalized body text from a PDF, keeping only lines whose leaf
    header path falls under a "Not to be billed" section.

    NOTE(review): despite the variable name, text is appended when the section
    is classified as 'Not to be billed' (path contains 'installation',
    'execution', or 'miscellaneous items') — confirm this inversion is intended.

    Parameters:
        pdf_path: URL (http/Dropbox links supported) of the PDF to download
            and parse.

    Returns:
        str: concatenation of the normalized line texts collected.

    Raises:
        ValueError: kept from the original code, though the `BytesIO` check
            can never trigger (a BytesIO instance is always truthy).

    Depends on module-level helpers defined elsewhere in this file:
    get_regular_font_size_and_color, get_spaced_text_from_spans,
    normalize_text, build_header_hierarchy, get_leaf_headers_with_paths.
    """
    Alltexttobebilled = ''            # NOTE(review): never used — kept for parity with sibling function
    alltextWithoutNotbilled = ''
    top_margin = 70                   # ignore text above this y-coordinate (header zone)
    bottom_margin = 50                # ignore text below page_height - bottom_margin (footer zone)
    headertoContinue1 = False         # write-only in this function; mirrors sibling implementations
    headertoContinue2 = False

    parsed_url = urlparse(pdf_path)
    filename = os.path.basename(parsed_url.path)
    filename = unquote(filename)  # decode URL-encoded characters

    # Dropbox share links need dl=1 to return the raw file instead of a preview page.
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    response = requests.get(pdf_path)
    pdf_content = BytesIO(response.content)
    if not pdf_content:  # NOTE(review): dead check — BytesIO is always truthy
        raise ValueError("No valid PDF content found.")

    doc = fitz.open(stream=pdf_content, filetype="pdf")
    docHighlights = fitz.open(stream=pdf_content, filetype="pdf")  # NOTE(review): opened but unused here
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    # Precompiled patterns: dot leaders identify table-of-contents lines.
    dot_pattern = re.compile(r'\.{3,}')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')  # NOTE(review): unused in this function

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        """Return page indices 0..last page that looks like a TOC (>=3 dot-leader lines)."""
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        # Treat every page up to (and including) the last TOC-like page as TOC.
        return list(range(0, toc_pages[-1] + 1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
    listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)

    for heading_to_searchDict, paths in listofHeaderstoMarkup:
        heading_to_search = heading_to_searchDict['text']
        heading_to_searchPageNum = heading_to_searchDict['page']
        break_collecting = False  # NOTE(review): never set True — the page/block early-breaks are dead code
        for page_num in range(heading_to_searchPageNum, len(doc)):
            if page_num in toc_pages:
                continue
            if break_collecting:
                break
            page = doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if break_collecting:
                    break

                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    if break_collecting:
                        break

                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    # Skip lines inside the running header/footer margins.
                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Combine with next line if available (headers can wrap across lines).
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm

                    # Track which part of the header path this line matched (write-only).
                    if combined_line_norm and combined_line_norm in paths[0]:
                        headertoContinue1 = combined_line_norm
                    if combined_line_norm and combined_line_norm in paths[-2]:
                        headertoContinue2 = combined_line_norm

                    # Classify the section by its parent header in the path.
                    if ('installation' in paths[-2].lower()
                            or 'execution' in paths[-2].lower()
                            or 'miscellaneous items' in paths[-2].lower()):
                        stringtowrite = 'Not to be billed'
                    else:
                        stringtowrite = 'To be billed'
                    if stringtowrite != 'To be billed':
                        alltextWithoutNotbilled += combined_line_norm

                    # BUG FIX: the original never advanced `i` after processing a
                    # line that passed the margin checks, so the while-loop spun
                    # forever on the first content line (break_collecting is never
                    # set True, so nothing else could exit the loop).
                    i += 1

    return alltextWithoutNotbilled


##############################################################3
1167
  def extract_section_under_header_tobebilledOnly(pdf_path):
1168
  Alltexttobebilled=''
1169
  alltextWithoutNotbilled=''