Marthee committed on
Commit
e14330f
·
verified ·
1 Parent(s): fcf7255

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +1 -387
InitialMarkups.py CHANGED
@@ -1468,7 +1468,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1468
 
1469
  pdf_bytes = BytesIO()
1470
  docHighlights.save(pdf_bytes)
1471
- return pdf_bytes.getvalue(), docHighlights , json_output
1472
 
1473
 
1474
 
@@ -1852,389 +1852,3 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1852
 
1853
 
1854
 
1855
########################################################################################################################################################
########################################################################################################################################################

def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incomingheader=0):
    """Locate heading sections in a PDF, highlight them, and build deep links.

    Downloads the PDF at *pdf_path* (Dropbox share links are rewritten from
    ``dl=0`` to ``dl=1`` for direct download), searches for each heading given
    in *headingjson* (either a single heading string, or a list of dicts with
    at least ``'Subject'``, ``'Page'`` and ``'head above 1'`` keys), collects
    the text under each matched heading until the next real header, highlights
    the collected regions, and attaches an ``'NBSLink'`` deep-link URL to each
    matched heading dict.

    Returns:
        tuple: (highlighted PDF as bytes, the fitz document carrying the
        highlight annotations, list of heading dicts augmented with 'NBSLink').

    NOTE(review): relies on module-level names not visible in this chunk
    (requests, fitz, re, pd, urllib, datetime, BytesIO, newlink, and helpers
    get_regular_font_size_and_color, get_spaced_text_from_spans,
    extract_headers, normalize_text, is_header, words_match_ratio,
    same_start_word, highlight_boxes) — verify their contracts elsewhere in
    the file.
    """
    # Lines inside these top/bottom bands are treated as page header/footer
    # furniture and skipped.
    top_margin = 70
    bottom_margin = 50
    # Optimized URL handling
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    # Cache frequently used values
    response = requests.get(pdf_path)
    pdf_content = BytesIO(response.content)
    # NOTE(review): a BytesIO instance is always truthy, so this guard can
    # never trigger — an empty download is not actually detected here.
    if not pdf_content:
        raise ValueError("No valid PDF content found.")

    # Two independent documents: `doc` is only read for text extraction;
    # `docHighlights` receives the annotations and is the one saved/returned.
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
    most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)

    # Precompute regex patterns
    dot_pattern = re.compile(r'\.{3,}')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def get_toc_page_numbers(doc, max_pages_to_check=15):
        # Heuristic TOC detection: a page containing at least three
        # dot-leader lines ("...") is assumed to be a table-of-contents page.
        toc_pages = []
        for page_num in range(min(len(doc), max_pages_to_check)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            dot_line_count = 0
            for block in blocks:
                for line in block.get("lines", []):
                    line_text = get_spaced_text_from_spans(line["spans"]).strip()
                    if dot_pattern.search(line_text):
                        dot_line_count += 1

            if dot_line_count >= 3:
                toc_pages.append(page_num)

        # Every page up to (and including) the last detected TOC page is
        # returned, so the whole front matter gets skipped during the scan.
        return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages

    toc_pages = get_toc_page_numbers(doc)

    headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
        doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
    )

    # Normalise the incoming heading spec into a flat list of heading strings;
    # a bare string is also wrapped so the main loop can treat both forms alike.
    listofheadingsfromrawan=[]
    if type(headingjson) == str:
        listofheadingsfromrawan.append(headingjson)
        headingjson=[headingjson]
    else:
        for item in headingjson:
            listofheadingsfromrawan.append(normalize_text(item['Subject']))
    # print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan)
    # Precompute all children headers once
    allchildrenheaders = listofheadingsfromrawan
    # print('hereeeeeeeeeeeeeee00',allchildrenheaders)
    allchildrenheaders_set = set(allchildrenheaders) # For faster lookups

    # NOTE(review): `df` and `data_list_JSON` are initialised but never used
    # in this function — possibly leftovers from a sibling implementation.
    df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
    data_list_JSON = []

    # Map the largest detected font sizes onto header levels; with only two
    # sizes the sub- and sub-sub-header thresholds collapse to the same value.
    # NOTE(review): if extract_headers yields fewer than two sizes,
    # mainHeaderFontSize is never bound and matching below raises NameError —
    # confirm that cannot happen.
    if len(top_3_font_sizes)==3:
        mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
    elif len(top_3_font_sizes)==2:
        mainHeaderFontSize= top_3_font_sizes[0]
        subHeaderFontSize= top_3_font_sizes[1]
        subsubheaderFontSize= top_3_font_sizes[1]

    # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)

    # Preload all pages to avoid repeated loading
    # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
    newjsonList=[]
    for heading_to_searchDict in headingjson:
        if type(heading_to_searchDict) == str:
            heading_to_search = heading_to_searchDict
            heading_to_searchPageNum = pagenum
        else:
            heading_to_search = heading_to_searchDict['Subject']
            heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1
            incomingheader = heading_to_searchDict['head above 1']

        # print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum)
        # Per-heading scan state machine: `collecting` flips on when the
        # heading is matched, `done`/`break_collecting` end the scan once a
        # terminating header is hit.
        done = False
        collecting = False
        collected_lines = []
        page_highlights = {}
        current_bbox = {}
        last_y1s = {}
        mainHeader = ''
        subHeader = ''
        matched_header_line_norm = heading_to_search
        break_collecting = False
        heading_norm = normalize_text(heading_to_search)

        # Start scanning at the page the heading was reported on.
        for page_num in range(heading_to_searchPageNum,len(doc)):

            if page_num in toc_pages:
                continue
            if break_collecting:
                break
            page=doc[page_num]
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if break_collecting:
                    break

                lines = block.get("lines", [])
                i = 0
                while i < len(lines):
                    if break_collecting:
                        break

                    spans = lines[i].get("spans", [])
                    if not spans:
                        i += 1
                        continue

                    y0 = spans[0]["bbox"][1]
                    y1 = spans[0]["bbox"][3]
                    # Skip lines inside the header/footer margin bands.
                    if y0 < top_margin or y1 > (page_height - bottom_margin):
                        i += 1
                        continue

                    line_text = get_spaced_text_from_spans(spans).lower()
                    line_text_norm = normalize_text(line_text)

                    # Combine with next line if available
                    if i + 1 < len(lines):
                        next_spans = lines[i + 1].get("spans", [])
                        next_line_text = get_spaced_text_from_spans(next_spans).lower()
                        combined_line_norm = normalize_text(line_text + " " + next_line_text)
                    else:
                        combined_line_norm = line_text_norm
                    # Optimized header matching
                    existsfull = (
                        ( combined_line_norm in allchildrenheaders_set or
                        combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
                    )

                    # New word-based matching
                    current_line_words = set(combined_line_norm.split())
                    heading_words = set(heading_norm.split())
                    all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0

                    substring_match = (
                        heading_norm in combined_line_norm or
                        combined_line_norm in heading_norm or
                        all_words_match # Include the new word-based matching
                    )

                    # Exact-match path: the (two-line) combined text is one of
                    # the known headings and contains the searched heading.
                    if (substring_match and existsfull and not collecting and
                        len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):

                        # Check header conditions more efficiently
                        header_spans = [
                            span for span in spans
                            if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                # and span['size'] >= subsubheaderFontSize
                                and span['size'] < mainHeaderFontSize)
                        ]
                        if header_spans:
                            collecting = True
                            matched_header_font_size = max(span["size"] for span in header_spans)
                            # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")

                        collected_lines.append(line_text)
                        valid_spans = [span for span in spans if span.get("bbox")]

                        if valid_spans:
                            x0s = [span["bbox"][0] for span in valid_spans]
                            x1s = [span["bbox"][2] for span in valid_spans]
                            y0s = [span["bbox"][1] for span in valid_spans]
                            y1s = [span["bbox"][3] for span in valid_spans]

                            header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                            # Grow the page's running bounding box to include
                            # this heading line.
                            if page_num in current_bbox:
                                cb = current_bbox[page_num]
                                current_bbox[page_num] = [
                                    min(cb[0], header_bbox[0]),
                                    min(cb[1], header_bbox[1]),
                                    max(cb[2], header_bbox[2]),
                                    max(cb[3], header_bbox[3])
                                ]
                            else:
                                current_bbox[page_num] = header_bbox
                            last_y1s[page_num] = header_bbox[3]
                            x0, y0, x1, y1 = header_bbox

                            # Deep-link target: zoom plus the heading's
                            # top-left corner on the matched page.
                            zoom = 200
                            left = int(x0)
                            top = int(y0)
                            zoom_str = f"{zoom},{left},{top}"
                            pageNumberFound = page_num + 1

                            # Build the query parameters
                            params = {
                                'pdfLink': pdf_path, # Your PDF link
                                'keyword': heading_to_search, # Your keyword (could be a string or list)
                            }

                            # URL encode each parameter
                            encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}

                            # Construct the final encoded link
                            encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                            # Correctly construct the final URL with page and zoom
                            # NOTE(review): `newlink` is a module-level base URL
                            # defined outside this chunk — confirm it ends with
                            # '?' or '&' so the query string concatenates cleanly.
                            final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"

                            # Get current date and time
                            now = datetime.now()

                            # Format the output
                            formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
                            # Optionally, add the URL to a DataFrame
                            new_url= final_url
                            if type(heading_to_searchDict) != str:
                                heading_to_searchDict['NBSLink']=new_url
                                newjsonList.append(heading_to_searchDict)
                        # print("Final URL:", final_url)
                        # Skip the lookahead line that was merged into the match.
                        i += 2
                        continue
                    else:
                        # Fuzzy path: heading not in the known-headings set;
                        # fall back to word-overlap / same-first-word matching.
                        if (substring_match and not collecting and
                            len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):

                            # Calculate word match percentage
                            word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100

                            # Check if at least 70% of header words exist in this line
                            # NOTE(review): the threshold actually enforced is
                            # 100%, not the 70% the comment above suggests.
                            meets_word_threshold = word_match_percent >= 100

                            # Check header conditions (including word threshold)
                            header_spans = [
                                span for span in spans
                                if (is_header(span, most_common_font_size, most_common_color, most_common_font)
                                    # and span['size'] >= subsubheaderFontSize
                                    and span['size'] < mainHeaderFontSize)
                            ]

                            if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
                                collecting = True
                                matched_header_font_size = max(span["size"] for span in header_spans)

                            collected_lines.append(line_text)
                            valid_spans = [span for span in spans if span.get("bbox")]

                            if valid_spans:
                                x0s = [span["bbox"][0] for span in valid_spans]
                                x1s = [span["bbox"][2] for span in valid_spans]
                                y0s = [span["bbox"][1] for span in valid_spans]
                                y1s = [span["bbox"][3] for span in valid_spans]

                                header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                                if page_num in current_bbox:
                                    cb = current_bbox[page_num]
                                    current_bbox[page_num] = [
                                        min(cb[0], header_bbox[0]),
                                        min(cb[1], header_bbox[1]),
                                        max(cb[2], header_bbox[2]),
                                        max(cb[3], header_bbox[3])
                                    ]
                                else:
                                    current_bbox[page_num] = header_bbox

                                last_y1s[page_num] = header_bbox[3]
                                x0, y0, x1, y1 = header_bbox
                                zoom = 200
                                left = int(x0)
                                top = int(y0)
                                zoom_str = f"{zoom},{left},{top}"
                                pageNumberFound = page_num + 1

                                # Build the query parameters
                                params = {
                                    'pdfLink': pdf_path, # Your PDF link
                                    'keyword': heading_to_search, # Your keyword (could be a string or list)
                                }

                                # URL encode each parameter
                                encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}

                                # Construct the final encoded link
                                encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])

                                # Correctly construct the final URL with page and zoom
                                final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
                                new_url= final_url
                                if type(heading_to_searchDict) != str:
                                    heading_to_searchDict['NBSLink']=new_url
                                    newjsonList.append(heading_to_searchDict)
                            i += 2
                            continue
                    # Body collection: once a heading matched, accumulate lines
                    # until the next real header with equal-or-larger font size.
                    if collecting:
                        norm_line = normalize_text(line_text)

                        # Optimized URL check
                        if url_pattern.match(norm_line):
                            line_is_header = False
                        else:
                            line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)

                        if line_is_header:
                            header_font_size = max(span["size"] for span in spans)
                            is_probably_real_header = (
                                header_font_size >= matched_header_font_size and
                                is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
                                len(line_text.strip()) > 2
                            )

                            if (norm_line != matched_header_line_norm and
                                norm_line != heading_norm and
                                is_probably_real_header):
                                if line_text not in heading_norm:
                                    # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
                                    collecting = False
                                    done = True
                                    headertoContinue1 = False
                                    headertoContinue2=False
                                    # Extend each page's box down to the last
                                    # collected line before highlighting.
                                    for page_num, bbox in current_bbox.items():
                                        bbox[3] = last_y1s.get(page_num, bbox[3])
                                        page_highlights[page_num] = bbox

                                    # Label depends on the parent heading the
                                    # section sits under.
                                    if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader :
                                        stringtowrite='Not to be billed'
                                    else:
                                        stringtowrite='To be billed'
                                    highlight_boxes(docHighlights, page_highlights,stringtowrite)

                                    break_collecting = True
                                    break

                        if break_collecting:
                            break

                        collected_lines.append(line_text)
                        valid_spans = [span for span in spans if span.get("bbox")]
                        if valid_spans:
                            x0s = [span["bbox"][0] for span in valid_spans]
                            x1s = [span["bbox"][2] for span in valid_spans]
                            y0s = [span["bbox"][1] for span in valid_spans]
                            y1s = [span["bbox"][3] for span in valid_spans]

                            line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

                            if page_num in current_bbox:
                                cb = current_bbox[page_num]
                                current_bbox[page_num] = [
                                    min(cb[0], line_bbox[0]),
                                    min(cb[1], line_bbox[1]),
                                    max(cb[2], line_bbox[2]),
                                    max(cb[3], line_bbox[3])
                                ]
                            else:
                                current_bbox[page_num] = line_bbox

                            last_y1s[page_num] = line_bbox[3]
                    i += 1

        # Reached end of document without hitting a terminating header:
        # highlight whatever was collected for this heading.
        if not done:
            for page_num, bbox in current_bbox.items():
                bbox[3] = last_y1s.get(page_num, bbox[3])
                page_highlights[page_num] = bbox
            if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader :
                stringtowrite='Not to be billed'
            else:
                stringtowrite='To be billed'
            highlight_boxes(docHighlights, page_highlights,stringtowrite)

    # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)

    pdf_bytes = BytesIO()
    docHighlights.save(pdf_bytes)
    return pdf_bytes.getvalue(), docHighlights , newjsonList
 
1468
 
1469
  pdf_bytes = BytesIO()
1470
  docHighlights.save(pdf_bytes)
1471
+ return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled
1472
 
1473
 
1474
 
 
1852
 
1853
 
1854