Marthee committed on
Commit
2aa8c4b
·
verified ·
1 Parent(s): acaca02

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +0 -869
InitialMarkups.py CHANGED
@@ -1844,875 +1844,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1844
  docHighlights.save(pdf_bytes)
1845
  return pdf_bytes.getvalue(), docHighlights , newjsonList
1846
 
1847
-
1848
-
1849
-
1850
- # top_margin = 70
1851
- # bottom_margin = 50
1852
- # headertoContinue1 = False
1853
- # headertoContinue2=False
1854
-
1855
- # parsed_url = urlparse(pdf_path)
1856
- # filename = os.path.basename(parsed_url.path)
1857
- # filename = unquote(filename) # decode URL-encoded characters
1858
-
1859
- # # Optimized URL handling
1860
- # if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
1861
- # pdf_path = pdf_path.replace('dl=0', 'dl=1')
1862
-
1863
- # # Cache frequently used values
1864
- # response = requests.get(pdf_path)
1865
- # pdf_content = BytesIO(response.content)
1866
- # if not pdf_content:
1867
- # raise ValueError("No valid PDF content found.")
1868
-
1869
- # doc = fitz.open(stream=pdf_content, filetype="pdf")
1870
- # docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
1871
- # most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1872
-
1873
- # # Precompute regex patterns
1874
- # dot_pattern = re.compile(r'\.{3,}')
1875
- # url_pattern = re.compile(r'https?://\S+|www\.\S+')
1876
-
1877
- # def get_toc_page_numbers(doc, max_pages_to_check=15):
1878
- # toc_pages = []
1879
- # for page_num in range(min(len(doc), max_pages_to_check)):
1880
- # page = doc.load_page(page_num)
1881
- # blocks = page.get_text("dict")["blocks"]
1882
-
1883
- # dot_line_count = 0
1884
- # for block in blocks:
1885
- # for line in block.get("lines", []):
1886
- # line_text = get_spaced_text_from_spans(line["spans"]).strip()
1887
- # if dot_pattern.search(line_text):
1888
- # dot_line_count += 1
1889
-
1890
- # if dot_line_count >= 3:
1891
- # toc_pages.append(page_num)
1892
-
1893
- # return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1894
-
1895
- # toc_pages = get_toc_page_numbers(doc)
1896
-
1897
- # headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
1898
- # doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1899
- # )
1900
-
1901
- # hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1902
- # listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1903
- # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1904
- # # Precompute all children headers once
1905
- # allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1906
- # allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1907
-
1908
- # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1909
- # dictionaryNBS={}
1910
- # data_list_JSON = []
1911
-
1912
- # if len(top_3_font_sizes)==3:
1913
- # mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
1914
- # elif len(top_3_font_sizes)==2:
1915
- # mainHeaderFontSize= top_3_font_sizes[0]
1916
- # subHeaderFontSize= top_3_font_sizes[1]
1917
- # subsubheaderFontSize= top_3_font_sizes[1]
1918
-
1919
- # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1920
-
1921
- # # Preload all pages to avoid repeated loading
1922
- # # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1923
-
1924
- # for heading_to_searchDict, paths in listofHeaderstoMarkup:
1925
- # heading_to_search = heading_to_searchDict['text']
1926
- # heading_to_searchPageNum = heading_to_searchDict['page']
1927
-
1928
- # print('headertosearch', heading_to_search)
1929
-
1930
- # # Initialize variables
1931
- # headertoContinue1 = False
1932
- # headertoContinue2 = False
1933
- # matched_header_line = None
1934
- # done = False
1935
- # collecting = False
1936
- # collected_lines = []
1937
- # page_highlights = {}
1938
- # current_bbox = {}
1939
- # last_y1s = {}
1940
- # mainHeader = ''
1941
- # subHeader = ''
1942
- # matched_header_line_norm = heading_to_search
1943
- # break_collecting = False
1944
- # heading_norm = normalize_text(heading_to_search)
1945
- # paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1946
-
1947
- # for page_num in range(heading_to_searchPageNum,len(doc)):
1948
- # if page_num in toc_pages:
1949
- # continue
1950
- # if break_collecting:
1951
- # break
1952
- # page=doc[page_num]
1953
- # page_height = page.rect.height
1954
- # blocks = page.get_text("dict")["blocks"]
1955
-
1956
- # for block in blocks:
1957
- # if break_collecting:
1958
- # break
1959
-
1960
- # lines = block.get("lines", [])
1961
- # i = 0
1962
- # while i < len(lines):
1963
- # if break_collecting:
1964
- # break
1965
-
1966
- # spans = lines[i].get("spans", [])
1967
- # if not spans:
1968
- # i += 1
1969
- # continue
1970
-
1971
- # y0 = spans[0]["bbox"][1]
1972
- # y1 = spans[0]["bbox"][3]
1973
- # if y0 < top_margin or y1 > (page_height - bottom_margin):
1974
- # i += 1
1975
- # continue
1976
-
1977
- # line_text = get_spaced_text_from_spans(spans).lower()
1978
- # line_text_norm = normalize_text(line_text)
1979
-
1980
- # # Combine with next line if available
1981
- # if i + 1 < len(lines):
1982
- # next_spans = lines[i + 1].get("spans", [])
1983
- # next_line_text = get_spaced_text_from_spans(next_spans).lower()
1984
- # combined_line_norm = normalize_text(line_text + " " + next_line_text)
1985
- # else:
1986
- # combined_line_norm = line_text_norm
1987
-
1988
- # # Check if we should continue processing
1989
- # if combined_line_norm and combined_line_norm in paths[0]:
1990
- # print(combined_line_norm)
1991
- # headertoContinue1 = combined_line_norm
1992
- # if combined_line_norm and combined_line_norm in paths[-2]:
1993
- # print(combined_line_norm)
1994
- # headertoContinue2 = combined_line_norm
1995
- # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1996
- # stringtowrite='Not to be billed'
1997
- # else:
1998
- # stringtowrite='To be billed'
1999
- # # Optimized header matching
2000
- # existsfull = (
2001
- # ( combined_line_norm in allchildrenheaders_set or
2002
- # combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2003
- # )
2004
-
2005
- # # New word-based matching
2006
- # current_line_words = set(combined_line_norm.split())
2007
- # heading_words = set(heading_norm.split())
2008
- # all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2009
-
2010
- # substring_match = (
2011
- # heading_norm in combined_line_norm or
2012
- # combined_line_norm in heading_norm or
2013
- # all_words_match # Include the new word-based matching
2014
- # )
2015
- # # substring_match = (
2016
- # # heading_norm in combined_line_norm or
2017
- # # combined_line_norm in heading_norm
2018
- # # )
2019
-
2020
- # if (substring_match and existsfull and not collecting and
2021
- # len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2022
-
2023
- # # Check header conditions more efficiently
2024
- # header_spans = [
2025
- # span for span in spans
2026
- # if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2027
- # # and span['size'] >= subsubheaderFontSize
2028
- # and span['size'] < mainHeaderFontSize)
2029
- # ]
2030
- # if header_spans:
2031
- # collecting = True
2032
- # matched_header_font_size = max(span["size"] for span in header_spans)
2033
- # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2034
-
2035
- # collected_lines.append(line_text)
2036
- # valid_spans = [span for span in spans if span.get("bbox")]
2037
-
2038
- # if valid_spans:
2039
- # x0s = [span["bbox"][0] for span in valid_spans]
2040
- # x1s = [span["bbox"][2] for span in valid_spans]
2041
- # y0s = [span["bbox"][1] for span in valid_spans]
2042
- # y1s = [span["bbox"][3] for span in valid_spans]
2043
-
2044
- # header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2045
-
2046
- # if page_num in current_bbox:
2047
- # cb = current_bbox[page_num]
2048
- # current_bbox[page_num] = [
2049
- # min(cb[0], header_bbox[0]),
2050
- # min(cb[1], header_bbox[1]),
2051
- # max(cb[2], header_bbox[2]),
2052
- # max(cb[3], header_bbox[3])
2053
- # ]
2054
- # else:
2055
- # current_bbox[page_num] = header_bbox
2056
- # last_y1s[page_num] = header_bbox[3]
2057
- # x0, y0, x1, y1 = header_bbox
2058
-
2059
- # zoom = 200
2060
- # left = int(x0)
2061
- # top = int(y0)
2062
- # zoom_str = f"{zoom},{left},{top}"
2063
- # pageNumberFound = page_num + 1
2064
-
2065
- # # Build the query parameters
2066
- # params = {
2067
- # 'pdfLink': pdf_path, # Your PDF link
2068
- # 'keyword': heading_to_search, # Your keyword (could be a string or list)
2069
- # }
2070
-
2071
- # # URL encode each parameter
2072
- # encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2073
-
2074
- # # Construct the final encoded link
2075
- # encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2076
-
2077
- # # Correctly construct the final URL with page and zoom
2078
- # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2079
-
2080
- # # Get current date and time
2081
- # now = datetime.now()
2082
-
2083
- # # Format the output
2084
- # formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2085
- # # Optionally, add the URL to a DataFrame
2086
-
2087
-
2088
- # data_entry = {
2089
- # "NBSLink": final_url,
2090
- # "Subject": heading_to_search,
2091
- # "Page": str(pageNumberFound),
2092
- # "Author": "ADR",
2093
- # "Creation Date": formatted_time,
2094
- # "Layer": "Initial",
2095
- # "Code": stringtowrite,
2096
- # "head above 1": paths[-2],
2097
- # "head above 2": paths[0],
2098
- # "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2099
- # }
2100
- # data_list_JSON.append(data_entry)
2101
-
2102
- # # Convert list to JSON
2103
- # json_output = json.dumps(data_list_JSON, indent=4)
2104
-
2105
- # print("Final URL:", final_url)
2106
- # i += 2
2107
- # continue
2108
- # else:
2109
- # if (substring_match and not collecting and
2110
- # len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2111
-
2112
- # # Calculate word match percentage
2113
- # word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2114
-
2115
- # # Check if at least 70% of header words exist in this line
2116
- # meets_word_threshold = word_match_percent >= 100
2117
-
2118
- # # Check header conditions (including word threshold)
2119
- # header_spans = [
2120
- # span for span in spans
2121
- # if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2122
- # # and span['size'] >= subsubheaderFontSize
2123
- # and span['size'] < mainHeaderFontSize)
2124
- # ]
2125
-
2126
- # if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
2127
- # collecting = True
2128
- # matched_header_font_size = max(span["size"] for span in header_spans)
2129
- # print(f"📥 Start collecting after header: {combined_line_norm} "
2130
- # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2131
-
2132
- # collected_lines.append(line_text)
2133
- # valid_spans = [span for span in spans if span.get("bbox")]
2134
-
2135
- # if valid_spans:
2136
- # x0s = [span["bbox"][0] for span in valid_spans]
2137
- # x1s = [span["bbox"][2] for span in valid_spans]
2138
- # y0s = [span["bbox"][1] for span in valid_spans]
2139
- # y1s = [span["bbox"][3] for span in valid_spans]
2140
-
2141
- # header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2142
-
2143
- # if page_num in current_bbox:
2144
- # cb = current_bbox[page_num]
2145
- # current_bbox[page_num] = [
2146
- # min(cb[0], header_bbox[0]),
2147
- # min(cb[1], header_bbox[1]),
2148
- # max(cb[2], header_bbox[2]),
2149
- # max(cb[3], header_bbox[3])
2150
- # ]
2151
- # else:
2152
- # current_bbox[page_num] = header_bbox
2153
-
2154
- # last_y1s[page_num] = header_bbox[3]
2155
- # x0, y0, x1, y1 = header_bbox
2156
- # zoom = 200
2157
- # left = int(x0)
2158
- # top = int(y0)
2159
- # zoom_str = f"{zoom},{left},{top}"
2160
- # pageNumberFound = page_num + 1
2161
-
2162
- # # Build the query parameters
2163
- # params = {
2164
- # 'pdfLink': pdf_path, # Your PDF link
2165
- # 'keyword': heading_to_search, # Your keyword (could be a string or list)
2166
- # }
2167
-
2168
- # # URL encode each parameter
2169
- # encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2170
-
2171
- # # Construct the final encoded link
2172
- # encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2173
-
2174
- # # Correctly construct the final URL with page and zoom
2175
- # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2176
-
2177
- # # Get current date and time
2178
- # now = datetime.now()
2179
-
2180
- # # Format the output
2181
- # formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2182
- # # Optionally, add the URL to a DataFrame
2183
-
2184
-
2185
- # data_entry = {
2186
- # "NBSLink": final_url,
2187
- # "Subject": heading_to_search,
2188
- # "Page": str(pageNumberFound),
2189
- # "Author": "ADR",
2190
- # "Creation Date": formatted_time,
2191
- # "Layer": "Initial",
2192
- # "Code": stringtowrite,
2193
- # "head above 1": paths[-2],
2194
- # "head above 2": paths[0],
2195
- # "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2196
- # }
2197
- # data_list_JSON.append(data_entry)
2198
-
2199
- # # Convert list to JSON
2200
- # json_output = json.dumps(data_list_JSON, indent=4)
2201
-
2202
- # print("Final URL:", final_url)
2203
- # i += 2
2204
- # continue
2205
- # if collecting:
2206
- # norm_line = normalize_text(line_text)
2207
-
2208
- # # Optimized URL check
2209
- # if url_pattern.match(norm_line):
2210
- # line_is_header = False
2211
- # else:
2212
- # line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2213
-
2214
- # if line_is_header:
2215
- # header_font_size = max(span["size"] for span in spans)
2216
- # is_probably_real_header = (
2217
- # header_font_size >= matched_header_font_size and
2218
- # is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2219
- # len(line_text.strip()) > 2
2220
- # )
2221
-
2222
- # if (norm_line != matched_header_line_norm and
2223
- # norm_line != heading_norm and
2224
- # is_probably_real_header):
2225
- # if line_text not in heading_norm:
2226
- # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2227
- # collecting = False
2228
- # done = True
2229
- # headertoContinue1 = False
2230
- # headertoContinue2=False
2231
- # for page_num, bbox in current_bbox.items():
2232
- # bbox[3] = last_y1s.get(page_num, bbox[3])
2233
- # page_highlights[page_num] = bbox
2234
- # highlight_boxes(docHighlights, page_highlights,stringtowrite)
2235
-
2236
- # break_collecting = True
2237
- # break
2238
-
2239
- # if break_collecting:
2240
- # break
2241
-
2242
- # collected_lines.append(line_text)
2243
- # valid_spans = [span for span in spans if span.get("bbox")]
2244
- # if valid_spans:
2245
- # x0s = [span["bbox"][0] for span in valid_spans]
2246
- # x1s = [span["bbox"][2] for span in valid_spans]
2247
- # y0s = [span["bbox"][1] for span in valid_spans]
2248
- # y1s = [span["bbox"][3] for span in valid_spans]
2249
-
2250
- # line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2251
-
2252
- # if page_num in current_bbox:
2253
- # cb = current_bbox[page_num]
2254
- # current_bbox[page_num] = [
2255
- # min(cb[0], line_bbox[0]),
2256
- # min(cb[1], line_bbox[1]),
2257
- # max(cb[2], line_bbox[2]),
2258
- # max(cb[3], line_bbox[3])
2259
- # ]
2260
- # else:
2261
- # current_bbox[page_num] = line_bbox
2262
-
2263
- # last_y1s[page_num] = line_bbox[3]
2264
- # i += 1
2265
-
2266
- # if not done:
2267
- # for page_num, bbox in current_bbox.items():
2268
- # bbox[3] = last_y1s.get(page_num, bbox[3])
2269
- # page_highlights[page_num] = bbox
2270
- # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2271
- # stringtowrite='Not to be billed'
2272
- # else:
2273
- # stringtowrite='To be billed'
2274
- # highlight_boxes(docHighlights, page_highlights,stringtowrite)
2275
-
2276
- # # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2277
-
2278
- # pdf_bytes = BytesIO()
2279
- # docHighlights.save(pdf_bytes)
2280
- # print('JSONN',json_output)
2281
- # return pdf_bytes.getvalue(), docHighlights , json_output
2282
-
2283
-
2284
-
2285
-
2286
- ########################################################################################################################################################
2287
- ########################################################################################################################################################
2288
-
2289
-
2290
- def extract_section_under_header_tobebilledOnly(pdf_path):
2291
- Alltext_tobebilled=''
2292
- top_margin = 70
2293
- bottom_margin = 50
2294
- headertoContinue1 = False
2295
- headertoContinue2=False
2296
-
2297
- parsed_url = urlparse(pdf_path)
2298
- filename = os.path.basename(parsed_url.path)
2299
- filename = unquote(filename) # decode URL-encoded characters
2300
-
2301
- # Optimized URL handling
2302
- if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2303
- pdf_path = pdf_path.replace('dl=0', 'dl=1')
2304
-
2305
- # Cache frequently used values
2306
- response = requests.get(pdf_path)
2307
- pdf_content = BytesIO(response.content)
2308
- if not pdf_content:
2309
- raise ValueError("No valid PDF content found.")
2310
-
2311
- doc = fitz.open(stream=pdf_content, filetype="pdf")
2312
- docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2313
- most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2314
-
2315
- # Precompute regex patterns
2316
- dot_pattern = re.compile(r'\.{3,}')
2317
- url_pattern = re.compile(r'https?://\S+|www\.\S+')
2318
-
2319
- def get_toc_page_numbers(doc, max_pages_to_check=15):
2320
- toc_pages = []
2321
- for page_num in range(min(len(doc), max_pages_to_check)):
2322
- page = doc.load_page(page_num)
2323
- blocks = page.get_text("dict")["blocks"]
2324
-
2325
- dot_line_count = 0
2326
- for block in blocks:
2327
- for line in block.get("lines", []):
2328
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
2329
- if dot_pattern.search(line_text):
2330
- dot_line_count += 1
2331
-
2332
- if dot_line_count >= 3:
2333
- toc_pages.append(page_num)
2334
-
2335
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
2336
-
2337
- toc_pages = get_toc_page_numbers(doc)
2338
-
2339
- headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2340
- doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2341
- )
2342
-
2343
- hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2344
- listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2345
- # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
2346
- # Precompute all children headers once
2347
- allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2348
- allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2349
-
2350
- df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
2351
- dictionaryNBS={}
2352
- data_list_JSON = []
2353
-
2354
- if len(top_3_font_sizes)==3:
2355
- mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
2356
- elif len(top_3_font_sizes)==2:
2357
- mainHeaderFontSize= top_3_font_sizes[0]
2358
- subHeaderFontSize= top_3_font_sizes[1]
2359
- subsubheaderFontSize= top_3_font_sizes[1]
2360
-
2361
-
2362
- # Preload all pages to avoid repeated loading
2363
- # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2364
-
2365
- for heading_to_searchDict, paths in listofHeaderstoMarkup:
2366
- heading_to_search = heading_to_searchDict['text']
2367
- heading_to_searchPageNum = heading_to_searchDict['page']
2368
-
2369
-
2370
- # Initialize variables
2371
- headertoContinue1 = False
2372
- headertoContinue2 = False
2373
- matched_header_line = None
2374
- done = False
2375
- collecting = False
2376
- collected_lines = []
2377
- page_highlights = {}
2378
- current_bbox = {}
2379
- last_y1s = {}
2380
- mainHeader = ''
2381
- subHeader = ''
2382
- matched_header_line_norm = heading_to_search
2383
- break_collecting = False
2384
- heading_norm = normalize_text(heading_to_search)
2385
- paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2386
-
2387
- for page_num in range(heading_to_searchPageNum,len(doc)):
2388
- if page_num in toc_pages:
2389
- continue
2390
- if break_collecting:
2391
- break
2392
- page=doc[page_num]
2393
- page_height = page.rect.height
2394
- blocks = page.get_text("dict")["blocks"]
2395
-
2396
- for block in blocks:
2397
- if break_collecting:
2398
- break
2399
-
2400
- lines = block.get("lines", [])
2401
- i = 0
2402
- while i < len(lines):
2403
- if break_collecting:
2404
- break
2405
-
2406
- spans = lines[i].get("spans", [])
2407
- if not spans:
2408
- i += 1
2409
- continue
2410
-
2411
- y0 = spans[0]["bbox"][1]
2412
- y1 = spans[0]["bbox"][3]
2413
- if y0 < top_margin or y1 > (page_height - bottom_margin):
2414
- i += 1
2415
- continue
2416
-
2417
- line_text = get_spaced_text_from_spans(spans).lower()
2418
- line_text_norm = normalize_text(line_text)
2419
-
2420
- # Combine with next line if available
2421
- if i + 1 < len(lines):
2422
- next_spans = lines[i + 1].get("spans", [])
2423
- next_line_text = get_spaced_text_from_spans(next_spans).lower()
2424
- combined_line_norm = normalize_text(line_text + " " + next_line_text)
2425
- else:
2426
- combined_line_norm = line_text_norm
2427
-
2428
- # Check if we should continue processing
2429
- if combined_line_norm and combined_line_norm in paths[0]:
2430
- headertoContinue1 = combined_line_norm
2431
- if combined_line_norm and combined_line_norm in paths[-2]:
2432
- headertoContinue2 = combined_line_norm
2433
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2434
- stringtowrite='Not to be billed'
2435
- else:
2436
- stringtowrite='To be billed'
2437
- # Optimized header matching
2438
- existsfull = (
2439
- ( combined_line_norm in allchildrenheaders_set or
2440
- combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2441
- )
2442
-
2443
- # New word-based matching
2444
- current_line_words = set(combined_line_norm.split())
2445
- heading_words = set(heading_norm.split())
2446
- all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2447
-
2448
- substring_match = (
2449
- heading_norm in combined_line_norm or
2450
- combined_line_norm in heading_norm or
2451
- all_words_match # Include the new word-based matching
2452
- )
2453
- # substring_match = (
2454
- # heading_norm in combined_line_norm or
2455
- # combined_line_norm in heading_norm
2456
- # )
2457
-
2458
- if (substring_match and existsfull and not collecting and
2459
- len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2460
-
2461
- # Check header conditions more efficiently
2462
- header_spans = [
2463
- span for span in spans
2464
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2465
- # and span['size'] >= subsubheaderFontSize
2466
- and span['size'] < mainHeaderFontSize)
2467
- ]
2468
- if header_spans and stringtowrite.startswith('To'):
2469
- Alltext_tobebilled+=combined_line_norm
2470
- collecting = True
2471
- matched_header_font_size = max(span["size"] for span in header_spans)
2472
-
2473
- collected_lines.append(line_text)
2474
- valid_spans = [span for span in spans if span.get("bbox")]
2475
-
2476
- if valid_spans:
2477
- x0s = [span["bbox"][0] for span in valid_spans]
2478
- x1s = [span["bbox"][2] for span in valid_spans]
2479
- y0s = [span["bbox"][1] for span in valid_spans]
2480
- y1s = [span["bbox"][3] for span in valid_spans]
2481
-
2482
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2483
-
2484
- if page_num in current_bbox:
2485
- cb = current_bbox[page_num]
2486
- current_bbox[page_num] = [
2487
- min(cb[0], header_bbox[0]),
2488
- min(cb[1], header_bbox[1]),
2489
- max(cb[2], header_bbox[2]),
2490
- max(cb[3], header_bbox[3])
2491
- ]
2492
- else:
2493
- current_bbox[page_num] = header_bbox
2494
- last_y1s[page_num] = header_bbox[3]
2495
- x0, y0, x1, y1 = header_bbox
2496
-
2497
- zoom = 200
2498
- left = int(x0)
2499
- top = int(y0)
2500
- zoom_str = f"{zoom},{left},{top}"
2501
- pageNumberFound = page_num + 1
2502
-
2503
- # Build the query parameters
2504
- params = {
2505
- 'pdfLink': pdf_path, # Your PDF link
2506
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
2507
- }
2508
-
2509
- # URL encode each parameter
2510
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2511
-
2512
- # Construct the final encoded link
2513
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2514
-
2515
- # Correctly construct the final URL with page and zoom
2516
- final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2517
-
2518
- # Get current date and time
2519
- now = datetime.now()
2520
-
2521
- # Format the output
2522
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2523
- # Optionally, add the URL to a DataFrame
2524
-
2525
-
2526
- data_entry = {
2527
- "NBSLink": final_url,
2528
- "Subject": heading_to_search,
2529
- "Page": str(pageNumberFound),
2530
- "Author": "ADR",
2531
- "Creation Date": formatted_time,
2532
- "Layer": "Initial",
2533
- "Code": stringtowrite,
2534
- "head above 1": paths[-2],
2535
- "head above 2": paths[0],
2536
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2537
- }
2538
- data_list_JSON.append(data_entry)
2539
-
2540
- # Convert list to JSON
2541
- json_output = json.dumps(data_list_JSON, indent=4)
2542
-
2543
- i += 2
2544
- continue
2545
- else:
2546
- if (substring_match and not collecting and
2547
- len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2548
-
2549
- # Calculate word match percentage
2550
- word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2551
-
2552
- # Check if at least 70% of header words exist in this line
2553
- meets_word_threshold = word_match_percent >= 100
2554
-
2555
- # Check header conditions (including word threshold)
2556
- header_spans = [
2557
- span for span in spans
2558
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2559
- # and span['size'] >= subsubheaderFontSize
2560
- and span['size'] < mainHeaderFontSize)
2561
- ]
2562
-
2563
- if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2564
- Alltext_tobebilled+=combined_line_norm
2565
- collecting = True
2566
- matched_header_font_size = max(span["size"] for span in header_spans)
2567
-
2568
- collected_lines.append(line_text)
2569
- valid_spans = [span for span in spans if span.get("bbox")]
2570
-
2571
- if valid_spans:
2572
- x0s = [span["bbox"][0] for span in valid_spans]
2573
- x1s = [span["bbox"][2] for span in valid_spans]
2574
- y0s = [span["bbox"][1] for span in valid_spans]
2575
- y1s = [span["bbox"][3] for span in valid_spans]
2576
-
2577
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2578
-
2579
- if page_num in current_bbox:
2580
- cb = current_bbox[page_num]
2581
- current_bbox[page_num] = [
2582
- min(cb[0], header_bbox[0]),
2583
- min(cb[1], header_bbox[1]),
2584
- max(cb[2], header_bbox[2]),
2585
- max(cb[3], header_bbox[3])
2586
- ]
2587
- else:
2588
- current_bbox[page_num] = header_bbox
2589
-
2590
- last_y1s[page_num] = header_bbox[3]
2591
- x0, y0, x1, y1 = header_bbox
2592
- zoom = 200
2593
- left = int(x0)
2594
- top = int(y0)
2595
- zoom_str = f"{zoom},{left},{top}"
2596
- pageNumberFound = page_num + 1
2597
-
2598
- # Build the query parameters
2599
- params = {
2600
- 'pdfLink': pdf_path, # Your PDF link
2601
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
2602
- }
2603
-
2604
- # URL encode each parameter
2605
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2606
-
2607
- # Construct the final encoded link
2608
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2609
-
2610
- # Correctly construct the final URL with page and zoom
2611
- final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2612
-
2613
- # Get current date and time
2614
- now = datetime.now()
2615
-
2616
- # Format the output
2617
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2618
- # Optionally, add the URL to a DataFrame
2619
-
2620
-
2621
- data_entry = {
2622
- "NBSLink": final_url,
2623
- "Subject": heading_to_search,
2624
- "Page": str(pageNumberFound),
2625
- "Author": "ADR",
2626
- "Creation Date": formatted_time,
2627
- "Layer": "Initial",
2628
- "Code": stringtowrite,
2629
- "head above 1": paths[-2],
2630
- "head above 2": paths[0],
2631
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2632
- }
2633
- data_list_JSON.append(data_entry)
2634
-
2635
- # Convert list to JSON
2636
- json_output = json.dumps(data_list_JSON, indent=4)
2637
-
2638
- i += 2
2639
- continue
2640
- if collecting:
2641
- norm_line = normalize_text(line_text)
2642
-
2643
- # Optimized URL check
2644
- if url_pattern.match(norm_line):
2645
- line_is_header = False
2646
- else:
2647
- line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2648
-
2649
- if line_is_header:
2650
- header_font_size = max(span["size"] for span in spans)
2651
- is_probably_real_header = (
2652
- header_font_size >= matched_header_font_size and
2653
- is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2654
- len(line_text.strip()) > 2
2655
- )
2656
-
2657
- if (norm_line != matched_header_line_norm and
2658
- norm_line != heading_norm and
2659
- is_probably_real_header):
2660
- if line_text not in heading_norm:
2661
- collecting = False
2662
- done = True
2663
- headertoContinue1 = False
2664
- headertoContinue2=False
2665
- for page_num, bbox in current_bbox.items():
2666
- bbox[3] = last_y1s.get(page_num, bbox[3])
2667
- page_highlights[page_num] = bbox
2668
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
2669
-
2670
- break_collecting = True
2671
- break
2672
-
2673
- if break_collecting:
2674
- break
2675
-
2676
- collected_lines.append(line_text)
2677
- valid_spans = [span for span in spans if span.get("bbox")]
2678
- if valid_spans:
2679
- x0s = [span["bbox"][0] for span in valid_spans]
2680
- x1s = [span["bbox"][2] for span in valid_spans]
2681
- y0s = [span["bbox"][1] for span in valid_spans]
2682
- y1s = [span["bbox"][3] for span in valid_spans]
2683
-
2684
- line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2685
-
2686
- if page_num in current_bbox:
2687
- cb = current_bbox[page_num]
2688
- current_bbox[page_num] = [
2689
- min(cb[0], line_bbox[0]),
2690
- min(cb[1], line_bbox[1]),
2691
- max(cb[2], line_bbox[2]),
2692
- max(cb[3], line_bbox[3])
2693
- ]
2694
- else:
2695
- current_bbox[page_num] = line_bbox
2696
-
2697
- last_y1s[page_num] = line_bbox[3]
2698
- i += 1
2699
-
2700
- if not done:
2701
- for page_num, bbox in current_bbox.items():
2702
- bbox[3] = last_y1s.get(page_num, bbox[3])
2703
- page_highlights[page_num] = bbox
2704
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2705
- stringtowrite='Not to be billed'
2706
- else:
2707
- stringtowrite='To be billed'
2708
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
2709
-
2710
- # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2711
-
2712
- pdf_bytes = BytesIO()
2713
- docHighlights.save(pdf_bytes)
2714
-
2715
- return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
2716
 
2717
 
2718
 
 
1844
  docHighlights.save(pdf_bytes)
1845
  return pdf_bytes.getvalue(), docHighlights , newjsonList
1846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1847
 
1848
 
1849