Marthee committed on
Commit
acaca02
·
verified ·
1 Parent(s): 14a2e4b

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +453 -501
InitialMarkups.py CHANGED
@@ -89,7 +89,6 @@ def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5)
89
  return (pageNum, span_y)
90
 
91
  def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
92
- print("Font baseline:", most_common_font_size, most_common_color, most_common_font)
93
 
94
  grouped_headers = defaultdict(list)
95
  spans = []
@@ -318,7 +317,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
318
  # Step 2: Identify level 0 headers (largest and in TOC)
319
  # max_size = max(h['size'] for h in headers) if headers else 0
320
  max_size,subheaderSize,nbsheadersize=top_3_font_sizes
321
- print(max_size)
322
  toc_text_match=[]
323
  # Improved TOC matching with exact and substring matching
324
  toc_matches = []
@@ -347,7 +346,6 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
347
  toc_matches.append(h)
348
  toc_text_match.append(h['text'])
349
  elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
350
- print(h['text'],matching_toc_texts)
351
  headers.remove(h)
352
  continue
353
 
@@ -365,8 +363,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
365
  # Update the header text with cleaned version
366
  h['text'] = cleaned_text
367
  unique_level0.append(h)
368
- print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")
369
-
370
  # Step 3: Process headers under each level 0 to identify level 1 format
371
 
372
  # First, group headers by their level 0 parent
@@ -667,7 +664,7 @@ def extract_section_under_header(pdf_path):
667
 
668
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
669
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
670
- print('listofHeaderstoMarkup',listofHeaderstoMarkup)
671
  # Precompute all children headers once
672
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
673
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -683,7 +680,7 @@ def extract_section_under_header(pdf_path):
683
  subHeaderFontSize= top_3_font_sizes[1]
684
  subsubheaderFontSize= top_3_font_sizes[1]
685
 
686
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
687
 
688
  # Preload all pages to avoid repeated loading
689
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -692,8 +689,6 @@ def extract_section_under_header(pdf_path):
692
  heading_to_search = heading_to_searchDict['text']
693
  heading_to_searchPageNum = heading_to_searchDict['page']
694
 
695
- print('headertosearch', heading_to_search)
696
-
697
  # Initialize variables
698
  headertoContinue1 = False
699
  headertoContinue2 = False
@@ -754,10 +749,10 @@ def extract_section_under_header(pdf_path):
754
 
755
  # Check if we should continue processing
756
  if combined_line_norm and combined_line_norm in paths[0]:
757
- print(combined_line_norm)
758
  headertoContinue1 = combined_line_norm
759
  if combined_line_norm and combined_line_norm in paths[-2]:
760
- print(combined_line_norm)
761
  headertoContinue2 = combined_line_norm
762
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
763
  stringtowrite='Not to be billed'
@@ -797,7 +792,6 @@ def extract_section_under_header(pdf_path):
797
  if header_spans:
798
  collecting = True
799
  matched_header_font_size = max(span["size"] for span in header_spans)
800
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
801
 
802
  collected_lines.append(line_text)
803
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -869,7 +863,6 @@ def extract_section_under_header(pdf_path):
869
  # Convert list to JSON
870
  json_output = json.dumps(data_list_JSON, indent=4)
871
 
872
- print("Final URL:", final_url)
873
  i += 2
874
  continue
875
  else:
@@ -893,9 +886,7 @@ def extract_section_under_header(pdf_path):
893
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
894
  collecting = True
895
  matched_header_font_size = max(span["size"] for span in header_spans)
896
- print(f"📥 Start collecting after header: {combined_line_norm} "
897
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
898
-
899
  collected_lines.append(line_text)
900
  valid_spans = [span for span in spans if span.get("bbox")]
901
 
@@ -966,7 +957,7 @@ def extract_section_under_header(pdf_path):
966
  # Convert list to JSON
967
  json_output = json.dumps(data_list_JSON, indent=4)
968
 
969
- print("Final URL:", final_url)
970
  i += 2
971
  continue
972
  if collecting:
@@ -990,7 +981,6 @@ def extract_section_under_header(pdf_path):
990
  norm_line != heading_norm and
991
  is_probably_real_header):
992
  if line_text not in heading_norm:
993
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
994
  collecting = False
995
  done = True
996
  headertoContinue1 = False
@@ -1044,7 +1034,6 @@ def extract_section_under_header(pdf_path):
1044
 
1045
  pdf_bytes = BytesIO()
1046
  docHighlights.save(pdf_bytes)
1047
- print('JSONN',json_output)
1048
  return pdf_bytes.getvalue(), docHighlights , json_output
1049
 
1050
 
@@ -1109,7 +1098,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1109
 
1110
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1111
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1112
- # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1113
  # Precompute all children headers once
1114
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1115
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -1125,8 +1113,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1125
  subHeaderFontSize= top_3_font_sizes[1]
1126
  subsubheaderFontSize= top_3_font_sizes[1]
1127
 
1128
- # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1129
-
1130
  # Preload all pages to avoid repeated loading
1131
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1132
 
@@ -1134,8 +1120,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1134
  heading_to_search = heading_to_searchDict['text']
1135
  heading_to_searchPageNum = heading_to_searchDict['page']
1136
 
1137
- # print('headertosearch', heading_to_search)
1138
-
1139
  # Initialize variables
1140
  headertoContinue1 = False
1141
  headertoContinue2 = False
@@ -1196,10 +1181,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1196
 
1197
  # Check if we should continue processing
1198
  if combined_line_norm and combined_line_norm in paths[0]:
1199
- print(combined_line_norm)
1200
  headertoContinue1 = combined_line_norm
1201
  if combined_line_norm and combined_line_norm in paths[-2]:
1202
- print(combined_line_norm)
1203
  headertoContinue2 = combined_line_norm
1204
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1205
  stringtowrite='Not to be billed'
@@ -1240,8 +1223,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1240
  Alltext_Tobebilled+=combined_line_norm
1241
  collecting = True
1242
  matched_header_font_size = max(span["size"] for span in header_spans)
1243
- # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
1244
-
1245
  collected_lines.append(line_text)
1246
  valid_spans = [span for span in spans if span.get("bbox")]
1247
 
@@ -1312,7 +1294,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1312
  # Convert list to JSON
1313
  json_output = json.dumps(data_list_JSON, indent=4)
1314
 
1315
- # print("Final URL:", final_url)
1316
  i += 2
1317
  continue
1318
  else:
@@ -1337,9 +1318,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1337
  Alltext_Tobebilled+=combined_line_norm
1338
  collecting = True
1339
  matched_header_font_size = max(span["size"] for span in header_spans)
1340
- # print(f"📥 Start collecting after header: {combined_line_norm} "
1341
- # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
1342
-
1343
  collected_lines.append(line_text)
1344
  valid_spans = [span for span in spans if span.get("bbox")]
1345
 
@@ -1410,7 +1389,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1410
  # Convert list to JSON
1411
  json_output = json.dumps(data_list_JSON, indent=4)
1412
 
1413
- # print("Final URL:", final_url)
1414
  i += 2
1415
  continue
1416
  if collecting:
@@ -1434,7 +1412,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1434
  norm_line != heading_norm and
1435
  is_probably_real_header):
1436
  if line_text not in heading_norm:
1437
- # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
1438
  collecting = False
1439
  done = True
1440
  headertoContinue1 = False
@@ -1488,7 +1465,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1488
 
1489
  pdf_bytes = BytesIO()
1490
  docHighlights.save(pdf_bytes)
1491
- # print('JSONN',json_output)
1492
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
1493
 
1494
 
@@ -1548,10 +1524,8 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1548
  else:
1549
  for item in headingjson:
1550
  listofheadingsfromrawan.append(normalize_text(item['Subject']))
1551
- print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan)
1552
  # Precompute all children headers once
1553
  allchildrenheaders = listofheadingsfromrawan
1554
- print('hereeeeeeeeeeeeeee00',allchildrenheaders)
1555
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1556
 
1557
  df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
@@ -1564,7 +1538,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1564
  subHeaderFontSize= top_3_font_sizes[1]
1565
  subsubheaderFontSize= top_3_font_sizes[1]
1566
 
1567
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1568
 
1569
  # Preload all pages to avoid repeated loading
1570
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -1578,7 +1551,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1578
  heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1
1579
  incomingheader = heading_to_searchDict['head above 1']
1580
 
1581
- print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum)
1582
  done = False
1583
  collecting = False
1584
  collected_lines = []
@@ -1592,7 +1564,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1592
  heading_norm = normalize_text(heading_to_search)
1593
 
1594
  for page_num in range(heading_to_searchPageNum,len(doc)):
1595
- print('hereeeeeeeeeeeeeee1')
1596
  if page_num in toc_pages:
1597
  continue
1598
  if break_collecting:
@@ -1662,7 +1633,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1662
  if header_spans:
1663
  collecting = True
1664
  matched_header_font_size = max(span["size"] for span in header_spans)
1665
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
1666
 
1667
  collected_lines.append(line_text)
1668
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -1719,7 +1689,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1719
  if type(heading_to_searchDict) != str:
1720
  heading_to_searchDict['NBSLink']=new_url
1721
  newjsonList.append(heading_to_searchDict)
1722
- print("Final URL:", final_url)
1723
  i += 2
1724
  continue
1725
  else:
@@ -1743,9 +1712,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1743
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
1744
  collecting = True
1745
  matched_header_font_size = max(span["size"] for span in header_spans)
1746
- print(f"📥 Start collecting after header: {combined_line_norm} "
1747
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
1748
-
1749
  collected_lines.append(line_text)
1750
  valid_spans = [span for span in spans if span.get("bbox")]
1751
 
@@ -1794,7 +1761,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1794
  if type(heading_to_searchDict) != str:
1795
  heading_to_searchDict['NBSLink']=new_url
1796
  newjsonList.append(heading_to_searchDict)
1797
- print("Final URL:", final_url)
1798
  i += 2
1799
  continue
1800
  if collecting:
@@ -1818,7 +1784,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1818
  norm_line != heading_norm and
1819
  is_probably_real_header):
1820
  if line_text not in heading_norm:
1821
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
1822
  collecting = False
1823
  done = True
1824
  headertoContinue1 = False
@@ -1882,438 +1847,438 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
1882
 
1883
 
1884
 
1885
- top_margin = 70
1886
- bottom_margin = 50
1887
- headertoContinue1 = False
1888
- headertoContinue2=False
1889
 
1890
- parsed_url = urlparse(pdf_path)
1891
- filename = os.path.basename(parsed_url.path)
1892
- filename = unquote(filename) # decode URL-encoded characters
1893
-
1894
- # Optimized URL handling
1895
- if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
1896
- pdf_path = pdf_path.replace('dl=0', 'dl=1')
1897
-
1898
- # Cache frequently used values
1899
- response = requests.get(pdf_path)
1900
- pdf_content = BytesIO(response.content)
1901
- if not pdf_content:
1902
- raise ValueError("No valid PDF content found.")
1903
-
1904
- doc = fitz.open(stream=pdf_content, filetype="pdf")
1905
- docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
1906
- most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1907
-
1908
- # Precompute regex patterns
1909
- dot_pattern = re.compile(r'\.{3,}')
1910
- url_pattern = re.compile(r'https?://\S+|www\.\S+')
1911
-
1912
- def get_toc_page_numbers(doc, max_pages_to_check=15):
1913
- toc_pages = []
1914
- for page_num in range(min(len(doc), max_pages_to_check)):
1915
- page = doc.load_page(page_num)
1916
- blocks = page.get_text("dict")["blocks"]
1917
-
1918
- dot_line_count = 0
1919
- for block in blocks:
1920
- for line in block.get("lines", []):
1921
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
1922
- if dot_pattern.search(line_text):
1923
- dot_line_count += 1
1924
-
1925
- if dot_line_count >= 3:
1926
- toc_pages.append(page_num)
1927
-
1928
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1929
-
1930
- toc_pages = get_toc_page_numbers(doc)
1931
-
1932
- headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
1933
- doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1934
- )
1935
-
1936
- hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1937
- listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1938
- print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1939
- # Precompute all children headers once
1940
- allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1941
- allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1942
-
1943
- df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1944
- dictionaryNBS={}
1945
- data_list_JSON = []
1946
-
1947
- if len(top_3_font_sizes)==3:
1948
- mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
1949
- elif len(top_3_font_sizes)==2:
1950
- mainHeaderFontSize= top_3_font_sizes[0]
1951
- subHeaderFontSize= top_3_font_sizes[1]
1952
- subsubheaderFontSize= top_3_font_sizes[1]
1953
-
1954
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1955
-
1956
- # Preload all pages to avoid repeated loading
1957
- # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1958
-
1959
- for heading_to_searchDict, paths in listofHeaderstoMarkup:
1960
- heading_to_search = heading_to_searchDict['text']
1961
- heading_to_searchPageNum = heading_to_searchDict['page']
1962
-
1963
- print('headertosearch', heading_to_search)
1964
-
1965
- # Initialize variables
1966
- headertoContinue1 = False
1967
- headertoContinue2 = False
1968
- matched_header_line = None
1969
- done = False
1970
- collecting = False
1971
- collected_lines = []
1972
- page_highlights = {}
1973
- current_bbox = {}
1974
- last_y1s = {}
1975
- mainHeader = ''
1976
- subHeader = ''
1977
- matched_header_line_norm = heading_to_search
1978
- break_collecting = False
1979
- heading_norm = normalize_text(heading_to_search)
1980
- paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1981
-
1982
- for page_num in range(heading_to_searchPageNum,len(doc)):
1983
- if page_num in toc_pages:
1984
- continue
1985
- if break_collecting:
1986
- break
1987
- page=doc[page_num]
1988
- page_height = page.rect.height
1989
- blocks = page.get_text("dict")["blocks"]
1990
-
1991
- for block in blocks:
1992
- if break_collecting:
1993
- break
1994
-
1995
- lines = block.get("lines", [])
1996
- i = 0
1997
- while i < len(lines):
1998
- if break_collecting:
1999
- break
2000
-
2001
- spans = lines[i].get("spans", [])
2002
- if not spans:
2003
- i += 1
2004
- continue
2005
-
2006
- y0 = spans[0]["bbox"][1]
2007
- y1 = spans[0]["bbox"][3]
2008
- if y0 < top_margin or y1 > (page_height - bottom_margin):
2009
- i += 1
2010
- continue
2011
-
2012
- line_text = get_spaced_text_from_spans(spans).lower()
2013
- line_text_norm = normalize_text(line_text)
2014
-
2015
- # Combine with next line if available
2016
- if i + 1 < len(lines):
2017
- next_spans = lines[i + 1].get("spans", [])
2018
- next_line_text = get_spaced_text_from_spans(next_spans).lower()
2019
- combined_line_norm = normalize_text(line_text + " " + next_line_text)
2020
- else:
2021
- combined_line_norm = line_text_norm
2022
-
2023
- # Check if we should continue processing
2024
- if combined_line_norm and combined_line_norm in paths[0]:
2025
- print(combined_line_norm)
2026
- headertoContinue1 = combined_line_norm
2027
- if combined_line_norm and combined_line_norm in paths[-2]:
2028
- print(combined_line_norm)
2029
- headertoContinue2 = combined_line_norm
2030
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2031
- stringtowrite='Not to be billed'
2032
- else:
2033
- stringtowrite='To be billed'
2034
- # Optimized header matching
2035
- existsfull = (
2036
- ( combined_line_norm in allchildrenheaders_set or
2037
- combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2038
- )
2039
-
2040
- # New word-based matching
2041
- current_line_words = set(combined_line_norm.split())
2042
- heading_words = set(heading_norm.split())
2043
- all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2044
-
2045
- substring_match = (
2046
- heading_norm in combined_line_norm or
2047
- combined_line_norm in heading_norm or
2048
- all_words_match # Include the new word-based matching
2049
- )
2050
- # substring_match = (
2051
- # heading_norm in combined_line_norm or
2052
- # combined_line_norm in heading_norm
2053
- # )
2054
-
2055
- if (substring_match and existsfull and not collecting and
2056
- len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2057
-
2058
- # Check header conditions more efficiently
2059
- header_spans = [
2060
- span for span in spans
2061
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2062
- # and span['size'] >= subsubheaderFontSize
2063
- and span['size'] < mainHeaderFontSize)
2064
- ]
2065
- if header_spans:
2066
- collecting = True
2067
- matched_header_font_size = max(span["size"] for span in header_spans)
2068
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2069
-
2070
- collected_lines.append(line_text)
2071
- valid_spans = [span for span in spans if span.get("bbox")]
2072
-
2073
- if valid_spans:
2074
- x0s = [span["bbox"][0] for span in valid_spans]
2075
- x1s = [span["bbox"][2] for span in valid_spans]
2076
- y0s = [span["bbox"][1] for span in valid_spans]
2077
- y1s = [span["bbox"][3] for span in valid_spans]
2078
-
2079
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2080
-
2081
- if page_num in current_bbox:
2082
- cb = current_bbox[page_num]
2083
- current_bbox[page_num] = [
2084
- min(cb[0], header_bbox[0]),
2085
- min(cb[1], header_bbox[1]),
2086
- max(cb[2], header_bbox[2]),
2087
- max(cb[3], header_bbox[3])
2088
- ]
2089
- else:
2090
- current_bbox[page_num] = header_bbox
2091
- last_y1s[page_num] = header_bbox[3]
2092
- x0, y0, x1, y1 = header_bbox
2093
-
2094
- zoom = 200
2095
- left = int(x0)
2096
- top = int(y0)
2097
- zoom_str = f"{zoom},{left},{top}"
2098
- pageNumberFound = page_num + 1
2099
-
2100
- # Build the query parameters
2101
- params = {
2102
- 'pdfLink': pdf_path, # Your PDF link
2103
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
2104
- }
2105
-
2106
- # URL encode each parameter
2107
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2108
-
2109
- # Construct the final encoded link
2110
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2111
-
2112
- # Correctly construct the final URL with page and zoom
2113
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2114
-
2115
- # Get current date and time
2116
- now = datetime.now()
2117
-
2118
- # Format the output
2119
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2120
- # Optionally, add the URL to a DataFrame
2121
-
2122
-
2123
- data_entry = {
2124
- "NBSLink": final_url,
2125
- "Subject": heading_to_search,
2126
- "Page": str(pageNumberFound),
2127
- "Author": "ADR",
2128
- "Creation Date": formatted_time,
2129
- "Layer": "Initial",
2130
- "Code": stringtowrite,
2131
- "head above 1": paths[-2],
2132
- "head above 2": paths[0],
2133
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2134
- }
2135
- data_list_JSON.append(data_entry)
2136
-
2137
- # Convert list to JSON
2138
- json_output = json.dumps(data_list_JSON, indent=4)
2139
-
2140
- print("Final URL:", final_url)
2141
- i += 2
2142
- continue
2143
- else:
2144
- if (substring_match and not collecting and
2145
- len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2146
-
2147
- # Calculate word match percentage
2148
- word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2149
-
2150
- # Check if at least 70% of header words exist in this line
2151
- meets_word_threshold = word_match_percent >= 100
2152
-
2153
- # Check header conditions (including word threshold)
2154
- header_spans = [
2155
- span for span in spans
2156
- if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2157
- # and span['size'] >= subsubheaderFontSize
2158
- and span['size'] < mainHeaderFontSize)
2159
- ]
2160
-
2161
- if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
2162
- collecting = True
2163
- matched_header_font_size = max(span["size"] for span in header_spans)
2164
- print(f"📥 Start collecting after header: {combined_line_norm} "
2165
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2166
-
2167
- collected_lines.append(line_text)
2168
- valid_spans = [span for span in spans if span.get("bbox")]
2169
-
2170
- if valid_spans:
2171
- x0s = [span["bbox"][0] for span in valid_spans]
2172
- x1s = [span["bbox"][2] for span in valid_spans]
2173
- y0s = [span["bbox"][1] for span in valid_spans]
2174
- y1s = [span["bbox"][3] for span in valid_spans]
2175
-
2176
- header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2177
-
2178
- if page_num in current_bbox:
2179
- cb = current_bbox[page_num]
2180
- current_bbox[page_num] = [
2181
- min(cb[0], header_bbox[0]),
2182
- min(cb[1], header_bbox[1]),
2183
- max(cb[2], header_bbox[2]),
2184
- max(cb[3], header_bbox[3])
2185
- ]
2186
- else:
2187
- current_bbox[page_num] = header_bbox
2188
-
2189
- last_y1s[page_num] = header_bbox[3]
2190
- x0, y0, x1, y1 = header_bbox
2191
- zoom = 200
2192
- left = int(x0)
2193
- top = int(y0)
2194
- zoom_str = f"{zoom},{left},{top}"
2195
- pageNumberFound = page_num + 1
2196
-
2197
- # Build the query parameters
2198
- params = {
2199
- 'pdfLink': pdf_path, # Your PDF link
2200
- 'keyword': heading_to_search, # Your keyword (could be a string or list)
2201
- }
2202
-
2203
- # URL encode each parameter
2204
- encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2205
-
2206
- # Construct the final encoded link
2207
- encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2208
-
2209
- # Correctly construct the final URL with page and zoom
2210
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2211
-
2212
- # Get current date and time
2213
- now = datetime.now()
2214
-
2215
- # Format the output
2216
- formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2217
- # Optionally, add the URL to a DataFrame
2218
-
2219
-
2220
- data_entry = {
2221
- "NBSLink": final_url,
2222
- "Subject": heading_to_search,
2223
- "Page": str(pageNumberFound),
2224
- "Author": "ADR",
2225
- "Creation Date": formatted_time,
2226
- "Layer": "Initial",
2227
- "Code": stringtowrite,
2228
- "head above 1": paths[-2],
2229
- "head above 2": paths[0],
2230
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2231
- }
2232
- data_list_JSON.append(data_entry)
2233
-
2234
- # Convert list to JSON
2235
- json_output = json.dumps(data_list_JSON, indent=4)
2236
-
2237
- print("Final URL:", final_url)
2238
- i += 2
2239
- continue
2240
- if collecting:
2241
- norm_line = normalize_text(line_text)
2242
-
2243
- # Optimized URL check
2244
- if url_pattern.match(norm_line):
2245
- line_is_header = False
2246
- else:
2247
- line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2248
-
2249
- if line_is_header:
2250
- header_font_size = max(span["size"] for span in spans)
2251
- is_probably_real_header = (
2252
- header_font_size >= matched_header_font_size and
2253
- is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2254
- len(line_text.strip()) > 2
2255
- )
2256
-
2257
- if (norm_line != matched_header_line_norm and
2258
- norm_line != heading_norm and
2259
- is_probably_real_header):
2260
- if line_text not in heading_norm:
2261
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2262
- collecting = False
2263
- done = True
2264
- headertoContinue1 = False
2265
- headertoContinue2=False
2266
- for page_num, bbox in current_bbox.items():
2267
- bbox[3] = last_y1s.get(page_num, bbox[3])
2268
- page_highlights[page_num] = bbox
2269
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
2270
-
2271
- break_collecting = True
2272
- break
2273
-
2274
- if break_collecting:
2275
- break
2276
-
2277
- collected_lines.append(line_text)
2278
- valid_spans = [span for span in spans if span.get("bbox")]
2279
- if valid_spans:
2280
- x0s = [span["bbox"][0] for span in valid_spans]
2281
- x1s = [span["bbox"][2] for span in valid_spans]
2282
- y0s = [span["bbox"][1] for span in valid_spans]
2283
- y1s = [span["bbox"][3] for span in valid_spans]
2284
-
2285
- line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2286
-
2287
- if page_num in current_bbox:
2288
- cb = current_bbox[page_num]
2289
- current_bbox[page_num] = [
2290
- min(cb[0], line_bbox[0]),
2291
- min(cb[1], line_bbox[1]),
2292
- max(cb[2], line_bbox[2]),
2293
- max(cb[3], line_bbox[3])
2294
- ]
2295
- else:
2296
- current_bbox[page_num] = line_bbox
2297
 
2298
- last_y1s[page_num] = line_bbox[3]
2299
- i += 1
 
2300
 
2301
- if not done:
2302
- for page_num, bbox in current_bbox.items():
2303
- bbox[3] = last_y1s.get(page_num, bbox[3])
2304
- page_highlights[page_num] = bbox
2305
- if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2306
- stringtowrite='Not to be billed'
2307
- else:
2308
- stringtowrite='To be billed'
2309
- highlight_boxes(docHighlights, page_highlights,stringtowrite)
2310
 
2311
- # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2312
 
2313
- pdf_bytes = BytesIO()
2314
- docHighlights.save(pdf_bytes)
2315
- print('JSONN',json_output)
2316
- return pdf_bytes.getvalue(), docHighlights , json_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2317
 
2318
 
2319
 
@@ -2393,8 +2358,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2393
  subHeaderFontSize= top_3_font_sizes[1]
2394
  subsubheaderFontSize= top_3_font_sizes[1]
2395
 
2396
- # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
2397
-
2398
  # Preload all pages to avoid repeated loading
2399
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2400
 
@@ -2402,8 +2366,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2402
  heading_to_search = heading_to_searchDict['text']
2403
  heading_to_searchPageNum = heading_to_searchDict['page']
2404
 
2405
- # print('headertosearch', heading_to_search)
2406
-
2407
  # Initialize variables
2408
  headertoContinue1 = False
2409
  headertoContinue2 = False
@@ -2464,10 +2427,8 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2464
 
2465
  # Check if we should continue processing
2466
  if combined_line_norm and combined_line_norm in paths[0]:
2467
- print(combined_line_norm)
2468
  headertoContinue1 = combined_line_norm
2469
  if combined_line_norm and combined_line_norm in paths[-2]:
2470
- print(combined_line_norm)
2471
  headertoContinue2 = combined_line_norm
2472
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2473
  stringtowrite='Not to be billed'
@@ -2508,8 +2469,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2508
  Alltext_tobebilled+=combined_line_norm
2509
  collecting = True
2510
  matched_header_font_size = max(span["size"] for span in header_spans)
2511
- # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2512
-
2513
  collected_lines.append(line_text)
2514
  valid_spans = [span for span in spans if span.get("bbox")]
2515
 
@@ -2580,7 +2540,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2580
  # Convert list to JSON
2581
  json_output = json.dumps(data_list_JSON, indent=4)
2582
 
2583
- # print("Final URL:", final_url)
2584
  i += 2
2585
  continue
2586
  else:
@@ -2605,9 +2564,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2605
  Alltext_tobebilled+=combined_line_norm
2606
  collecting = True
2607
  matched_header_font_size = max(span["size"] for span in header_spans)
2608
- # print(f"📥 Start collecting after header: {combined_line_norm} "
2609
- # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2610
-
2611
  collected_lines.append(line_text)
2612
  valid_spans = [span for span in spans if span.get("bbox")]
2613
 
@@ -2678,7 +2635,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2678
  # Convert list to JSON
2679
  json_output = json.dumps(data_list_JSON, indent=4)
2680
 
2681
- print("Final URL:", final_url)
2682
  i += 2
2683
  continue
2684
  if collecting:
@@ -2702,7 +2658,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2702
  norm_line != heading_norm and
2703
  is_probably_real_header):
2704
  if line_text not in heading_norm:
2705
- # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2706
  collecting = False
2707
  done = True
2708
  headertoContinue1 = False
@@ -2756,7 +2711,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
2756
 
2757
  pdf_bytes = BytesIO()
2758
  docHighlights.save(pdf_bytes)
2759
- # print('JSONN',json_output)
2760
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
2761
 
2762
 
@@ -2816,10 +2771,10 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
2816
  else:
2817
  for item in headingjson:
2818
  listofheadingsfromrawan.append(normalize_text(item['Subject']))
2819
- print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan)
2820
  # Precompute all children headers once
2821
  allchildrenheaders = listofheadingsfromrawan
2822
- print('hereeeeeeeeeeeeeee00',allchildrenheaders)
2823
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2824
 
2825
  df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
@@ -2832,7 +2787,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
2832
  subHeaderFontSize= top_3_font_sizes[1]
2833
  subsubheaderFontSize= top_3_font_sizes[1]
2834
 
2835
- print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
2836
 
2837
  # Preload all pages to avoid repeated loading
2838
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
@@ -2846,7 +2801,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
2846
  heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1
2847
  incomingheader = heading_to_searchDict['head above 1']
2848
 
2849
- print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum)
2850
  done = False
2851
  collecting = False
2852
  collected_lines = []
@@ -2860,7 +2815,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
2860
  heading_norm = normalize_text(heading_to_search)
2861
 
2862
  for page_num in range(heading_to_searchPageNum,len(doc)):
2863
- print('hereeeeeeeeeeeeeee1')
2864
  if page_num in toc_pages:
2865
  continue
2866
  if break_collecting:
@@ -2930,7 +2885,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
2930
  if header_spans:
2931
  collecting = True
2932
  matched_header_font_size = max(span["size"] for span in header_spans)
2933
- print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2934
 
2935
  collected_lines.append(line_text)
2936
  valid_spans = [span for span in spans if span.get("bbox")]
@@ -2987,7 +2942,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
2987
  if type(heading_to_searchDict) != str:
2988
  heading_to_searchDict['NBSLink']=new_url
2989
  newjsonList.append(heading_to_searchDict)
2990
- print("Final URL:", final_url)
2991
  i += 2
2992
  continue
2993
  else:
@@ -3011,9 +2966,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
3011
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
3012
  collecting = True
3013
  matched_header_font_size = max(span["size"] for span in header_spans)
3014
- print(f"📥 Start collecting after header: {combined_line_norm} "
3015
- f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
3016
-
3017
  collected_lines.append(line_text)
3018
  valid_spans = [span for span in spans if span.get("bbox")]
3019
 
@@ -3062,7 +3015,6 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
3062
  if type(heading_to_searchDict) != str:
3063
  heading_to_searchDict['NBSLink']=new_url
3064
  newjsonList.append(heading_to_searchDict)
3065
- print("Final URL:", final_url)
3066
  i += 2
3067
  continue
3068
  if collecting:
@@ -3086,7 +3038,7 @@ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incominghea
3086
  norm_line != heading_norm and
3087
  is_probably_real_header):
3088
  if line_text not in heading_norm:
3089
- print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
3090
  collecting = False
3091
  done = True
3092
  headertoContinue1 = False
 
89
  return (pageNum, span_y)
90
 
91
  def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
 
92
 
93
  grouped_headers = defaultdict(list)
94
  spans = []
 
317
  # Step 2: Identify level 0 headers (largest and in TOC)
318
  # max_size = max(h['size'] for h in headers) if headers else 0
319
  max_size,subheaderSize,nbsheadersize=top_3_font_sizes
320
+
321
  toc_text_match=[]
322
  # Improved TOC matching with exact and substring matching
323
  toc_matches = []
 
346
  toc_matches.append(h)
347
  toc_text_match.append(h['text'])
348
  elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
 
349
  headers.remove(h)
350
  continue
351
 
 
363
  # Update the header text with cleaned version
364
  h['text'] = cleaned_text
365
  unique_level0.append(h)
366
+
 
367
  # Step 3: Process headers under each level 0 to identify level 1 format
368
 
369
  # First, group headers by their level 0 parent
 
664
 
665
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
666
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
667
+
668
  # Precompute all children headers once
669
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
670
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
680
  subHeaderFontSize= top_3_font_sizes[1]
681
  subsubheaderFontSize= top_3_font_sizes[1]
682
 
683
+
684
 
685
  # Preload all pages to avoid repeated loading
686
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
 
689
  heading_to_search = heading_to_searchDict['text']
690
  heading_to_searchPageNum = heading_to_searchDict['page']
691
 
 
 
692
  # Initialize variables
693
  headertoContinue1 = False
694
  headertoContinue2 = False
 
749
 
750
  # Check if we should continue processing
751
  if combined_line_norm and combined_line_norm in paths[0]:
752
+
753
  headertoContinue1 = combined_line_norm
754
  if combined_line_norm and combined_line_norm in paths[-2]:
755
+
756
  headertoContinue2 = combined_line_norm
757
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
758
  stringtowrite='Not to be billed'
 
792
  if header_spans:
793
  collecting = True
794
  matched_header_font_size = max(span["size"] for span in header_spans)
 
795
 
796
  collected_lines.append(line_text)
797
  valid_spans = [span for span in spans if span.get("bbox")]
 
863
  # Convert list to JSON
864
  json_output = json.dumps(data_list_JSON, indent=4)
865
 
 
866
  i += 2
867
  continue
868
  else:
 
886
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
887
  collecting = True
888
  matched_header_font_size = max(span["size"] for span in header_spans)
889
+
 
 
890
  collected_lines.append(line_text)
891
  valid_spans = [span for span in spans if span.get("bbox")]
892
 
 
957
  # Convert list to JSON
958
  json_output = json.dumps(data_list_JSON, indent=4)
959
 
960
+
961
  i += 2
962
  continue
963
  if collecting:
 
981
  norm_line != heading_norm and
982
  is_probably_real_header):
983
  if line_text not in heading_norm:
 
984
  collecting = False
985
  done = True
986
  headertoContinue1 = False
 
1034
 
1035
  pdf_bytes = BytesIO()
1036
  docHighlights.save(pdf_bytes)
 
1037
  return pdf_bytes.getvalue(), docHighlights , json_output
1038
 
1039
 
 
1098
 
1099
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1100
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
 
1101
  # Precompute all children headers once
1102
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1103
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
1113
  subHeaderFontSize= top_3_font_sizes[1]
1114
  subsubheaderFontSize= top_3_font_sizes[1]
1115
 
 
 
1116
  # Preload all pages to avoid repeated loading
1117
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1118
 
 
1120
  heading_to_search = heading_to_searchDict['text']
1121
  heading_to_searchPageNum = heading_to_searchDict['page']
1122
 
1123
+
 
1124
  # Initialize variables
1125
  headertoContinue1 = False
1126
  headertoContinue2 = False
 
1181
 
1182
  # Check if we should continue processing
1183
  if combined_line_norm and combined_line_norm in paths[0]:
 
1184
  headertoContinue1 = combined_line_norm
1185
  if combined_line_norm and combined_line_norm in paths[-2]:
 
1186
  headertoContinue2 = combined_line_norm
1187
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1188
  stringtowrite='Not to be billed'
 
1223
  Alltext_Tobebilled+=combined_line_norm
1224
  collecting = True
1225
  matched_header_font_size = max(span["size"] for span in header_spans)
1226
+
 
1227
  collected_lines.append(line_text)
1228
  valid_spans = [span for span in spans if span.get("bbox")]
1229
 
 
1294
  # Convert list to JSON
1295
  json_output = json.dumps(data_list_JSON, indent=4)
1296
 
 
1297
  i += 2
1298
  continue
1299
  else:
 
1318
  Alltext_Tobebilled+=combined_line_norm
1319
  collecting = True
1320
  matched_header_font_size = max(span["size"] for span in header_spans)
1321
+
 
 
1322
  collected_lines.append(line_text)
1323
  valid_spans = [span for span in spans if span.get("bbox")]
1324
 
 
1389
  # Convert list to JSON
1390
  json_output = json.dumps(data_list_JSON, indent=4)
1391
 
 
1392
  i += 2
1393
  continue
1394
  if collecting:
 
1412
  norm_line != heading_norm and
1413
  is_probably_real_header):
1414
  if line_text not in heading_norm:
 
1415
  collecting = False
1416
  done = True
1417
  headertoContinue1 = False
 
1465
 
1466
  pdf_bytes = BytesIO()
1467
  docHighlights.save(pdf_bytes)
 
1468
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
1469
 
1470
 
 
1524
  else:
1525
  for item in headingjson:
1526
  listofheadingsfromrawan.append(normalize_text(item['Subject']))
 
1527
  # Precompute all children headers once
1528
  allchildrenheaders = listofheadingsfromrawan
 
1529
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1530
 
1531
  df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
 
1538
  subHeaderFontSize= top_3_font_sizes[1]
1539
  subsubheaderFontSize= top_3_font_sizes[1]
1540
 
 
1541
 
1542
  # Preload all pages to avoid repeated loading
1543
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
 
1551
  heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1
1552
  incomingheader = heading_to_searchDict['head above 1']
1553
 
 
1554
  done = False
1555
  collecting = False
1556
  collected_lines = []
 
1564
  heading_norm = normalize_text(heading_to_search)
1565
 
1566
  for page_num in range(heading_to_searchPageNum,len(doc)):
 
1567
  if page_num in toc_pages:
1568
  continue
1569
  if break_collecting:
 
1633
  if header_spans:
1634
  collecting = True
1635
  matched_header_font_size = max(span["size"] for span in header_spans)
 
1636
 
1637
  collected_lines.append(line_text)
1638
  valid_spans = [span for span in spans if span.get("bbox")]
 
1689
  if type(heading_to_searchDict) != str:
1690
  heading_to_searchDict['NBSLink']=new_url
1691
  newjsonList.append(heading_to_searchDict)
 
1692
  i += 2
1693
  continue
1694
  else:
 
1712
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
1713
  collecting = True
1714
  matched_header_font_size = max(span["size"] for span in header_spans)
1715
+
 
 
1716
  collected_lines.append(line_text)
1717
  valid_spans = [span for span in spans if span.get("bbox")]
1718
 
 
1761
  if type(heading_to_searchDict) != str:
1762
  heading_to_searchDict['NBSLink']=new_url
1763
  newjsonList.append(heading_to_searchDict)
 
1764
  i += 2
1765
  continue
1766
  if collecting:
 
1784
  norm_line != heading_norm and
1785
  is_probably_real_header):
1786
  if line_text not in heading_norm:
 
1787
  collecting = False
1788
  done = True
1789
  headertoContinue1 = False
 
1847
 
1848
 
1849
 
1850
+ # top_margin = 70
1851
+ # bottom_margin = 50
1852
+ # headertoContinue1 = False
1853
+ # headertoContinue2=False
1854
 
1855
+ # parsed_url = urlparse(pdf_path)
1856
+ # filename = os.path.basename(parsed_url.path)
1857
+ # filename = unquote(filename) # decode URL-encoded characters
1858
+
1859
+ # # Optimized URL handling
1860
+ # if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
1861
+ # pdf_path = pdf_path.replace('dl=0', 'dl=1')
1862
+
1863
+ # # Cache frequently used values
1864
+ # response = requests.get(pdf_path)
1865
+ # pdf_content = BytesIO(response.content)
1866
+ # if not pdf_content:
1867
+ # raise ValueError("No valid PDF content found.")
1868
+
1869
+ # doc = fitz.open(stream=pdf_content, filetype="pdf")
1870
+ # docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
1871
+ # most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1872
+
1873
+ # # Precompute regex patterns
1874
+ # dot_pattern = re.compile(r'\.{3,}')
1875
+ # url_pattern = re.compile(r'https?://\S+|www\.\S+')
1876
+
1877
+ # def get_toc_page_numbers(doc, max_pages_to_check=15):
1878
+ # toc_pages = []
1879
+ # for page_num in range(min(len(doc), max_pages_to_check)):
1880
+ # page = doc.load_page(page_num)
1881
+ # blocks = page.get_text("dict")["blocks"]
1882
+
1883
+ # dot_line_count = 0
1884
+ # for block in blocks:
1885
+ # for line in block.get("lines", []):
1886
+ # line_text = get_spaced_text_from_spans(line["spans"]).strip()
1887
+ # if dot_pattern.search(line_text):
1888
+ # dot_line_count += 1
1889
+
1890
+ # if dot_line_count >= 3:
1891
+ # toc_pages.append(page_num)
1892
+
1893
+ # return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1894
+
1895
+ # toc_pages = get_toc_page_numbers(doc)
1896
+
1897
+ # headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
1898
+ # doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1899
+ # )
1900
+
1901
+ # hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1902
+ # listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1903
+ # print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1904
+ # # Precompute all children headers once
1905
+ # allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1906
+ # allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1907
 
1908
+ # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1909
+ # dictionaryNBS={}
1910
+ # data_list_JSON = []
1911
 
1912
+ # if len(top_3_font_sizes)==3:
1913
+ # mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
1914
+ # elif len(top_3_font_sizes)==2:
1915
+ # mainHeaderFontSize= top_3_font_sizes[0]
1916
+ # subHeaderFontSize= top_3_font_sizes[1]
1917
+ # subsubheaderFontSize= top_3_font_sizes[1]
 
 
 
1918
 
1919
+ # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1920
 
1921
+ # # Preload all pages to avoid repeated loading
1922
+ # # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1923
+
1924
+ # for heading_to_searchDict, paths in listofHeaderstoMarkup:
1925
+ # heading_to_search = heading_to_searchDict['text']
1926
+ # heading_to_searchPageNum = heading_to_searchDict['page']
1927
+
1928
+ # print('headertosearch', heading_to_search)
1929
+
1930
+ # # Initialize variables
1931
+ # headertoContinue1 = False
1932
+ # headertoContinue2 = False
1933
+ # matched_header_line = None
1934
+ # done = False
1935
+ # collecting = False
1936
+ # collected_lines = []
1937
+ # page_highlights = {}
1938
+ # current_bbox = {}
1939
+ # last_y1s = {}
1940
+ # mainHeader = ''
1941
+ # subHeader = ''
1942
+ # matched_header_line_norm = heading_to_search
1943
+ # break_collecting = False
1944
+ # heading_norm = normalize_text(heading_to_search)
1945
+ # paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1946
+
1947
+ # for page_num in range(heading_to_searchPageNum,len(doc)):
1948
+ # if page_num in toc_pages:
1949
+ # continue
1950
+ # if break_collecting:
1951
+ # break
1952
+ # page=doc[page_num]
1953
+ # page_height = page.rect.height
1954
+ # blocks = page.get_text("dict")["blocks"]
1955
+
1956
+ # for block in blocks:
1957
+ # if break_collecting:
1958
+ # break
1959
+
1960
+ # lines = block.get("lines", [])
1961
+ # i = 0
1962
+ # while i < len(lines):
1963
+ # if break_collecting:
1964
+ # break
1965
+
1966
+ # spans = lines[i].get("spans", [])
1967
+ # if not spans:
1968
+ # i += 1
1969
+ # continue
1970
+
1971
+ # y0 = spans[0]["bbox"][1]
1972
+ # y1 = spans[0]["bbox"][3]
1973
+ # if y0 < top_margin or y1 > (page_height - bottom_margin):
1974
+ # i += 1
1975
+ # continue
1976
+
1977
+ # line_text = get_spaced_text_from_spans(spans).lower()
1978
+ # line_text_norm = normalize_text(line_text)
1979
+
1980
+ # # Combine with next line if available
1981
+ # if i + 1 < len(lines):
1982
+ # next_spans = lines[i + 1].get("spans", [])
1983
+ # next_line_text = get_spaced_text_from_spans(next_spans).lower()
1984
+ # combined_line_norm = normalize_text(line_text + " " + next_line_text)
1985
+ # else:
1986
+ # combined_line_norm = line_text_norm
1987
+
1988
+ # # Check if we should continue processing
1989
+ # if combined_line_norm and combined_line_norm in paths[0]:
1990
+ # print(combined_line_norm)
1991
+ # headertoContinue1 = combined_line_norm
1992
+ # if combined_line_norm and combined_line_norm in paths[-2]:
1993
+ # print(combined_line_norm)
1994
+ # headertoContinue2 = combined_line_norm
1995
+ # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1996
+ # stringtowrite='Not to be billed'
1997
+ # else:
1998
+ # stringtowrite='To be billed'
1999
+ # # Optimized header matching
2000
+ # existsfull = (
2001
+ # ( combined_line_norm in allchildrenheaders_set or
2002
+ # combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2003
+ # )
2004
+
2005
+ # # New word-based matching
2006
+ # current_line_words = set(combined_line_norm.split())
2007
+ # heading_words = set(heading_norm.split())
2008
+ # all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2009
+
2010
+ # substring_match = (
2011
+ # heading_norm in combined_line_norm or
2012
+ # combined_line_norm in heading_norm or
2013
+ # all_words_match # Include the new word-based matching
2014
+ # )
2015
+ # # substring_match = (
2016
+ # # heading_norm in combined_line_norm or
2017
+ # # combined_line_norm in heading_norm
2018
+ # # )
2019
+
2020
+ # if (substring_match and existsfull and not collecting and
2021
+ # len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2022
+
2023
+ # # Check header conditions more efficiently
2024
+ # header_spans = [
2025
+ # span for span in spans
2026
+ # if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2027
+ # # and span['size'] >= subsubheaderFontSize
2028
+ # and span['size'] < mainHeaderFontSize)
2029
+ # ]
2030
+ # if header_spans:
2031
+ # collecting = True
2032
+ # matched_header_font_size = max(span["size"] for span in header_spans)
2033
+ # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2034
+
2035
+ # collected_lines.append(line_text)
2036
+ # valid_spans = [span for span in spans if span.get("bbox")]
2037
+
2038
+ # if valid_spans:
2039
+ # x0s = [span["bbox"][0] for span in valid_spans]
2040
+ # x1s = [span["bbox"][2] for span in valid_spans]
2041
+ # y0s = [span["bbox"][1] for span in valid_spans]
2042
+ # y1s = [span["bbox"][3] for span in valid_spans]
2043
+
2044
+ # header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2045
+
2046
+ # if page_num in current_bbox:
2047
+ # cb = current_bbox[page_num]
2048
+ # current_bbox[page_num] = [
2049
+ # min(cb[0], header_bbox[0]),
2050
+ # min(cb[1], header_bbox[1]),
2051
+ # max(cb[2], header_bbox[2]),
2052
+ # max(cb[3], header_bbox[3])
2053
+ # ]
2054
+ # else:
2055
+ # current_bbox[page_num] = header_bbox
2056
+ # last_y1s[page_num] = header_bbox[3]
2057
+ # x0, y0, x1, y1 = header_bbox
2058
+
2059
+ # zoom = 200
2060
+ # left = int(x0)
2061
+ # top = int(y0)
2062
+ # zoom_str = f"{zoom},{left},{top}"
2063
+ # pageNumberFound = page_num + 1
2064
+
2065
+ # # Build the query parameters
2066
+ # params = {
2067
+ # 'pdfLink': pdf_path, # Your PDF link
2068
+ # 'keyword': heading_to_search, # Your keyword (could be a string or list)
2069
+ # }
2070
+
2071
+ # # URL encode each parameter
2072
+ # encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2073
+
2074
+ # # Construct the final encoded link
2075
+ # encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2076
+
2077
+ # # Correctly construct the final URL with page and zoom
2078
+ # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2079
+
2080
+ # # Get current date and time
2081
+ # now = datetime.now()
2082
+
2083
+ # # Format the output
2084
+ # formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2085
+ # # Optionally, add the URL to a DataFrame
2086
+
2087
+
2088
+ # data_entry = {
2089
+ # "NBSLink": final_url,
2090
+ # "Subject": heading_to_search,
2091
+ # "Page": str(pageNumberFound),
2092
+ # "Author": "ADR",
2093
+ # "Creation Date": formatted_time,
2094
+ # "Layer": "Initial",
2095
+ # "Code": stringtowrite,
2096
+ # "head above 1": paths[-2],
2097
+ # "head above 2": paths[0],
2098
+ # "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2099
+ # }
2100
+ # data_list_JSON.append(data_entry)
2101
+
2102
+ # # Convert list to JSON
2103
+ # json_output = json.dumps(data_list_JSON, indent=4)
2104
+
2105
+ # print("Final URL:", final_url)
2106
+ # i += 2
2107
+ # continue
2108
+ # else:
2109
+ # if (substring_match and not collecting and
2110
+ # len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2111
+
2112
+ # # Calculate word match percentage
2113
+ # word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2114
+
2115
+ # # Check if at least 70% of header words exist in this line
2116
+ # meets_word_threshold = word_match_percent >= 100
2117
+
2118
+ # # Check header conditions (including word threshold)
2119
+ # header_spans = [
2120
+ # span for span in spans
2121
+ # if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2122
+ # # and span['size'] >= subsubheaderFontSize
2123
+ # and span['size'] < mainHeaderFontSize)
2124
+ # ]
2125
+
2126
+ # if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
2127
+ # collecting = True
2128
+ # matched_header_font_size = max(span["size"] for span in header_spans)
2129
+ # print(f"📥 Start collecting after header: {combined_line_norm} "
2130
+ # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2131
+
2132
+ # collected_lines.append(line_text)
2133
+ # valid_spans = [span for span in spans if span.get("bbox")]
2134
+
2135
+ # if valid_spans:
2136
+ # x0s = [span["bbox"][0] for span in valid_spans]
2137
+ # x1s = [span["bbox"][2] for span in valid_spans]
2138
+ # y0s = [span["bbox"][1] for span in valid_spans]
2139
+ # y1s = [span["bbox"][3] for span in valid_spans]
2140
+
2141
+ # header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2142
+
2143
+ # if page_num in current_bbox:
2144
+ # cb = current_bbox[page_num]
2145
+ # current_bbox[page_num] = [
2146
+ # min(cb[0], header_bbox[0]),
2147
+ # min(cb[1], header_bbox[1]),
2148
+ # max(cb[2], header_bbox[2]),
2149
+ # max(cb[3], header_bbox[3])
2150
+ # ]
2151
+ # else:
2152
+ # current_bbox[page_num] = header_bbox
2153
+
2154
+ # last_y1s[page_num] = header_bbox[3]
2155
+ # x0, y0, x1, y1 = header_bbox
2156
+ # zoom = 200
2157
+ # left = int(x0)
2158
+ # top = int(y0)
2159
+ # zoom_str = f"{zoom},{left},{top}"
2160
+ # pageNumberFound = page_num + 1
2161
+
2162
+ # # Build the query parameters
2163
+ # params = {
2164
+ # 'pdfLink': pdf_path, # Your PDF link
2165
+ # 'keyword': heading_to_search, # Your keyword (could be a string or list)
2166
+ # }
2167
+
2168
+ # # URL encode each parameter
2169
+ # encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2170
+
2171
+ # # Construct the final encoded link
2172
+ # encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2173
+
2174
+ # # Correctly construct the final URL with page and zoom
2175
+ # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2176
+
2177
+ # # Get current date and time
2178
+ # now = datetime.now()
2179
+
2180
+ # # Format the output
2181
+ # formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2182
+ # # Optionally, add the URL to a DataFrame
2183
+
2184
+
2185
+ # data_entry = {
2186
+ # "NBSLink": final_url,
2187
+ # "Subject": heading_to_search,
2188
+ # "Page": str(pageNumberFound),
2189
+ # "Author": "ADR",
2190
+ # "Creation Date": formatted_time,
2191
+ # "Layer": "Initial",
2192
+ # "Code": stringtowrite,
2193
+ # "head above 1": paths[-2],
2194
+ # "head above 2": paths[0],
2195
+ # "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2196
+ # }
2197
+ # data_list_JSON.append(data_entry)
2198
+
2199
+ # # Convert list to JSON
2200
+ # json_output = json.dumps(data_list_JSON, indent=4)
2201
+
2202
+ # print("Final URL:", final_url)
2203
+ # i += 2
2204
+ # continue
2205
+ # if collecting:
2206
+ # norm_line = normalize_text(line_text)
2207
+
2208
+ # # Optimized URL check
2209
+ # if url_pattern.match(norm_line):
2210
+ # line_is_header = False
2211
+ # else:
2212
+ # line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2213
+
2214
+ # if line_is_header:
2215
+ # header_font_size = max(span["size"] for span in spans)
2216
+ # is_probably_real_header = (
2217
+ # header_font_size >= matched_header_font_size and
2218
+ # is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2219
+ # len(line_text.strip()) > 2
2220
+ # )
2221
+
2222
+ # if (norm_line != matched_header_line_norm and
2223
+ # norm_line != heading_norm and
2224
+ # is_probably_real_header):
2225
+ # if line_text not in heading_norm:
2226
+ # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2227
+ # collecting = False
2228
+ # done = True
2229
+ # headertoContinue1 = False
2230
+ # headertoContinue2=False
2231
+ # for page_num, bbox in current_bbox.items():
2232
+ # bbox[3] = last_y1s.get(page_num, bbox[3])
2233
+ # page_highlights[page_num] = bbox
2234
+ # highlight_boxes(docHighlights, page_highlights,stringtowrite)
2235
+
2236
+ # break_collecting = True
2237
+ # break
2238
+
2239
+ # if break_collecting:
2240
+ # break
2241
+
2242
+ # collected_lines.append(line_text)
2243
+ # valid_spans = [span for span in spans if span.get("bbox")]
2244
+ # if valid_spans:
2245
+ # x0s = [span["bbox"][0] for span in valid_spans]
2246
+ # x1s = [span["bbox"][2] for span in valid_spans]
2247
+ # y0s = [span["bbox"][1] for span in valid_spans]
2248
+ # y1s = [span["bbox"][3] for span in valid_spans]
2249
+
2250
+ # line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2251
+
2252
+ # if page_num in current_bbox:
2253
+ # cb = current_bbox[page_num]
2254
+ # current_bbox[page_num] = [
2255
+ # min(cb[0], line_bbox[0]),
2256
+ # min(cb[1], line_bbox[1]),
2257
+ # max(cb[2], line_bbox[2]),
2258
+ # max(cb[3], line_bbox[3])
2259
+ # ]
2260
+ # else:
2261
+ # current_bbox[page_num] = line_bbox
2262
+
2263
+ # last_y1s[page_num] = line_bbox[3]
2264
+ # i += 1
2265
+
2266
+ # if not done:
2267
+ # for page_num, bbox in current_bbox.items():
2268
+ # bbox[3] = last_y1s.get(page_num, bbox[3])
2269
+ # page_highlights[page_num] = bbox
2270
+ # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2271
+ # stringtowrite='Not to be billed'
2272
+ # else:
2273
+ # stringtowrite='To be billed'
2274
+ # highlight_boxes(docHighlights, page_highlights,stringtowrite)
2275
+
2276
+ # # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2277
+
2278
+ # pdf_bytes = BytesIO()
2279
+ # docHighlights.save(pdf_bytes)
2280
+ # print('JSONN',json_output)
2281
+ # return pdf_bytes.getvalue(), docHighlights , json_output
2282
 
2283
 
2284
 
 
2358
  subHeaderFontSize= top_3_font_sizes[1]
2359
  subsubheaderFontSize= top_3_font_sizes[1]
2360
 
2361
+
 
2362
  # Preload all pages to avoid repeated loading
2363
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2364
 
 
2366
  heading_to_search = heading_to_searchDict['text']
2367
  heading_to_searchPageNum = heading_to_searchDict['page']
2368
 
2369
+
 
2370
  # Initialize variables
2371
  headertoContinue1 = False
2372
  headertoContinue2 = False
 
2427
 
2428
  # Check if we should continue processing
2429
  if combined_line_norm and combined_line_norm in paths[0]:
 
2430
  headertoContinue1 = combined_line_norm
2431
  if combined_line_norm and combined_line_norm in paths[-2]:
 
2432
  headertoContinue2 = combined_line_norm
2433
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2434
  stringtowrite='Not to be billed'
 
2469
  Alltext_tobebilled+=combined_line_norm
2470
  collecting = True
2471
  matched_header_font_size = max(span["size"] for span in header_spans)
2472
+
 
2473
  collected_lines.append(line_text)
2474
  valid_spans = [span for span in spans if span.get("bbox")]
2475
 
 
2540
  # Convert list to JSON
2541
  json_output = json.dumps(data_list_JSON, indent=4)
2542
 
 
2543
  i += 2
2544
  continue
2545
  else:
 
2564
  Alltext_tobebilled+=combined_line_norm
2565
  collecting = True
2566
  matched_header_font_size = max(span["size"] for span in header_spans)
2567
+
 
 
2568
  collected_lines.append(line_text)
2569
  valid_spans = [span for span in spans if span.get("bbox")]
2570
 
 
2635
  # Convert list to JSON
2636
  json_output = json.dumps(data_list_JSON, indent=4)
2637
 
 
2638
  i += 2
2639
  continue
2640
  if collecting:
 
2658
  norm_line != heading_norm and
2659
  is_probably_real_header):
2660
  if line_text not in heading_norm:
 
2661
  collecting = False
2662
  done = True
2663
  headertoContinue1 = False
 
2711
 
2712
  pdf_bytes = BytesIO()
2713
  docHighlights.save(pdf_bytes)
2714
+
2715
  return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
2716
 
2717
 
 
2771
  else:
2772
  for item in headingjson:
2773
  listofheadingsfromrawan.append(normalize_text(item['Subject']))
2774
+ # print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan)
2775
  # Precompute all children headers once
2776
  allchildrenheaders = listofheadingsfromrawan
2777
+ # print('hereeeeeeeeeeeeeee00',allchildrenheaders)
2778
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2779
 
2780
  df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
 
2787
  subHeaderFontSize= top_3_font_sizes[1]
2788
  subsubheaderFontSize= top_3_font_sizes[1]
2789
 
2790
+ # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
2791
 
2792
  # Preload all pages to avoid repeated loading
2793
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
 
2801
  heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1
2802
  incomingheader = heading_to_searchDict['head above 1']
2803
 
2804
+ # print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum)
2805
  done = False
2806
  collecting = False
2807
  collected_lines = []
 
2815
  heading_norm = normalize_text(heading_to_search)
2816
 
2817
  for page_num in range(heading_to_searchPageNum,len(doc)):
2818
+
2819
  if page_num in toc_pages:
2820
  continue
2821
  if break_collecting:
 
2885
  if header_spans:
2886
  collecting = True
2887
  matched_header_font_size = max(span["size"] for span in header_spans)
2888
+ # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2889
 
2890
  collected_lines.append(line_text)
2891
  valid_spans = [span for span in spans if span.get("bbox")]
 
2942
  if type(heading_to_searchDict) != str:
2943
  heading_to_searchDict['NBSLink']=new_url
2944
  newjsonList.append(heading_to_searchDict)
2945
+ # print("Final URL:", final_url)
2946
  i += 2
2947
  continue
2948
  else:
 
2966
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
2967
  collecting = True
2968
  matched_header_font_size = max(span["size"] for span in header_spans)
2969
+
 
 
2970
  collected_lines.append(line_text)
2971
  valid_spans = [span for span in spans if span.get("bbox")]
2972
 
 
3015
  if type(heading_to_searchDict) != str:
3016
  heading_to_searchDict['NBSLink']=new_url
3017
  newjsonList.append(heading_to_searchDict)
 
3018
  i += 2
3019
  continue
3020
  if collecting:
 
3038
  norm_line != heading_norm and
3039
  is_probably_real_header):
3040
  if line_text not in heading_norm:
3041
+ # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
3042
  collecting = False
3043
  done = True
3044
  headertoContinue1 = False