Marthee committed on
Commit
5997297
·
verified ·
1 Parent(s): 58e3f42

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +156 -87
InitialMarkups.py CHANGED
@@ -42,7 +42,7 @@ import urllib.parse
42
 
43
  def changepdflinks(json_data, pdf_path):
44
  print('ll , ' ,json_data,pdf_path)
45
- base_viewer_link = "https://findconsole-initialmarkups.hf.space/view-pdf?"
46
 
47
  updated_json = []
48
  for entry in json_data:
@@ -54,7 +54,7 @@ def changepdflinks(json_data, pdf_path):
54
  encoded_pdf_link = urllib.parse.quote(pdf_path, safe='')
55
 
56
  # Construct the final link
57
- final_url = f"{base_viewer_link}pdfLink={encoded_pdf_link}#page={str(page_str)}&zoom={zoom_str}"
58
 
59
  # Replace the old NBSLink value with the full URL
60
  entry["NBSLink"] = final_url
@@ -891,7 +891,7 @@ def extract_section_under_header(multiplePDF_Paths):
891
  "Code": stringtowrite,
892
  "head above 1": paths[-2],
893
  "head above 2": paths[0],
894
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
895
  }
896
  data_list_JSON.append(data_entry)
897
 
@@ -985,7 +985,7 @@ def extract_section_under_header(multiplePDF_Paths):
985
  "Code": stringtowrite,
986
  "head above 1": paths[-2],
987
  "head above 2": paths[0],
988
- "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
989
  }
990
  data_list_JSON.append(data_entry)
991
 
@@ -2005,7 +2005,9 @@ def extract_section_under_header_tobebilled2(pdf_path):
2005
 
2006
 
2007
 
 
2008
  def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
 
2009
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
2010
  filenames=[]
2011
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
@@ -2028,52 +2030,86 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2028
  # Optimized URL handling
2029
  if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2030
  pdf_path = pdf_path.replace('dl=0', 'dl=1')
2031
-
2032
  # Cache frequently used values
2033
  response = requests.get(pdf_path)
2034
  pdf_content = BytesIO(response.content)
2035
  if not pdf_content:
2036
  raise ValueError("No valid PDF content found.")
2037
-
2038
  doc = fitz.open(stream=pdf_content, filetype="pdf")
2039
  docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2040
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2041
-
2042
  # Precompute regex patterns
2043
  dot_pattern = re.compile(r'\.{3,}')
2044
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
2045
-
 
2046
  def get_toc_page_numbers(doc, max_pages_to_check=15):
2047
  toc_pages = []
 
 
 
 
 
 
 
 
 
2048
  for page_num in range(min(len(doc), max_pages_to_check)):
2049
  page = doc.load_page(page_num)
2050
  blocks = page.get_text("dict")["blocks"]
2051
-
2052
  dot_line_count = 0
 
 
2053
  for block in blocks:
2054
  for line in block.get("lines", []):
2055
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
 
 
 
2056
  if dot_pattern.search(line_text):
2057
  dot_line_count += 1
2058
-
2059
- if dot_line_count >= 1:
 
 
 
 
 
 
 
 
 
2060
  toc_pages.append(page_num)
2061
-
2062
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
2063
-
 
 
 
 
 
 
 
 
2064
  toc_pages = get_toc_page_numbers(doc)
2065
-
2066
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2067
  doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2068
  )
2069
-
2070
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2071
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2072
-
 
 
2073
  # Precompute all children headers once
2074
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2075
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2076
-
2077
  # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2078
  dictionaryNBS={}
2079
  data_list_JSON = []
@@ -2085,16 +2121,16 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2085
  mainHeaderFontSize= top_3_font_sizes[0]
2086
  subHeaderFontSize= top_3_font_sizes[1]
2087
  subsubheaderFontSize= top_3_font_sizes[1]
2088
-
2089
-
2090
-
2091
  # Preload all pages to avoid repeated loading
2092
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2093
-
2094
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
2095
  heading_to_search = heading_to_searchDict['text']
2096
  heading_to_searchPageNum = heading_to_searchDict['page']
2097
-
2098
  # Initialize variables
2099
  headertoContinue1 = False
2100
  headertoContinue2 = False
@@ -2112,45 +2148,48 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2112
  heading_norm = normalize_text(heading_to_search)
2113
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2114
  for page_num in range(heading_to_searchPageNum,len(doc)):
 
2115
  # print(heading_to_search)
2116
  if paths[0].strip().lower() != currentgroupname.strip().lower():
2117
  Alltexttobebilled+= paths[0] +'\n'
2118
  currentgroupname=paths[0]
2119
  # print(paths[0])
2120
-
2121
-
2122
  if page_num in toc_pages:
2123
  continue
 
2124
  if break_collecting:
2125
  break
2126
  page=doc[page_num]
2127
  page_height = page.rect.height
2128
  blocks = page.get_text("dict")["blocks"]
2129
-
2130
  for block in blocks:
2131
  if break_collecting:
2132
  break
2133
-
2134
  lines = block.get("lines", [])
2135
  i = 0
2136
  while i < len(lines):
2137
  if break_collecting:
2138
  break
2139
-
2140
  spans = lines[i].get("spans", [])
2141
  if not spans:
2142
  i += 1
2143
  continue
2144
-
2145
  y0 = spans[0]["bbox"][1]
2146
  y1 = spans[0]["bbox"][3]
2147
  if y0 < top_margin or y1 > (page_height - bottom_margin):
2148
  i += 1
2149
  continue
2150
-
2151
  line_text = get_spaced_text_from_spans(spans).lower()
2152
  line_text_norm = normalize_text(line_text)
2153
-
 
2154
  # Combine with next line if available
2155
  if i + 1 < len(lines):
2156
  next_spans = lines[i + 1].get("spans", [])
@@ -2158,16 +2197,14 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2158
  combined_line_norm = normalize_text(line_text + " " + next_line_text)
2159
  else:
2160
  combined_line_norm = line_text_norm
2161
-
2162
  # Check if we should continue processing
2163
  if combined_line_norm and combined_line_norm in paths[0]:
2164
-
2165
  headertoContinue1 = combined_line_norm
2166
  if combined_line_norm and combined_line_norm in paths[-2]:
2167
-
2168
  headertoContinue2 = combined_line_norm
2169
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2170
  last_path = paths[-2].lower()
 
2171
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
2172
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() or 'workmanship' in paths[-2].lower() or 'testing' in paths[-2].lower() or 'labeling' in paths[-2].lower():
2173
  if any(keyword in last_path for keyword in keywords):
@@ -2178,18 +2215,18 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2178
  # Alltexttobebilled+= combined_line_norm #################################################
2179
  if matched_header_line_norm in combined_line_norm:
2180
  Alltexttobebilled+='\n'
2181
- Alltexttobebilled+= ' '+combined_line_norm
2182
  # Optimized header matching
2183
  existsfull = (
2184
  ( combined_line_norm in allchildrenheaders_set or
2185
  combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2186
  )
2187
-
2188
  # New word-based matching
2189
  current_line_words = set(combined_line_norm.split())
2190
  heading_words = set(heading_norm.split())
2191
  all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2192
-
2193
  substring_match = (
2194
  heading_norm in combined_line_norm or
2195
  combined_line_norm in heading_norm or
@@ -2199,10 +2236,10 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2199
  # heading_norm in combined_line_norm or
2200
  # combined_line_norm in heading_norm
2201
  # )
2202
-
2203
  if (substring_match and existsfull and not collecting and
2204
  len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2205
-
2206
  # Check header conditions more efficiently
2207
  header_spans = [
2208
  span for span in spans
@@ -2215,18 +2252,18 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2215
  # if stringtowrite=='To be billed':
2216
  # Alltexttobebilled+='\n'
2217
  matched_header_font_size = max(span["size"] for span in header_spans)
2218
-
2219
  # collected_lines.append(line_text)
2220
  valid_spans = [span for span in spans if span.get("bbox")]
2221
-
2222
  if valid_spans:
2223
  x0s = [span["bbox"][0] for span in valid_spans]
2224
  x1s = [span["bbox"][2] for span in valid_spans]
2225
  y0s = [span["bbox"][1] for span in valid_spans]
2226
  y1s = [span["bbox"][3] for span in valid_spans]
2227
-
2228
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2229
-
2230
  if page_num in current_bbox:
2231
  cb = current_bbox[page_num]
2232
  current_bbox[page_num] = [
@@ -2239,36 +2276,36 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2239
  current_bbox[page_num] = header_bbox
2240
  last_y1s[page_num] = header_bbox[3]
2241
  x0, y0, x1, y1 = header_bbox
2242
-
2243
  zoom = 200
2244
  left = int(x0)
2245
  top = int(y0)
2246
  zoom_str = f"{zoom},{left},{top}"
2247
  pageNumberFound = page_num + 1
2248
-
2249
  # Build the query parameters
2250
  params = {
2251
  'pdfLink': pdf_path, # Your PDF link
2252
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2253
  }
2254
-
2255
  # URL encode each parameter
2256
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2257
-
2258
  # Construct the final encoded link
2259
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2260
-
2261
  # Correctly construct the final URL with page and zoom
2262
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2263
-
2264
  # Get current date and time
2265
  now = datetime.now()
2266
-
2267
  # Format the output
2268
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2269
  # Optionally, add the URL to a DataFrame
2270
-
2271
-
2272
  data_entry = {
2273
  "PDF Name":filename,
2274
  "NBSLink": zoom_str,
@@ -2284,23 +2321,23 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2284
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2285
  }
2286
  data_list_JSON.append(data_entry)
2287
-
2288
  # Convert list to JSON
2289
  # json_output = [data_list_JSON]
2290
  # json_output = json.dumps(data_list_JSON, indent=4)
2291
-
2292
  i += 2
2293
  continue
2294
  else:
2295
  if (substring_match and not collecting and
2296
  len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2297
-
2298
  # Calculate word match percentage
2299
  word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2300
-
2301
  # Require a 100% word-match ratio between the heading and this line
2302
  meets_word_threshold = word_match_percent >= 100
2303
-
2304
  # Check header conditions (including word threshold)
2305
  header_spans = [
2306
  span for span in spans
@@ -2308,7 +2345,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2308
  # and span['size'] >= subsubheaderFontSize
2309
  and span['size'] < mainHeaderFontSize)
2310
  ]
2311
-
2312
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2313
  collecting = True
2314
  if stringtowrite=='To be billed':
@@ -2316,18 +2353,18 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2316
  # if stringtowrite=='To be billed':
2317
  # Alltexttobebilled+= ' '+ combined_line_norm
2318
  matched_header_font_size = max(span["size"] for span in header_spans)
2319
-
2320
- collected_lines.append(line_text)
2321
  valid_spans = [span for span in spans if span.get("bbox")]
2322
-
2323
  if valid_spans:
2324
  x0s = [span["bbox"][0] for span in valid_spans]
2325
  x1s = [span["bbox"][2] for span in valid_spans]
2326
  y0s = [span["bbox"][1] for span in valid_spans]
2327
  y1s = [span["bbox"][3] for span in valid_spans]
2328
-
2329
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2330
-
2331
  if page_num in current_bbox:
2332
  cb = current_bbox[page_num]
2333
  current_bbox[page_num] = [
@@ -2338,7 +2375,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2338
  ]
2339
  else:
2340
  current_bbox[page_num] = header_bbox
2341
-
2342
  last_y1s[page_num] = header_bbox[3]
2343
  x0, y0, x1, y1 = header_bbox
2344
  zoom = 200
@@ -2346,30 +2383,30 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2346
  top = int(y0)
2347
  zoom_str = f"{zoom},{left},{top}"
2348
  pageNumberFound = page_num + 1
2349
-
2350
  # Build the query parameters
2351
  params = {
2352
  'pdfLink': pdf_path, # Your PDF link
2353
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2354
  }
2355
-
2356
  # URL encode each parameter
2357
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2358
-
2359
  # Construct the final encoded link
2360
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2361
-
2362
  # Correctly construct the final URL with page and zoom
2363
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2364
-
2365
  # Get current date and time
2366
  now = datetime.now()
2367
-
2368
  # Format the output
2369
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2370
  # Optionally, add the URL to a DataFrame
2371
-
2372
-
2373
  data_entry = {
2374
  "PDF Name":filename,
2375
  "NBSLink": zoom_str,
@@ -2385,23 +2422,34 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2385
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2386
  }
2387
  data_list_JSON.append(data_entry)
2388
-
2389
  # Convert list to JSON
2390
  # json_output = [data_list_JSON]
2391
  # json_output = json.dumps(data_list_JSON, indent=4)
2392
-
2393
-
2394
  i += 2
2395
  continue
2396
  if collecting:
2397
  norm_line = normalize_text(line_text)
2398
-
 
 
 
 
 
 
 
 
 
 
 
2399
  # Optimized URL check
2400
  if url_pattern.match(norm_line):
2401
  line_is_header = False
2402
  else:
2403
  line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2404
-
2405
  if line_is_header:
2406
  header_font_size = max(span["size"] for span in spans)
2407
  is_probably_real_header = (
@@ -2409,7 +2457,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2409
  is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2410
  len(line_text.strip()) > 2
2411
  )
2412
-
2413
  if (norm_line != matched_header_line_norm and
2414
  norm_line != heading_norm and
2415
  is_probably_real_header):
@@ -2422,23 +2470,41 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2422
  bbox[3] = last_y1s.get(page_num, bbox[3])
2423
  page_highlights[page_num] = bbox
2424
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2425
-
2426
  break_collecting = True
2427
  break
2428
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2429
  if break_collecting:
2430
  break
2431
-
2432
- collected_lines.append(line_text)
 
 
 
 
2433
  valid_spans = [span for span in spans if span.get("bbox")]
2434
  if valid_spans:
2435
  x0s = [span["bbox"][0] for span in valid_spans]
2436
  x1s = [span["bbox"][2] for span in valid_spans]
2437
  y0s = [span["bbox"][1] for span in valid_spans]
2438
  y1s = [span["bbox"][3] for span in valid_spans]
2439
-
2440
  line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2441
-
2442
  if page_num in current_bbox:
2443
  cb = current_bbox[page_num]
2444
  current_bbox[page_num] = [
@@ -2449,10 +2515,13 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2449
  ]
2450
  else:
2451
  current_bbox[page_num] = line_bbox
2452
-
2453
  last_y1s[page_num] = line_bbox[3]
2454
  i += 1
2455
-
 
 
 
2456
  if not done:
2457
  for page_num, bbox in current_bbox.items():
2458
  bbox[3] = last_y1s.get(page_num, bbox[3])
 
42
 
43
  def changepdflinks(json_data, pdf_path):
44
  print('ll , ' ,json_data,pdf_path)
45
+ # base_viewer_link = "https://findconsole-initialmarkups.hf.space/view-pdf?"
46
 
47
  updated_json = []
48
  for entry in json_data:
 
54
  encoded_pdf_link = urllib.parse.quote(pdf_path, safe='')
55
 
56
  # Construct the final link
57
+ final_url = f"{baselink}pdfLink={encoded_pdf_link}#page={str(page_str)}&zoom={zoom_str}"
58
 
59
  # Replace the old NBSLink value with the full URL
60
  entry["NBSLink"] = final_url
 
891
  "Code": stringtowrite,
892
  "head above 1": paths[-2],
893
  "head above 2": paths[0],
894
+ # "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
895
  }
896
  data_list_JSON.append(data_entry)
897
 
 
985
  "Code": stringtowrite,
986
  "head above 1": paths[-2],
987
  "head above 2": paths[0],
988
+ # "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
989
  }
990
  data_list_JSON.append(data_entry)
991
 
 
2005
 
2006
 
2007
 
2008
+ import datefinder
2009
  def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2010
+ baselink = "https://findconsole-initialmarkups.hf.space/view-pdf?"
2011
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
2012
  filenames=[]
2013
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
 
2030
  # Optimized URL handling
2031
  if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2032
  pdf_path = pdf_path.replace('dl=0', 'dl=1')
2033
+
2034
  # Cache frequently used values
2035
  response = requests.get(pdf_path)
2036
  pdf_content = BytesIO(response.content)
2037
  if not pdf_content:
2038
  raise ValueError("No valid PDF content found.")
2039
+
2040
  doc = fitz.open(stream=pdf_content, filetype="pdf")
2041
  docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2042
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2043
+
2044
  # Precompute regex patterns
2045
  dot_pattern = re.compile(r'\.{3,}')
2046
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
2047
+
2048
+
2049
  def get_toc_page_numbers(doc, max_pages_to_check=15):
2050
  toc_pages = []
2051
+
2052
+ # 1. Existing Dot Pattern (looking for ".....")
2053
+ dot_pattern = re.compile(r"\.{2,}")
2054
+
2055
+ # 2. NEW: Title Pattern (looking for specific headers)
2056
+ # ^ and $ ensure the line is JUST that word (ignoring "The contents of the bag...")
2057
+ # re.IGNORECASE makes it match "CONTENTS", "Contents", "Index", etc.
2058
+ title_pattern = re.compile(r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE)
2059
+
2060
  for page_num in range(min(len(doc), max_pages_to_check)):
2061
  page = doc.load_page(page_num)
2062
  blocks = page.get_text("dict")["blocks"]
2063
+ skip_page_due_to_footer = False
2064
  dot_line_count = 0
2065
+ has_toc_title = False
2066
+
2067
  for block in blocks:
2068
  for line in block.get("lines", []):
2069
+ # Extract text from spans (mimicking get_spaced_text_from_spans)
2070
+ line_text = " ".join([span["text"] for span in line["spans"]]).strip()
2071
+
2072
+ # CHECK A: Does the line have dots?
2073
  if dot_pattern.search(line_text):
2074
  dot_line_count += 1
2075
+
2076
+ # CHECK B: Is this line a Title?
2077
+ # We check this early in the loop. If a page has a title "Contents",
2078
+ # we mark it immediately.
2079
+ if title_pattern.match(line_text):
2080
+ has_toc_title = True
2081
+
2082
+ # CONDITION:
2083
+ # It is a TOC page if it has a Title OR if it has dot leaders.
2084
+ # We use 'dot_line_count >= 1' to be sensitive to single-item lists.
2085
+ if has_toc_title or dot_line_count >= 1:
2086
  toc_pages.append(page_num)
2087
+
2088
+ # RETURN:
2089
+ # If we found TOC pages (e.g., [2, 3]), we return [0, 1, 2, 3]
2090
+ # This covers the cover page, inside cover, and the TOC itself.
2091
+ if toc_pages:
2092
+ last_toc_page = toc_pages[-1]
2093
+ return list(range(0, last_toc_page + 1))
2094
+
2095
+ return [] # Return empty list if nothing found
2096
+
2097
+ # Usage
2098
  toc_pages = get_toc_page_numbers(doc)
2099
+
2100
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2101
  doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2102
  )
2103
+
2104
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2105
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2106
+ # print(listofHeaderstoMarkup)
2107
+ for header, path in listofHeaderstoMarkup:
2108
+ print(path)
2109
  # Precompute all children headers once
2110
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2111
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2112
+
2113
  # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2114
  dictionaryNBS={}
2115
  data_list_JSON = []
 
2121
  mainHeaderFontSize= top_3_font_sizes[0]
2122
  subHeaderFontSize= top_3_font_sizes[1]
2123
  subsubheaderFontSize= top_3_font_sizes[1]
2124
+
2125
+
2126
+
2127
  # Preload all pages to avoid repeated loading
2128
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2129
+ skip_page_due_to_footer = False
2130
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
2131
  heading_to_search = heading_to_searchDict['text']
2132
  heading_to_searchPageNum = heading_to_searchDict['page']
2133
+
2134
  # Initialize variables
2135
  headertoContinue1 = False
2136
  headertoContinue2 = False
 
2148
  heading_norm = normalize_text(heading_to_search)
2149
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2150
  for page_num in range(heading_to_searchPageNum,len(doc)):
2151
+ skip_page_due_to_footer = False
2152
  # print(heading_to_search)
2153
  if paths[0].strip().lower() != currentgroupname.strip().lower():
2154
  Alltexttobebilled+= paths[0] +'\n'
2155
  currentgroupname=paths[0]
2156
  # print(paths[0])
2157
+
2158
+
2159
  if page_num in toc_pages:
2160
  continue
2161
+
2162
  if break_collecting:
2163
  break
2164
  page=doc[page_num]
2165
  page_height = page.rect.height
2166
  blocks = page.get_text("dict")["blocks"]
2167
+
2168
  for block in blocks:
2169
  if break_collecting:
2170
  break
2171
+
2172
  lines = block.get("lines", [])
2173
  i = 0
2174
  while i < len(lines):
2175
  if break_collecting:
2176
  break
2177
+
2178
  spans = lines[i].get("spans", [])
2179
  if not spans:
2180
  i += 1
2181
  continue
2182
+
2183
  y0 = spans[0]["bbox"][1]
2184
  y1 = spans[0]["bbox"][3]
2185
  if y0 < top_margin or y1 > (page_height - bottom_margin):
2186
  i += 1
2187
  continue
2188
+
2189
  line_text = get_spaced_text_from_spans(spans).lower()
2190
  line_text_norm = normalize_text(line_text)
2191
+
2192
+
2193
  # Combine with next line if available
2194
  if i + 1 < len(lines):
2195
  next_spans = lines[i + 1].get("spans", [])
 
2197
  combined_line_norm = normalize_text(line_text + " " + next_line_text)
2198
  else:
2199
  combined_line_norm = line_text_norm
 
2200
  # Check if we should continue processing
2201
  if combined_line_norm and combined_line_norm in paths[0]:
 
2202
  headertoContinue1 = combined_line_norm
2203
  if combined_line_norm and combined_line_norm in paths[-2]:
 
2204
  headertoContinue2 = combined_line_norm
2205
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2206
  last_path = paths[-2].lower()
2207
+
2208
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
2209
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() or 'workmanship' in paths[-2].lower() or 'testing' in paths[-2].lower() or 'labeling' in paths[-2].lower():
2210
  if any(keyword in last_path for keyword in keywords):
 
2215
  # Alltexttobebilled+= combined_line_norm #################################################
2216
  if matched_header_line_norm in combined_line_norm:
2217
  Alltexttobebilled+='\n'
2218
+ Alltexttobebilled+= ' '+combined_line_norm
2219
  # Optimized header matching
2220
  existsfull = (
2221
  ( combined_line_norm in allchildrenheaders_set or
2222
  combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2223
  )
2224
+
2225
  # New word-based matching
2226
  current_line_words = set(combined_line_norm.split())
2227
  heading_words = set(heading_norm.split())
2228
  all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2229
+
2230
  substring_match = (
2231
  heading_norm in combined_line_norm or
2232
  combined_line_norm in heading_norm or
 
2236
  # heading_norm in combined_line_norm or
2237
  # combined_line_norm in heading_norm
2238
  # )
2239
+
2240
  if (substring_match and existsfull and not collecting and
2241
  len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2242
+
2243
  # Check header conditions more efficiently
2244
  header_spans = [
2245
  span for span in spans
 
2252
  # if stringtowrite=='To be billed':
2253
  # Alltexttobebilled+='\n'
2254
  matched_header_font_size = max(span["size"] for span in header_spans)
2255
+
2256
  # collected_lines.append(line_text)
2257
  valid_spans = [span for span in spans if span.get("bbox")]
2258
+
2259
  if valid_spans:
2260
  x0s = [span["bbox"][0] for span in valid_spans]
2261
  x1s = [span["bbox"][2] for span in valid_spans]
2262
  y0s = [span["bbox"][1] for span in valid_spans]
2263
  y1s = [span["bbox"][3] for span in valid_spans]
2264
+
2265
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2266
+
2267
  if page_num in current_bbox:
2268
  cb = current_bbox[page_num]
2269
  current_bbox[page_num] = [
 
2276
  current_bbox[page_num] = header_bbox
2277
  last_y1s[page_num] = header_bbox[3]
2278
  x0, y0, x1, y1 = header_bbox
2279
+
2280
  zoom = 200
2281
  left = int(x0)
2282
  top = int(y0)
2283
  zoom_str = f"{zoom},{left},{top}"
2284
  pageNumberFound = page_num + 1
2285
+
2286
  # Build the query parameters
2287
  params = {
2288
  'pdfLink': pdf_path, # Your PDF link
2289
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2290
  }
2291
+
2292
  # URL encode each parameter
2293
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2294
+
2295
  # Construct the final encoded link
2296
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2297
+
2298
  # Correctly construct the final URL with page and zoom
2299
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2300
+
2301
  # Get current date and time
2302
  now = datetime.now()
2303
+
2304
  # Format the output
2305
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2306
  # Optionally, add the URL to a DataFrame
2307
+
2308
+
2309
  data_entry = {
2310
  "PDF Name":filename,
2311
  "NBSLink": zoom_str,
 
2321
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2322
  }
2323
  data_list_JSON.append(data_entry)
2324
+
2325
  # Convert list to JSON
2326
  # json_output = [data_list_JSON]
2327
  # json_output = json.dumps(data_list_JSON, indent=4)
2328
+
2329
  i += 2
2330
  continue
2331
  else:
2332
  if (substring_match and not collecting and
2333
  len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2334
+
2335
  # Calculate word match percentage
2336
  word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2337
+
2338
  # Require a 100% word-match ratio between the heading and this line
2339
  meets_word_threshold = word_match_percent >= 100
2340
+
2341
  # Check header conditions (including word threshold)
2342
  header_spans = [
2343
  span for span in spans
 
2345
  # and span['size'] >= subsubheaderFontSize
2346
  and span['size'] < mainHeaderFontSize)
2347
  ]
2348
+
2349
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2350
  collecting = True
2351
  if stringtowrite=='To be billed':
 
2353
  # if stringtowrite=='To be billed':
2354
  # Alltexttobebilled+= ' '+ combined_line_norm
2355
  matched_header_font_size = max(span["size"] for span in header_spans)
2356
+ # if normalize_text(line_text)!=heading_norm:
2357
+ # collected_lines.append(line_text)
2358
  valid_spans = [span for span in spans if span.get("bbox")]
2359
+
2360
  if valid_spans:
2361
  x0s = [span["bbox"][0] for span in valid_spans]
2362
  x1s = [span["bbox"][2] for span in valid_spans]
2363
  y0s = [span["bbox"][1] for span in valid_spans]
2364
  y1s = [span["bbox"][3] for span in valid_spans]
2365
+
2366
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2367
+
2368
  if page_num in current_bbox:
2369
  cb = current_bbox[page_num]
2370
  current_bbox[page_num] = [
 
2375
  ]
2376
  else:
2377
  current_bbox[page_num] = header_bbox
2378
+
2379
  last_y1s[page_num] = header_bbox[3]
2380
  x0, y0, x1, y1 = header_bbox
2381
  zoom = 200
 
2383
  top = int(y0)
2384
  zoom_str = f"{zoom},{left},{top}"
2385
  pageNumberFound = page_num + 1
2386
+
2387
  # Build the query parameters
2388
  params = {
2389
  'pdfLink': pdf_path, # Your PDF link
2390
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2391
  }
2392
+
2393
  # URL encode each parameter
2394
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2395
+
2396
  # Construct the final encoded link
2397
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2398
+
2399
  # Correctly construct the final URL with page and zoom
2400
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2401
+
2402
  # Get current date and time
2403
  now = datetime.now()
2404
+
2405
  # Format the output
2406
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2407
  # Optionally, add the URL to a DataFrame
2408
+
2409
+
2410
  data_entry = {
2411
  "PDF Name":filename,
2412
  "NBSLink": zoom_str,
 
2422
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2423
  }
2424
  data_list_JSON.append(data_entry)
2425
+
2426
  # Convert list to JSON
2427
  # json_output = [data_list_JSON]
2428
  # json_output = json.dumps(data_list_JSON, indent=4)
2429
+
2430
+
2431
  i += 2
2432
  continue
2433
  if collecting:
2434
  norm_line = normalize_text(line_text)
2435
+ # ...
2436
+ matches = list(datefinder.find_dates(line_text_norm))
2437
+
2438
+ if matches: # If the list is not empty, a date was found
2439
+ # --- MODIFY THIS BLOCK ---
2440
+ print(f"Skipping rest of page {page_num} due to date: {line_text_norm}") # Optional
2441
+ skip_page_due_to_footer = True
2442
+ i += 1
2443
+ break
2444
+ # --- END OF CHANGE ---
2445
+ # ...
2446
+ print(norm_line)
2447
  # Optimized URL check
2448
  if url_pattern.match(norm_line):
2449
  line_is_header = False
2450
  else:
2451
  line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2452
+
2453
  if line_is_header:
2454
  header_font_size = max(span["size"] for span in spans)
2455
  is_probably_real_header = (
 
2457
  is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2458
  len(line_text.strip()) > 2
2459
  )
2460
+
2461
  if (norm_line != matched_header_line_norm and
2462
  norm_line != heading_norm and
2463
  is_probably_real_header):
 
2470
  bbox[3] = last_y1s.get(page_num, bbox[3])
2471
  page_highlights[page_num] = bbox
2472
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2473
+
2474
  break_collecting = True
2475
  break
2476
+ # ... (this is the end of your 'while' loop, after line 508)
2477
+
2478
+ # --- ADD THIS BLOCK (after 'while' loop) ---
2479
+ # This breaks out of the 'for block...' loop
2480
+ # if skip_page_due_to_footer or break_collecting:
2481
+ # break
2482
+
2483
+ # --- End of 'for block' loop ---
2484
+
2485
+ # --- ADD THIS BLOCK (after 'for block...' loop) ---
2486
+ # # This skips to the next page
2487
+ # if skip_page_due_to_footer:
2488
+ # continue
2489
+
2490
+ # This (existing) check stops collecting for this header
2491
  if break_collecting:
2492
  break
2493
+ # ...
2494
+
2495
+ # if line_text.lower() != heading_norm.lower():
2496
+ # print('checkk',line_text,heading_norm)
2497
+ # collected_lines.append(line_text)
2498
+ # collected_lines.append(line_text)
2499
  valid_spans = [span for span in spans if span.get("bbox")]
2500
  if valid_spans:
2501
  x0s = [span["bbox"][0] for span in valid_spans]
2502
  x1s = [span["bbox"][2] for span in valid_spans]
2503
  y0s = [span["bbox"][1] for span in valid_spans]
2504
  y1s = [span["bbox"][3] for span in valid_spans]
2505
+
2506
  line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2507
+
2508
  if page_num in current_bbox:
2509
  cb = current_bbox[page_num]
2510
  current_bbox[page_num] = [
 
2515
  ]
2516
  else:
2517
  current_bbox[page_num] = line_bbox
2518
+
2519
  last_y1s[page_num] = line_bbox[3]
2520
  i += 1
2521
+ if skip_page_due_to_footer:
2522
+ break # This breaks the 'for block...' loop
2523
+ if skip_page_due_to_footer:
2524
+ continue # This skips to the next 'for page_num...'
2525
  if not done:
2526
  for page_num, bbox in current_bbox.items():
2527
  bbox[3] = last_y1s.get(page_num, bbox[3])