Marthee committed on
Commit
052564a
·
verified ·
1 Parent(s): cc0ae3f

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +88 -216
InitialMarkups.py CHANGED
@@ -13,7 +13,7 @@ tobebilledonlyLink='https://adr.trevorsadd.co.uk/api/view-pdf-tobebilled?'
13
 
14
 
15
 
16
- import datefinder
17
  from urllib.parse import urlparse, unquote
18
  import os
19
  from io import BytesIO
@@ -36,9 +36,8 @@ import tsadropboxretrieval
36
 
37
  import urllib.parse
38
 
39
-
40
-
41
- import urllib.parse
42
 
43
  def changepdflinks(json_data, pdf_path):
44
  print('ll , ' ,json_data,pdf_path)
@@ -74,7 +73,7 @@ def get_toc_page_numbers(doc, max_pages_to_check=15):
74
  # 2. NEW: Title Pattern (looking for specific headers)
75
  # ^ and $ ensure the line is JUST that word (ignoring "The contents of the bag...")
76
  # re.IGNORECASE makes it match "CONTENTS", "Contents", "Index", etc.
77
- title_pattern = re.compile(r"^\s*(table of contents|contents|index|content)\s*$", re.IGNORECASE)
78
 
79
  for page_num in range(min(len(doc), max_pages_to_check)):
80
  page = doc.load_page(page_num)
@@ -108,13 +107,13 @@ def get_toc_page_numbers(doc, max_pages_to_check=15):
108
  # If we found TOC pages (e.g., [2, 3]), we return [0, 1, 2, 3]
109
  # This covers the cover page, inside cover, and the TOC itself.
110
  if toc_pages:
111
- print('toccc',toc_pages)
112
  last_toc_page = toc_pages[0]
113
  return list(range(0, last_toc_page + 1))
114
 
115
  return [] # Return empty list if nothing found
116
 
117
 
 
118
  def get_regular_font_size_and_color(doc):
119
  font_sizes = []
120
  colors = []
@@ -311,7 +310,7 @@ def clean_toc_entry(toc_text):
311
  # Remove everything after last sequence of dots/whitespace followed by digits
312
  return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
313
 
314
- def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
315
  # Extract headers with margin handling
316
  headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
317
  doc,
@@ -690,8 +689,6 @@ def same_start_word(s1, s2):
690
  def extract_section_under_header(multiplePDF_Paths):
691
  filenames=[]
692
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
693
- top_margin = 70
694
- bottom_margin = 50
695
  arrayofPDFS=multiplePDF_Paths.split(',')
696
  print(multiplePDF_Paths)
697
  print(arrayofPDFS,len(arrayofPDFS))
@@ -724,8 +721,7 @@ def extract_section_under_header(multiplePDF_Paths):
724
  dot_pattern = re.compile(r'\.{3,}')
725
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
726
 
727
-
728
-
729
  toc_pages = get_toc_page_numbers(doc)
730
 
731
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
@@ -1127,10 +1123,6 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1127
  alltextWithoutNotbilled=''
1128
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1129
 
1130
-
1131
-
1132
- top_margin = 70
1133
- bottom_margin = 50
1134
  headertoContinue1 = False
1135
  headertoContinue2=False
1136
 
@@ -1161,24 +1153,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1161
  dot_pattern = re.compile(r'\.{3,}')
1162
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
1163
 
1164
- def get_toc_page_numbers(doc, max_pages_to_check=15):
1165
- toc_pages = []
1166
- for page_num in range(min(len(doc), max_pages_to_check)):
1167
- page = doc.load_page(page_num)
1168
- blocks = page.get_text("dict")["blocks"]
1169
-
1170
- dot_line_count = 0
1171
- for block in blocks:
1172
- for line in block.get("lines", []):
1173
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
1174
- if dot_pattern.search(line_text):
1175
- dot_line_count += 1
1176
-
1177
- if dot_line_count >= 1:
1178
- toc_pages.append(page_num)
1179
-
1180
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1181
-
1182
  toc_pages = get_toc_page_numbers(doc)
1183
 
1184
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
@@ -1581,8 +1556,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1581
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1582
 
1583
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
1584
- top_margin = 70
1585
- bottom_margin = 50
1586
  headertoContinue1 = False
1587
  headertoContinue2=False
1588
  Alltexttobebilled=''
@@ -1608,24 +1582,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1608
  dot_pattern = re.compile(r'\.{3,}')
1609
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
1610
 
1611
- def get_toc_page_numbers(doc, max_pages_to_check=15):
1612
- toc_pages = []
1613
- for page_num in range(min(len(doc), max_pages_to_check)):
1614
- page = doc.load_page(page_num)
1615
- blocks = page.get_text("dict")["blocks"]
1616
-
1617
- dot_line_count = 0
1618
- for block in blocks:
1619
- for line in block.get("lines", []):
1620
- line_text = get_spaced_text_from_spans(line["spans"]).strip()
1621
- if dot_pattern.search(line_text):
1622
- dot_line_count += 1
1623
-
1624
- if dot_line_count >= 1:
1625
- toc_pages.append(page_num)
1626
-
1627
- return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1628
-
1629
  toc_pages = get_toc_page_numbers(doc)
1630
 
1631
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
@@ -2039,13 +1996,12 @@ def extract_section_under_header_tobebilled2(pdf_path):
2039
 
2040
 
2041
 
 
2042
  def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths):
2043
- baselink = "https://findconsole-initialmarkups.hf.space/view-pdf?"
2044
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
2045
  filenames=[]
2046
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
2047
- top_margin = 70
2048
- bottom_margin = 50
2049
  arrayofPDFS=multiplePDF_Paths.split(',')
2050
  print(multiplePDF_Paths)
2051
  print(arrayofPDFS)
@@ -2063,86 +2019,35 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2063
  # Optimized URL handling
2064
  if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2065
  pdf_path = pdf_path.replace('dl=0', 'dl=1')
2066
-
2067
  # Cache frequently used values
2068
  response = requests.get(pdf_path)
2069
  pdf_content = BytesIO(response.content)
2070
  if not pdf_content:
2071
  raise ValueError("No valid PDF content found.")
2072
-
2073
  doc = fitz.open(stream=pdf_content, filetype="pdf")
2074
  docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2075
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2076
-
2077
  # Precompute regex patterns
2078
  dot_pattern = re.compile(r'\.{3,}')
2079
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
 
2080
 
2081
-
2082
- def get_toc_page_numbers(doc, max_pages_to_check=15):
2083
- toc_pages = []
2084
-
2085
- # 1. Existing Dot Pattern (looking for ".....")
2086
- dot_pattern = re.compile(r"\.{2,}")
2087
-
2088
- # 2. NEW: Title Pattern (looking for specific headers)
2089
- # ^ and $ ensure the line is JUST that word (ignoring "The contents of the bag...")
2090
- # re.IGNORECASE makes it match "CONTENTS", "Contents", "Index", etc.
2091
- title_pattern = re.compile(r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE)
2092
-
2093
- for page_num in range(min(len(doc), max_pages_to_check)):
2094
- page = doc.load_page(page_num)
2095
- blocks = page.get_text("dict")["blocks"]
2096
- skip_page_due_to_footer = False
2097
- dot_line_count = 0
2098
- has_toc_title = False
2099
-
2100
- for block in blocks:
2101
- for line in block.get("lines", []):
2102
- # Extract text from spans (mimicking get_spaced_text_from_spans)
2103
- line_text = " ".join([span["text"] for span in line["spans"]]).strip()
2104
-
2105
- # CHECK A: Does the line have dots?
2106
- if dot_pattern.search(line_text):
2107
- dot_line_count += 1
2108
-
2109
- # CHECK B: Is this line a Title?
2110
- # We check this early in the loop. If a page has a title "Contents",
2111
- # we mark it immediately.
2112
- if title_pattern.match(line_text):
2113
- has_toc_title = True
2114
-
2115
- # CONDITION:
2116
- # It is a TOC page if it has a Title OR if it has dot leaders.
2117
- # We use 'dot_line_count >= 1' to be sensitive to single-item lists.
2118
- if has_toc_title or dot_line_count >= 1:
2119
- toc_pages.append(page_num)
2120
-
2121
- # RETURN:
2122
- # If we found TOC pages (e.g., [2, 3]), we return [0, 1, 2, 3]
2123
- # This covers the cover page, inside cover, and the TOC itself.
2124
- if toc_pages:
2125
- last_toc_page = toc_pages[-1]
2126
- return list(range(0, last_toc_page + 1))
2127
-
2128
- return [] # Return empty list if nothing found
2129
-
2130
- # Usage
2131
  toc_pages = get_toc_page_numbers(doc)
2132
-
2133
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2134
  doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2135
  )
2136
-
2137
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2138
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2139
- # print(listofHeaderstoMarkup)
2140
- for header, path in listofHeaderstoMarkup:
2141
- print(path)
2142
  # Precompute all children headers once
2143
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2144
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2145
-
2146
  # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2147
  dictionaryNBS={}
2148
  data_list_JSON = []
@@ -2154,16 +2059,16 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2154
  mainHeaderFontSize= top_3_font_sizes[0]
2155
  subHeaderFontSize= top_3_font_sizes[1]
2156
  subsubheaderFontSize= top_3_font_sizes[1]
2157
-
2158
-
2159
-
2160
  # Preload all pages to avoid repeated loading
2161
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2162
- skip_page_due_to_footer = False
2163
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
2164
  heading_to_search = heading_to_searchDict['text']
2165
  heading_to_searchPageNum = heading_to_searchDict['page']
2166
-
2167
  # Initialize variables
2168
  headertoContinue1 = False
2169
  headertoContinue2 = False
@@ -2181,48 +2086,45 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2181
  heading_norm = normalize_text(heading_to_search)
2182
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2183
  for page_num in range(heading_to_searchPageNum,len(doc)):
2184
- skip_page_due_to_footer = False
2185
  # print(heading_to_search)
2186
  if paths[0].strip().lower() != currentgroupname.strip().lower():
2187
  Alltexttobebilled+= paths[0] +'\n'
2188
  currentgroupname=paths[0]
2189
  # print(paths[0])
2190
-
2191
-
2192
  if page_num in toc_pages:
2193
  continue
2194
-
2195
  if break_collecting:
2196
  break
2197
  page=doc[page_num]
2198
  page_height = page.rect.height
2199
  blocks = page.get_text("dict")["blocks"]
2200
-
2201
  for block in blocks:
2202
  if break_collecting:
2203
  break
2204
-
2205
  lines = block.get("lines", [])
2206
  i = 0
2207
  while i < len(lines):
2208
  if break_collecting:
2209
  break
2210
-
2211
  spans = lines[i].get("spans", [])
2212
  if not spans:
2213
  i += 1
2214
  continue
2215
-
2216
  y0 = spans[0]["bbox"][1]
2217
  y1 = spans[0]["bbox"][3]
2218
  if y0 < top_margin or y1 > (page_height - bottom_margin):
2219
  i += 1
2220
  continue
2221
-
2222
  line_text = get_spaced_text_from_spans(spans).lower()
2223
  line_text_norm = normalize_text(line_text)
2224
-
2225
-
2226
  # Combine with next line if available
2227
  if i + 1 < len(lines):
2228
  next_spans = lines[i + 1].get("spans", [])
@@ -2230,14 +2132,16 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2230
  combined_line_norm = normalize_text(line_text + " " + next_line_text)
2231
  else:
2232
  combined_line_norm = line_text_norm
 
2233
  # Check if we should continue processing
2234
  if combined_line_norm and combined_line_norm in paths[0]:
 
2235
  headertoContinue1 = combined_line_norm
2236
  if combined_line_norm and combined_line_norm in paths[-2]:
 
2237
  headertoContinue2 = combined_line_norm
2238
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2239
  last_path = paths[-2].lower()
2240
-
2241
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
2242
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() or 'workmanship' in paths[-2].lower() or 'testing' in paths[-2].lower() or 'labeling' in paths[-2].lower():
2243
  if any(keyword in last_path for keyword in keywords):
@@ -2248,18 +2152,18 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2248
  # Alltexttobebilled+= combined_line_norm #################################################
2249
  if matched_header_line_norm in combined_line_norm:
2250
  Alltexttobebilled+='\n'
2251
- Alltexttobebilled+= ' '+combined_line_norm
2252
  # Optimized header matching
2253
  existsfull = (
2254
  ( combined_line_norm in allchildrenheaders_set or
2255
  combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2256
  )
2257
-
2258
  # New word-based matching
2259
  current_line_words = set(combined_line_norm.split())
2260
  heading_words = set(heading_norm.split())
2261
  all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2262
-
2263
  substring_match = (
2264
  heading_norm in combined_line_norm or
2265
  combined_line_norm in heading_norm or
@@ -2269,10 +2173,10 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2269
  # heading_norm in combined_line_norm or
2270
  # combined_line_norm in heading_norm
2271
  # )
2272
-
2273
  if (substring_match and existsfull and not collecting and
2274
  len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2275
-
2276
  # Check header conditions more efficiently
2277
  header_spans = [
2278
  span for span in spans
@@ -2280,23 +2184,23 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2280
  # and span['size'] >= subsubheaderFontSize
2281
  and span['size'] < mainHeaderFontSize)
2282
  ]
2283
- if header_spans and stringtowrite.startswith('To'):
2284
  collecting = True
2285
  # if stringtowrite=='To be billed':
2286
  # Alltexttobebilled+='\n'
2287
  matched_header_font_size = max(span["size"] for span in header_spans)
2288
-
2289
- collected_lines.append(line_text)
2290
  valid_spans = [span for span in spans if span.get("bbox")]
2291
-
2292
  if valid_spans:
2293
  x0s = [span["bbox"][0] for span in valid_spans]
2294
  x1s = [span["bbox"][2] for span in valid_spans]
2295
  y0s = [span["bbox"][1] for span in valid_spans]
2296
  y1s = [span["bbox"][3] for span in valid_spans]
2297
-
2298
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2299
-
2300
  if page_num in current_bbox:
2301
  cb = current_bbox[page_num]
2302
  current_bbox[page_num] = [
@@ -2309,36 +2213,36 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2309
  current_bbox[page_num] = header_bbox
2310
  last_y1s[page_num] = header_bbox[3]
2311
  x0, y0, x1, y1 = header_bbox
2312
-
2313
  zoom = 200
2314
  left = int(x0)
2315
  top = int(y0)
2316
  zoom_str = f"{zoom},{left},{top}"
2317
  pageNumberFound = page_num + 1
2318
-
2319
  # Build the query parameters
2320
  params = {
2321
  'pdfLink': pdf_path, # Your PDF link
2322
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2323
  }
2324
-
2325
  # URL encode each parameter
2326
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2327
-
2328
  # Construct the final encoded link
2329
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2330
-
2331
  # Correctly construct the final URL with page and zoom
2332
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2333
-
2334
  # Get current date and time
2335
  now = datetime.now()
2336
-
2337
  # Format the output
2338
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2339
  # Optionally, add the URL to a DataFrame
2340
-
2341
-
2342
  data_entry = {
2343
  "PDF Name":filename,
2344
  "NBSLink": zoom_str,
@@ -2354,23 +2258,23 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2354
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2355
  }
2356
  data_list_JSON.append(data_entry)
2357
-
2358
  # Convert list to JSON
2359
  # json_output = [data_list_JSON]
2360
  # json_output = json.dumps(data_list_JSON, indent=4)
2361
-
2362
  i += 2
2363
  continue
2364
  else:
2365
  if (substring_match and not collecting and
2366
  len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2367
-
2368
  # Calculate word match percentage
2369
  word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2370
-
2371
  # Check if at least 70% of header words exist in this line
2372
  meets_word_threshold = word_match_percent >= 100
2373
-
2374
  # Check header conditions (including word threshold)
2375
  header_spans = [
2376
  span for span in spans
@@ -2378,7 +2282,7 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2378
  # and span['size'] >= subsubheaderFontSize
2379
  and span['size'] < mainHeaderFontSize)
2380
  ]
2381
-
2382
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2383
  collecting = True
2384
  if stringtowrite=='To be billed':
@@ -2386,18 +2290,18 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2386
  # if stringtowrite=='To be billed':
2387
  # Alltexttobebilled+= ' '+ combined_line_norm
2388
  matched_header_font_size = max(span["size"] for span in header_spans)
2389
- # if normalize_text(line_text)!=heading_norm:
2390
  collected_lines.append(line_text)
2391
  valid_spans = [span for span in spans if span.get("bbox")]
2392
-
2393
  if valid_spans:
2394
  x0s = [span["bbox"][0] for span in valid_spans]
2395
  x1s = [span["bbox"][2] for span in valid_spans]
2396
  y0s = [span["bbox"][1] for span in valid_spans]
2397
  y1s = [span["bbox"][3] for span in valid_spans]
2398
-
2399
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2400
-
2401
  if page_num in current_bbox:
2402
  cb = current_bbox[page_num]
2403
  current_bbox[page_num] = [
@@ -2408,7 +2312,7 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2408
  ]
2409
  else:
2410
  current_bbox[page_num] = header_bbox
2411
-
2412
  last_y1s[page_num] = header_bbox[3]
2413
  x0, y0, x1, y1 = header_bbox
2414
  zoom = 200
@@ -2416,30 +2320,30 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2416
  top = int(y0)
2417
  zoom_str = f"{zoom},{left},{top}"
2418
  pageNumberFound = page_num + 1
2419
-
2420
  # Build the query parameters
2421
  params = {
2422
  'pdfLink': pdf_path, # Your PDF link
2423
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2424
  }
2425
-
2426
  # URL encode each parameter
2427
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2428
-
2429
  # Construct the final encoded link
2430
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2431
-
2432
  # Correctly construct the final URL with page and zoom
2433
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2434
-
2435
  # Get current date and time
2436
  now = datetime.now()
2437
-
2438
  # Format the output
2439
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2440
  # Optionally, add the URL to a DataFrame
2441
-
2442
-
2443
  data_entry = {
2444
  "PDF Name":filename,
2445
  "NBSLink": zoom_str,
@@ -2455,34 +2359,23 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2455
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2456
  }
2457
  data_list_JSON.append(data_entry)
2458
-
2459
  # Convert list to JSON
2460
  # json_output = [data_list_JSON]
2461
  # json_output = json.dumps(data_list_JSON, indent=4)
2462
-
2463
-
2464
  i += 2
2465
  continue
2466
  if collecting:
2467
  norm_line = normalize_text(line_text)
2468
- # ...
2469
- matches = list(datefinder.find_dates(line_text_norm))
2470
-
2471
- if matches: # If the list is not empty, a date was found
2472
- # --- MODIFY THIS BLOCK ---
2473
- print(f"Skipping rest of page {page_num} due to date: {line_text_norm}") # Optional
2474
- skip_page_due_to_footer = True
2475
- i += 1
2476
- break
2477
- # --- END OF CHANGE ---
2478
- # ...
2479
- print(norm_line)
2480
  # Optimized URL check
2481
  if url_pattern.match(norm_line):
2482
  line_is_header = False
2483
  else:
2484
  line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2485
-
2486
  if line_is_header:
2487
  header_font_size = max(span["size"] for span in spans)
2488
  is_probably_real_header = (
@@ -2490,7 +2383,7 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2490
  is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2491
  len(line_text.strip()) > 2
2492
  )
2493
-
2494
  if (norm_line != matched_header_line_norm and
2495
  norm_line != heading_norm and
2496
  is_probably_real_header):
@@ -2503,31 +2396,13 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2503
  bbox[3] = last_y1s.get(page_num, bbox[3])
2504
  page_highlights[page_num] = bbox
2505
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2506
-
2507
  break_collecting = True
2508
  break
2509
- # ... (this is the end of your 'while' loop, after line 508)
2510
-
2511
- # --- ADD THIS BLOCK (after 'while' loop) ---
2512
- # This breaks out of the 'for block...' loop
2513
- # if skip_page_due_to_footer or break_collecting:
2514
- # break
2515
-
2516
- # --- End of 'for block' loop ---
2517
-
2518
- # --- ADD THIS BLOCK (after 'for block...' loop) ---
2519
- # # This skips to the next page
2520
- # if skip_page_due_to_footer:
2521
- # continue
2522
-
2523
- # This (existing) check stops collecting for this header
2524
  if break_collecting:
2525
  break
2526
- # ...
2527
-
2528
- # if line_text.lower() != heading_norm.lower():
2529
- # print('checkk',line_text,heading_norm)
2530
- # collected_lines.append(line_text)
2531
  collected_lines.append(line_text)
2532
  valid_spans = [span for span in spans if span.get("bbox")]
2533
  if valid_spans:
@@ -2535,9 +2410,9 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2535
  x1s = [span["bbox"][2] for span in valid_spans]
2536
  y0s = [span["bbox"][1] for span in valid_spans]
2537
  y1s = [span["bbox"][3] for span in valid_spans]
2538
-
2539
  line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2540
-
2541
  if page_num in current_bbox:
2542
  cb = current_bbox[page_num]
2543
  current_bbox[page_num] = [
@@ -2548,13 +2423,10 @@ def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths)
2548
  ]
2549
  else:
2550
  current_bbox[page_num] = line_bbox
2551
-
2552
  last_y1s[page_num] = line_bbox[3]
2553
  i += 1
2554
- if skip_page_due_to_footer:
2555
- break # This breaks the 'for block...' loop
2556
- if skip_page_due_to_footer:
2557
- continue # This skips to the next 'for page_num...'
2558
  if not done:
2559
  for page_num, bbox in current_bbox.items():
2560
  bbox[3] = last_y1s.get(page_num, bbox[3])
 
13
 
14
 
15
 
16
+
17
  from urllib.parse import urlparse, unquote
18
  import os
19
  from io import BytesIO
 
36
 
37
  import urllib.parse
38
 
39
+ top_margin = 70
40
+ bottom_margin = 85
 
41
 
42
  def changepdflinks(json_data, pdf_path):
43
  print('ll , ' ,json_data,pdf_path)
 
73
  # 2. NEW: Title Pattern (looking for specific headers)
74
  # ^ and $ ensure the line is JUST that word (ignoring "The contents of the bag...")
75
  # re.IGNORECASE makes it match "CONTENTS", "Contents", "Index", etc.
76
+ title_pattern = re.compile(r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE)
77
 
78
  for page_num in range(min(len(doc), max_pages_to_check)):
79
  page = doc.load_page(page_num)
 
107
  # If we found TOC pages (e.g., [2, 3]), we return [0, 1, 2, 3]
108
  # This covers the cover page, inside cover, and the TOC itself.
109
  if toc_pages:
 
110
  last_toc_page = toc_pages[0]
111
  return list(range(0, last_toc_page + 1))
112
 
113
  return [] # Return empty list if nothing found
114
 
115
 
116
+
117
  def get_regular_font_size_and_color(doc):
118
  font_sizes = []
119
  colors = []
 
310
  # Remove everything after last sequence of dots/whitespace followed by digits
311
  return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
312
 
313
+ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=85):
314
  # Extract headers with margin handling
315
  headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
316
  doc,
 
689
  def extract_section_under_header(multiplePDF_Paths):
690
  filenames=[]
691
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
 
 
692
  arrayofPDFS=multiplePDF_Paths.split(',')
693
  print(multiplePDF_Paths)
694
  print(arrayofPDFS,len(arrayofPDFS))
 
721
  dot_pattern = re.compile(r'\.{3,}')
722
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
723
 
724
+
 
725
  toc_pages = get_toc_page_numbers(doc)
726
 
727
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
 
1123
  alltextWithoutNotbilled=''
1124
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1125
 
 
 
 
 
1126
  headertoContinue1 = False
1127
  headertoContinue2=False
1128
 
 
1153
  dot_pattern = re.compile(r'\.{3,}')
1154
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
1155
 
1156
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1157
  toc_pages = get_toc_page_numbers(doc)
1158
 
1159
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
 
1556
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1557
 
1558
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
1559
+
 
1560
  headertoContinue1 = False
1561
  headertoContinue2=False
1562
  Alltexttobebilled=''
 
1582
  dot_pattern = re.compile(r'\.{3,}')
1583
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
1584
 
1585
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1586
  toc_pages = get_toc_page_numbers(doc)
1587
 
1588
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
 
1996
 
1997
 
1998
 
1999
+
2000
  def extract_section_under_header_tobebilledMultiplePDFSmarthe(multiplePDF_Paths):
 
2001
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
2002
  filenames=[]
2003
  keywords = {'installation', 'execution', 'miscellaneous items', 'workmanship', 'testing', 'labeling'}
2004
+
 
2005
  arrayofPDFS=multiplePDF_Paths.split(',')
2006
  print(multiplePDF_Paths)
2007
  print(arrayofPDFS)
 
2019
  # Optimized URL handling
2020
  if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2021
  pdf_path = pdf_path.replace('dl=0', 'dl=1')
2022
+
2023
  # Cache frequently used values
2024
  response = requests.get(pdf_path)
2025
  pdf_content = BytesIO(response.content)
2026
  if not pdf_content:
2027
  raise ValueError("No valid PDF content found.")
2028
+
2029
  doc = fitz.open(stream=pdf_content, filetype="pdf")
2030
  docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2031
  most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2032
+
2033
  # Precompute regex patterns
2034
  dot_pattern = re.compile(r'\.{3,}')
2035
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
2036
+
2037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2038
  toc_pages = get_toc_page_numbers(doc)
2039
+
2040
  headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2041
  doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2042
  )
2043
+
2044
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2045
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2046
+
 
 
2047
  # Precompute all children headers once
2048
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2049
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2050
+
2051
  # df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2","BodyText"])
2052
  dictionaryNBS={}
2053
  data_list_JSON = []
 
2059
  mainHeaderFontSize= top_3_font_sizes[0]
2060
  subHeaderFontSize= top_3_font_sizes[1]
2061
  subsubheaderFontSize= top_3_font_sizes[1]
2062
+
2063
+
2064
+
2065
  # Preload all pages to avoid repeated loading
2066
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2067
+
2068
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
2069
  heading_to_search = heading_to_searchDict['text']
2070
  heading_to_searchPageNum = heading_to_searchDict['page']
2071
+
2072
  # Initialize variables
2073
  headertoContinue1 = False
2074
  headertoContinue2 = False
 
2086
  heading_norm = normalize_text(heading_to_search)
2087
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2088
  for page_num in range(heading_to_searchPageNum,len(doc)):
 
2089
  # print(heading_to_search)
2090
  if paths[0].strip().lower() != currentgroupname.strip().lower():
2091
  Alltexttobebilled+= paths[0] +'\n'
2092
  currentgroupname=paths[0]
2093
  # print(paths[0])
2094
+
2095
+
2096
  if page_num in toc_pages:
2097
  continue
 
2098
  if break_collecting:
2099
  break
2100
  page=doc[page_num]
2101
  page_height = page.rect.height
2102
  blocks = page.get_text("dict")["blocks"]
2103
+
2104
  for block in blocks:
2105
  if break_collecting:
2106
  break
2107
+
2108
  lines = block.get("lines", [])
2109
  i = 0
2110
  while i < len(lines):
2111
  if break_collecting:
2112
  break
2113
+
2114
  spans = lines[i].get("spans", [])
2115
  if not spans:
2116
  i += 1
2117
  continue
2118
+
2119
  y0 = spans[0]["bbox"][1]
2120
  y1 = spans[0]["bbox"][3]
2121
  if y0 < top_margin or y1 > (page_height - bottom_margin):
2122
  i += 1
2123
  continue
2124
+
2125
  line_text = get_spaced_text_from_spans(spans).lower()
2126
  line_text_norm = normalize_text(line_text)
2127
+
 
2128
  # Combine with next line if available
2129
  if i + 1 < len(lines):
2130
  next_spans = lines[i + 1].get("spans", [])
 
2132
  combined_line_norm = normalize_text(line_text + " " + next_line_text)
2133
  else:
2134
  combined_line_norm = line_text_norm
2135
+
2136
  # Check if we should continue processing
2137
  if combined_line_norm and combined_line_norm in paths[0]:
2138
+
2139
  headertoContinue1 = combined_line_norm
2140
  if combined_line_norm and combined_line_norm in paths[-2]:
2141
+
2142
  headertoContinue2 = combined_line_norm
2143
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2144
  last_path = paths[-2].lower()
 
2145
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
2146
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() or 'workmanship' in paths[-2].lower() or 'testing' in paths[-2].lower() or 'labeling' in paths[-2].lower():
2147
  if any(keyword in last_path for keyword in keywords):
 
2152
  # Alltexttobebilled+= combined_line_norm #################################################
2153
  if matched_header_line_norm in combined_line_norm:
2154
  Alltexttobebilled+='\n'
2155
+ Alltexttobebilled+= ' '+combined_line_norm
2156
  # Optimized header matching
2157
  existsfull = (
2158
  ( combined_line_norm in allchildrenheaders_set or
2159
  combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2160
  )
2161
+
2162
  # New word-based matching
2163
  current_line_words = set(combined_line_norm.split())
2164
  heading_words = set(heading_norm.split())
2165
  all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2166
+
2167
  substring_match = (
2168
  heading_norm in combined_line_norm or
2169
  combined_line_norm in heading_norm or
 
2173
  # heading_norm in combined_line_norm or
2174
  # combined_line_norm in heading_norm
2175
  # )
2176
+
2177
  if (substring_match and existsfull and not collecting and
2178
  len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2179
+
2180
  # Check header conditions more efficiently
2181
  header_spans = [
2182
  span for span in spans
 
2184
  # and span['size'] >= subsubheaderFontSize
2185
  and span['size'] < mainHeaderFontSize)
2186
  ]
2187
+ if header_spans and stringtowrite.startswith('To') and is_numbered(heading_to_search):
2188
  collecting = True
2189
  # if stringtowrite=='To be billed':
2190
  # Alltexttobebilled+='\n'
2191
  matched_header_font_size = max(span["size"] for span in header_spans)
2192
+
2193
+ # collected_lines.append(line_text)
2194
  valid_spans = [span for span in spans if span.get("bbox")]
2195
+
2196
  if valid_spans:
2197
  x0s = [span["bbox"][0] for span in valid_spans]
2198
  x1s = [span["bbox"][2] for span in valid_spans]
2199
  y0s = [span["bbox"][1] for span in valid_spans]
2200
  y1s = [span["bbox"][3] for span in valid_spans]
2201
+
2202
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2203
+
2204
  if page_num in current_bbox:
2205
  cb = current_bbox[page_num]
2206
  current_bbox[page_num] = [
 
2213
  current_bbox[page_num] = header_bbox
2214
  last_y1s[page_num] = header_bbox[3]
2215
  x0, y0, x1, y1 = header_bbox
2216
+
2217
  zoom = 200
2218
  left = int(x0)
2219
  top = int(y0)
2220
  zoom_str = f"{zoom},{left},{top}"
2221
  pageNumberFound = page_num + 1
2222
+
2223
  # Build the query parameters
2224
  params = {
2225
  'pdfLink': pdf_path, # Your PDF link
2226
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2227
  }
2228
+
2229
  # URL encode each parameter
2230
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2231
+
2232
  # Construct the final encoded link
2233
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2234
+
2235
  # Correctly construct the final URL with page and zoom
2236
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2237
+
2238
  # Get current date and time
2239
  now = datetime.now()
2240
+
2241
  # Format the output
2242
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2243
  # Optionally, add the URL to a DataFrame
2244
+
2245
+
2246
  data_entry = {
2247
  "PDF Name":filename,
2248
  "NBSLink": zoom_str,
 
2258
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2259
  }
2260
  data_list_JSON.append(data_entry)
2261
+
2262
  # Convert list to JSON
2263
  # json_output = [data_list_JSON]
2264
  # json_output = json.dumps(data_list_JSON, indent=4)
2265
+
2266
  i += 2
2267
  continue
2268
  else:
2269
  if (substring_match and not collecting and
2270
  len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2271
+
2272
  # Calculate word match percentage
2273
  word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2274
+
2275
  # Check that the header/line word-match ratio reaches 100%
2276
  meets_word_threshold = word_match_percent >= 100
2277
+
2278
  # Check header conditions (including word threshold)
2279
  header_spans = [
2280
  span for span in spans
 
2282
  # and span['size'] >= subsubheaderFontSize
2283
  and span['size'] < mainHeaderFontSize)
2284
  ]
2285
+
2286
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2287
  collecting = True
2288
  if stringtowrite=='To be billed':
 
2290
  # if stringtowrite=='To be billed':
2291
  # Alltexttobebilled+= ' '+ combined_line_norm
2292
  matched_header_font_size = max(span["size"] for span in header_spans)
2293
+
2294
  collected_lines.append(line_text)
2295
  valid_spans = [span for span in spans if span.get("bbox")]
2296
+
2297
  if valid_spans:
2298
  x0s = [span["bbox"][0] for span in valid_spans]
2299
  x1s = [span["bbox"][2] for span in valid_spans]
2300
  y0s = [span["bbox"][1] for span in valid_spans]
2301
  y1s = [span["bbox"][3] for span in valid_spans]
2302
+
2303
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2304
+
2305
  if page_num in current_bbox:
2306
  cb = current_bbox[page_num]
2307
  current_bbox[page_num] = [
 
2312
  ]
2313
  else:
2314
  current_bbox[page_num] = header_bbox
2315
+
2316
  last_y1s[page_num] = header_bbox[3]
2317
  x0, y0, x1, y1 = header_bbox
2318
  zoom = 200
 
2320
  top = int(y0)
2321
  zoom_str = f"{zoom},{left},{top}"
2322
  pageNumberFound = page_num + 1
2323
+
2324
  # Build the query parameters
2325
  params = {
2326
  'pdfLink': pdf_path, # Your PDF link
2327
  'keyword': heading_to_search, # Your keyword (could be a string or list)
2328
  }
2329
+
2330
  # URL encode each parameter
2331
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2332
+
2333
  # Construct the final encoded link
2334
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2335
+
2336
  # Correctly construct the final URL with page and zoom
2337
  final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2338
+
2339
  # Get current date and time
2340
  now = datetime.now()
2341
+
2342
  # Format the output
2343
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2344
  # Optionally, add the URL to a DataFrame
2345
+
2346
+
2347
  data_entry = {
2348
  "PDF Name":filename,
2349
  "NBSLink": zoom_str,
 
2359
  "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2360
  }
2361
  data_list_JSON.append(data_entry)
2362
+
2363
  # Convert list to JSON
2364
  # json_output = [data_list_JSON]
2365
  # json_output = json.dumps(data_list_JSON, indent=4)
2366
+
2367
+
2368
  i += 2
2369
  continue
2370
  if collecting:
2371
  norm_line = normalize_text(line_text)
2372
+
 
 
 
 
 
 
 
 
 
 
 
2373
  # Optimized URL check
2374
  if url_pattern.match(norm_line):
2375
  line_is_header = False
2376
  else:
2377
  line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2378
+
2379
  if line_is_header:
2380
  header_font_size = max(span["size"] for span in spans)
2381
  is_probably_real_header = (
 
2383
  is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2384
  len(line_text.strip()) > 2
2385
  )
2386
+
2387
  if (norm_line != matched_header_line_norm and
2388
  norm_line != heading_norm and
2389
  is_probably_real_header):
 
2396
  bbox[3] = last_y1s.get(page_num, bbox[3])
2397
  page_highlights[page_num] = bbox
2398
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2399
+
2400
  break_collecting = True
2401
  break
2402
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2403
  if break_collecting:
2404
  break
2405
+
 
 
 
 
2406
  collected_lines.append(line_text)
2407
  valid_spans = [span for span in spans if span.get("bbox")]
2408
  if valid_spans:
 
2410
  x1s = [span["bbox"][2] for span in valid_spans]
2411
  y0s = [span["bbox"][1] for span in valid_spans]
2412
  y1s = [span["bbox"][3] for span in valid_spans]
2413
+
2414
  line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2415
+
2416
  if page_num in current_bbox:
2417
  cb = current_bbox[page_num]
2418
  current_bbox[page_num] = [
 
2423
  ]
2424
  else:
2425
  current_bbox[page_num] = line_bbox
2426
+
2427
  last_y1s[page_num] = line_bbox[3]
2428
  i += 1
2429
+
 
 
 
2430
  if not done:
2431
  for page_num, bbox in current_bbox.items():
2432
  bbox[3] = last_y1s.get(page_num, bbox[3])