Marthee committed on
Commit fbfef95 · verified · 1 Parent(s): d59358e

Update InitialMarkups.py

Files changed (1)
  1. InitialMarkups.py +1271 -1
InitialMarkups.py CHANGED
@@ -1055,6 +1055,7 @@ def extract_section_under_header(pdf_path):
 
 
 def extract_section_under_header_tobebilledOnly(pdf_path):
+ Alltext_Tobebilled=''
 top_margin = 70
 bottom_margin = 50
 headertoContinue1 = False
@@ -1236,7 +1237,1275 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
 and span['size'] < mainHeaderFontSize)
 ]
 if header_spans and stringtowrite.startswith('To'):
 
 collecting = True
 matched_header_font_size = max(span["size"] for span in header_spans)
 print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
@@ -1333,6 +2602,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
 ]
 
 if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
+ Alltext_tobebilled+=combined_line_norm
 collecting = True
 matched_header_font_size = max(span["size"] for span in header_spans)
 print(f"📥 Start collecting after header: {combined_line_norm} "
@@ -1487,7 +2757,7 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
 pdf_bytes = BytesIO()
 docHighlights.save(pdf_bytes)
 print('JSONN',json_output)
- return pdf_bytes.getvalue(), docHighlights , json_output
+ return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
 
 
 
 
1055
 
1056
 
1057
  def extract_section_under_header_tobebilledOnly(pdf_path):
1058
+ Alltext_Tobebilled=''
1059
  top_margin = 70
1060
  bottom_margin = 50
1061
  headertoContinue1 = False
 
1237
  and span['size'] < mainHeaderFontSize)
1238
  ]
1239
  if header_spans and stringtowrite.startswith('To'):
1240
+ Alltext_Tobebilled+=combined_line_norm
1241
+ collecting = True
1242
+ matched_header_font_size = max(span["size"] for span in header_spans)
1243
+ print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
1244
+
1245
+ collected_lines.append(line_text)
1246
+ valid_spans = [span for span in spans if span.get("bbox")]
1247
+
1248
+ if valid_spans:
1249
+ x0s = [span["bbox"][0] for span in valid_spans]
1250
+ x1s = [span["bbox"][2] for span in valid_spans]
1251
+ y0s = [span["bbox"][1] for span in valid_spans]
1252
+ y1s = [span["bbox"][3] for span in valid_spans]
1253
+
1254
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1255
+
1256
+ if page_num in current_bbox:
1257
+ cb = current_bbox[page_num]
1258
+ current_bbox[page_num] = [
1259
+ min(cb[0], header_bbox[0]),
1260
+ min(cb[1], header_bbox[1]),
1261
+ max(cb[2], header_bbox[2]),
1262
+ max(cb[3], header_bbox[3])
1263
+ ]
1264
+ else:
1265
+ current_bbox[page_num] = header_bbox
1266
+ last_y1s[page_num] = header_bbox[3]
1267
+ x0, y0, x1, y1 = header_bbox
1268
+
1269
+ zoom = 200
1270
+ left = int(x0)
1271
+ top = int(y0)
1272
+ zoom_str = f"{zoom},{left},{top}"
1273
+ pageNumberFound = page_num + 1
1274
+
1275
+ # Build the query parameters
1276
+ params = {
1277
+ 'pdfLink': pdf_path, # Your PDF link
1278
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
1279
+ }
1280
+
1281
+ # URL encode each parameter
1282
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1283
+
1284
+ # Construct the final encoded link
1285
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1286
+
1287
+ # Correctly construct the final URL with page and zoom
1288
+ final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1289
+
1290
+ # Get current date and time
1291
+ now = datetime.now()
1292
+
1293
+ # Format the output
1294
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1295
+ # Optionally, add the URL to a DataFrame
1296
+
1297
+
1298
+ data_entry = {
1299
+ "NBSLink": final_url,
1300
+ "Subject": heading_to_search,
1301
+ "Page": str(pageNumberFound),
1302
+ "Author": "ADR",
1303
+ "Creation Date": formatted_time,
1304
+ "Layer": "Initial",
1305
+ "Code": stringtowrite,
1306
+ "head above 1": paths[-2],
1307
+ "head above 2": paths[0],
1308
+ "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1309
+ }
1310
+ data_list_JSON.append(data_entry)
1311
+
1312
+ # Convert list to JSON
1313
+ json_output = json.dumps(data_list_JSON, indent=4)
1314
+
1315
+ print("Final URL:", final_url)
1316
+ i += 2
1317
+ continue
1318
+ else:
1319
+ if (substring_match and not collecting and
1320
+ len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
1321
+
1322
+ # Calculate word match percentage
1323
+ word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
1324
+
1325
+ # Check if at least 70% of header words exist in this line
1326
+ meets_word_threshold = word_match_percent >= 100
1327
+
1328
+ # Check header conditions (including word threshold)
1329
+ header_spans = [
1330
+ span for span in spans
1331
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
1332
+ # and span['size'] >= subsubheaderFontSize
1333
+ and span['size'] < mainHeaderFontSize)
1334
+ ]
1335
+
1336
+ if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
1337
+ Alltext_Tobebilled+=combined_line_norm
1338
+ collecting = True
1339
+ matched_header_font_size = max(span["size"] for span in header_spans)
1340
+ print(f"📥 Start collecting after header: {combined_line_norm} "
1341
+ f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
1342
+
1343
+ collected_lines.append(line_text)
1344
+ valid_spans = [span for span in spans if span.get("bbox")]
1345
+
1346
+ if valid_spans:
1347
+ x0s = [span["bbox"][0] for span in valid_spans]
1348
+ x1s = [span["bbox"][2] for span in valid_spans]
1349
+ y0s = [span["bbox"][1] for span in valid_spans]
1350
+ y1s = [span["bbox"][3] for span in valid_spans]
1351
+
1352
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1353
+
1354
+ if page_num in current_bbox:
1355
+ cb = current_bbox[page_num]
1356
+ current_bbox[page_num] = [
1357
+ min(cb[0], header_bbox[0]),
1358
+ min(cb[1], header_bbox[1]),
1359
+ max(cb[2], header_bbox[2]),
1360
+ max(cb[3], header_bbox[3])
1361
+ ]
1362
+ else:
1363
+ current_bbox[page_num] = header_bbox
1364
+
1365
+ last_y1s[page_num] = header_bbox[3]
1366
+ x0, y0, x1, y1 = header_bbox
1367
+ zoom = 200
1368
+ left = int(x0)
1369
+ top = int(y0)
1370
+ zoom_str = f"{zoom},{left},{top}"
1371
+ pageNumberFound = page_num + 1
1372
+
1373
+ # Build the query parameters
1374
+ params = {
1375
+ 'pdfLink': pdf_path, # Your PDF link
1376
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
1377
+ }
1378
+
1379
+ # URL encode each parameter
1380
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1381
+
1382
+ # Construct the final encoded link
1383
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1384
+
1385
+ # Correctly construct the final URL with page and zoom
1386
+ final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1387
+
1388
+ # Get current date and time
1389
+ now = datetime.now()
1390
+
1391
+ # Format the output
1392
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1393
+ # Optionally, add the URL to a DataFrame
1394
+
1395
+
1396
+ data_entry = {
1397
+ "NBSLink": final_url,
1398
+ "Subject": heading_to_search,
1399
+ "Page": str(pageNumberFound),
1400
+ "Author": "ADR",
1401
+ "Creation Date": formatted_time,
1402
+ "Layer": "Initial",
1403
+ "Code": stringtowrite,
1404
+ "head above 1": paths[-2],
1405
+ "head above 2": paths[0],
1406
+ "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
1407
+ }
1408
+ data_list_JSON.append(data_entry)
1409
+
1410
+ # Convert list to JSON
1411
+ json_output = json.dumps(data_list_JSON, indent=4)
1412
+
1413
+ print("Final URL:", final_url)
1414
+ i += 2
1415
+ continue
1416
+ if collecting:
1417
+ norm_line = normalize_text(line_text)
1418
+
1419
+ # Optimized URL check
1420
+ if url_pattern.match(norm_line):
1421
+ line_is_header = False
1422
+ else:
1423
+ line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
1424
+
1425
+ if line_is_header:
1426
+ header_font_size = max(span["size"] for span in spans)
1427
+ is_probably_real_header = (
1428
+ header_font_size >= matched_header_font_size and
1429
+ is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
1430
+ len(line_text.strip()) > 2
1431
+ )
1432
+
1433
+ if (norm_line != matched_header_line_norm and
1434
+ norm_line != heading_norm and
1435
+ is_probably_real_header):
1436
+ if line_text not in heading_norm:
1437
+ print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
1438
+ collecting = False
1439
+ done = True
1440
+ headertoContinue1 = False
1441
+ headertoContinue2=False
1442
+ for page_num, bbox in current_bbox.items():
1443
+ bbox[3] = last_y1s.get(page_num, bbox[3])
1444
+ page_highlights[page_num] = bbox
1445
+ highlight_boxes(docHighlights, page_highlights,stringtowrite)
1446
+
1447
+ break_collecting = True
1448
+ break
1449
+
1450
+ if break_collecting:
1451
+ break
1452
+
1453
+ collected_lines.append(line_text)
1454
+ valid_spans = [span for span in spans if span.get("bbox")]
1455
+ if valid_spans:
1456
+ x0s = [span["bbox"][0] for span in valid_spans]
1457
+ x1s = [span["bbox"][2] for span in valid_spans]
1458
+ y0s = [span["bbox"][1] for span in valid_spans]
1459
+ y1s = [span["bbox"][3] for span in valid_spans]
1460
+
1461
+ line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1462
+
1463
+ if page_num in current_bbox:
1464
+ cb = current_bbox[page_num]
1465
+ current_bbox[page_num] = [
1466
+ min(cb[0], line_bbox[0]),
1467
+ min(cb[1], line_bbox[1]),
1468
+ max(cb[2], line_bbox[2]),
1469
+ max(cb[3], line_bbox[3])
1470
+ ]
1471
+ else:
1472
+ current_bbox[page_num] = line_bbox
1473
+
1474
+ last_y1s[page_num] = line_bbox[3]
1475
+ i += 1
1476
+
1477
+ if not done:
1478
+ for page_num, bbox in current_bbox.items():
1479
+ bbox[3] = last_y1s.get(page_num, bbox[3])
1480
+ page_highlights[page_num] = bbox
1481
+ if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1482
+ stringtowrite='Not to be billed'
1483
+ else:
1484
+ stringtowrite='To be billed'
1485
+ highlight_boxes(docHighlights, page_highlights,stringtowrite)
1486
+
1487
+ # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1488
+
1489
+ pdf_bytes = BytesIO()
1490
+ docHighlights.save(pdf_bytes)
1491
+ print('JSONN',json_output)
1492
+ return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_Tobebilled
1493
+
1494
+
1495
+
1496
+ ########################################################################################################################################################
1497
+ ########################################################################################################################################################
1498
+
1499
+ def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incomingheader=0):
1500
+ top_margin = 70
1501
+ bottom_margin = 50
1502
+ # Optimized URL handling
1503
+ if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
1504
+ pdf_path = pdf_path.replace('dl=0', 'dl=1')
1505
+
1506
+ # Cache frequently used values
1507
+ response = requests.get(pdf_path)
1508
+ pdf_content = BytesIO(response.content)
1509
+ if not pdf_content:
1510
+ raise ValueError("No valid PDF content found.")
1511
+
1512
+ doc = fitz.open(stream=pdf_content, filetype="pdf")
1513
+ docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
1514
+ most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1515
+
1516
+ # Precompute regex patterns
1517
+ dot_pattern = re.compile(r'\.{3,}')
1518
+ url_pattern = re.compile(r'https?://\S+|www\.\S+')
1519
+
1520
+ def get_toc_page_numbers(doc, max_pages_to_check=15):
1521
+ toc_pages = []
1522
+ for page_num in range(min(len(doc), max_pages_to_check)):
1523
+ page = doc.load_page(page_num)
1524
+ blocks = page.get_text("dict")["blocks"]
1525
+
1526
+ dot_line_count = 0
1527
+ for block in blocks:
1528
+ for line in block.get("lines", []):
1529
+ line_text = get_spaced_text_from_spans(line["spans"]).strip()
1530
+ if dot_pattern.search(line_text):
1531
+ dot_line_count += 1
1532
+
1533
+ if dot_line_count >= 3:
1534
+ toc_pages.append(page_num)
1535
+
1536
+ return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1537
+
1538
+ toc_pages = get_toc_page_numbers(doc)
1539
+
1540
+ headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
1541
+ doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1542
+ )
1543
+
1544
+ listofheadingsfromrawan=[]
1545
+ if type(headingjson) == str:
1546
+ listofheadingsfromrawan.append(headingjson)
1547
+ headingjson=[headingjson]
1548
+ else:
1549
+ for item in headingjson:
1550
+ listofheadingsfromrawan.append(normalize_text(item['Subject']))
1551
+ print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan)
1552
+ # Precompute all children headers once
1553
+ allchildrenheaders = listofheadingsfromrawan
1554
+ print('hereeeeeeeeeeeeeee00',allchildrenheaders)
1555
+ allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1556
+
1557
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1558
+ data_list_JSON = []
1559
+
1560
+ if len(top_3_font_sizes)==3:
1561
+ mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
1562
+ elif len(top_3_font_sizes)==2:
1563
+ mainHeaderFontSize= top_3_font_sizes[0]
1564
+ subHeaderFontSize= top_3_font_sizes[1]
1565
+ subsubheaderFontSize= top_3_font_sizes[1]
1566
+
1567
+ print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1568
+
1569
+ # Preload all pages to avoid repeated loading
1570
+ # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1571
+ newjsonList=[]
1572
+ for heading_to_searchDict in headingjson:
1573
+ if type(heading_to_searchDict) == str:
1574
+ heading_to_search = heading_to_searchDict
1575
+ heading_to_searchPageNum = pagenum
1576
+ else:
1577
+ heading_to_search = heading_to_searchDict['Subject']
1578
+ heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1
1579
+ incomingheader = heading_to_searchDict['head above 1']
1580
+
1581
+ print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum)
1582
+ done = False
1583
+ collecting = False
1584
+ collected_lines = []
1585
+ page_highlights = {}
1586
+ current_bbox = {}
1587
+ last_y1s = {}
1588
+ mainHeader = ''
1589
+ subHeader = ''
1590
+ matched_header_line_norm = heading_to_search
1591
+ break_collecting = False
1592
+ heading_norm = normalize_text(heading_to_search)
1593
+
1594
+ for page_num in range(heading_to_searchPageNum,len(doc)):
1595
+ print('hereeeeeeeeeeeeeee1')
1596
+ if page_num in toc_pages:
1597
+ continue
1598
+ if break_collecting:
1599
+ break
1600
+ page=doc[page_num]
1601
+ page_height = page.rect.height
1602
+ blocks = page.get_text("dict")["blocks"]
1603
+
1604
+ for block in blocks:
1605
+ if break_collecting:
1606
+ break
1607
+
1608
+ lines = block.get("lines", [])
1609
+ i = 0
1610
+ while i < len(lines):
1611
+ if break_collecting:
1612
+ break
1613
+
1614
+ spans = lines[i].get("spans", [])
1615
+ if not spans:
1616
+ i += 1
1617
+ continue
1618
+
1619
+ y0 = spans[0]["bbox"][1]
1620
+ y1 = spans[0]["bbox"][3]
1621
+ if y0 < top_margin or y1 > (page_height - bottom_margin):
1622
+ i += 1
1623
+ continue
1624
+
1625
+ line_text = get_spaced_text_from_spans(spans).lower()
1626
+ line_text_norm = normalize_text(line_text)
1627
+
1628
+ # Combine with next line if available
1629
+ if i + 1 < len(lines):
1630
+ next_spans = lines[i + 1].get("spans", [])
1631
+ next_line_text = get_spaced_text_from_spans(next_spans).lower()
1632
+ combined_line_norm = normalize_text(line_text + " " + next_line_text)
1633
+ else:
1634
+ combined_line_norm = line_text_norm
1635
+ # Optimized header matching
1636
+ existsfull = (
1637
+ ( combined_line_norm in allchildrenheaders_set or
1638
+ combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
1639
+ )
1640
+
1641
+ # New word-based matching
1642
+ current_line_words = set(combined_line_norm.split())
1643
+ heading_words = set(heading_norm.split())
1644
+ all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
1645
+
1646
+ substring_match = (
1647
+ heading_norm in combined_line_norm or
1648
+ combined_line_norm in heading_norm or
1649
+ all_words_match # Include the new word-based matching
1650
+ )
1651
+
1652
+ if (substring_match and existsfull and not collecting and
1653
+ len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
1654
+
1655
+ # Check header conditions more efficiently
1656
+ header_spans = [
1657
+ span for span in spans
1658
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
1659
+ # and span['size'] >= subsubheaderFontSize
1660
+ and span['size'] < mainHeaderFontSize)
1661
+ ]
1662
+ if header_spans:
1663
+ collecting = True
1664
+ matched_header_font_size = max(span["size"] for span in header_spans)
1665
+ print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
1666
+
1667
+ collected_lines.append(line_text)
1668
+ valid_spans = [span for span in spans if span.get("bbox")]
1669
+
1670
+ if valid_spans:
1671
+ x0s = [span["bbox"][0] for span in valid_spans]
1672
+ x1s = [span["bbox"][2] for span in valid_spans]
1673
+ y0s = [span["bbox"][1] for span in valid_spans]
1674
+ y1s = [span["bbox"][3] for span in valid_spans]
1675
+
1676
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1677
+
1678
+ if page_num in current_bbox:
1679
+ cb = current_bbox[page_num]
1680
+ current_bbox[page_num] = [
1681
+ min(cb[0], header_bbox[0]),
1682
+ min(cb[1], header_bbox[1]),
1683
+ max(cb[2], header_bbox[2]),
1684
+ max(cb[3], header_bbox[3])
1685
+ ]
1686
+ else:
1687
+ current_bbox[page_num] = header_bbox
1688
+ last_y1s[page_num] = header_bbox[3]
1689
+ x0, y0, x1, y1 = header_bbox
1690
+
1691
+ zoom = 200
1692
+ left = int(x0)
1693
+ top = int(y0)
1694
+ zoom_str = f"{zoom},{left},{top}"
1695
+ pageNumberFound = page_num + 1
1696
+
1697
+ # Build the query parameters
1698
+ params = {
1699
+ 'pdfLink': pdf_path, # Your PDF link
1700
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
1701
+ }
1702
+
1703
+ # URL encode each parameter
1704
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1705
+
1706
+ # Construct the final encoded link
1707
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1708
+
1709
+ # Correctly construct the final URL with page and zoom
1710
+ final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1711
+
1712
+ # Get current date and time
1713
+ now = datetime.now()
1714
+
1715
+ # Format the output
1716
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1717
+ # Optionally, add the URL to a DataFrame
1718
+ new_url= final_url
1719
+ if type(heading_to_searchDict) != str:
1720
+ heading_to_searchDict['NBSLink']=new_url
1721
+ newjsonList.append(heading_to_searchDict)
1722
+ print("Final URL:", final_url)
1723
+ i += 2
1724
+ continue
1725
+ else:
1726
+ if (substring_match and not collecting and
1727
+ len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
1728
+
1729
+ # Calculate word match percentage
1730
+ word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
1731
+
1732
+ # Check if at least 70% of header words exist in this line
1733
+ meets_word_threshold = word_match_percent >= 100
1734
+
1735
+ # Check header conditions (including word threshold)
1736
+ header_spans = [
1737
+ span for span in spans
1738
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
1739
+ # and span['size'] >= subsubheaderFontSize
1740
+ and span['size'] < mainHeaderFontSize)
1741
+ ]
1742
+
1743
+ if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
1744
+ collecting = True
1745
+ matched_header_font_size = max(span["size"] for span in header_spans)
1746
+ print(f"📥 Start collecting after header: {combined_line_norm} "
1747
+ f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
1748
+
1749
+ collected_lines.append(line_text)
1750
+ valid_spans = [span for span in spans if span.get("bbox")]
1751
+
1752
+ if valid_spans:
1753
+ x0s = [span["bbox"][0] for span in valid_spans]
1754
+ x1s = [span["bbox"][2] for span in valid_spans]
1755
+ y0s = [span["bbox"][1] for span in valid_spans]
1756
+ y1s = [span["bbox"][3] for span in valid_spans]
1757
+
1758
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1759
+
1760
+ if page_num in current_bbox:
1761
+ cb = current_bbox[page_num]
1762
+ current_bbox[page_num] = [
1763
+ min(cb[0], header_bbox[0]),
1764
+ min(cb[1], header_bbox[1]),
1765
+ max(cb[2], header_bbox[2]),
1766
+ max(cb[3], header_bbox[3])
1767
+ ]
1768
+ else:
1769
+ current_bbox[page_num] = header_bbox
1770
+
1771
+ last_y1s[page_num] = header_bbox[3]
1772
+ x0, y0, x1, y1 = header_bbox
1773
+ zoom = 200
1774
+ left = int(x0)
1775
+ top = int(y0)
1776
+ zoom_str = f"{zoom},{left},{top}"
1777
+ pageNumberFound = page_num + 1
1778
+
1779
+ # Build the query parameters
1780
+ params = {
1781
+ 'pdfLink': pdf_path, # Your PDF link
1782
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
1783
+ }
1784
+
1785
+ # URL encode each parameter
1786
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1787
+
1788
+ # Construct the final encoded link
1789
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1790
+
1791
+ # Correctly construct the final URL with page and zoom
1792
+ final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1793
+ new_url= final_url
1794
+ if type(heading_to_searchDict) != str:
1795
+ heading_to_searchDict['NBSLink']=new_url
1796
+ newjsonList.append(heading_to_searchDict)
1797
+ print("Final URL:", final_url)
1798
+ i += 2
1799
+ continue
1800
+ if collecting:
1801
+ norm_line = normalize_text(line_text)
1802
+
1803
+ # Optimized URL check
1804
+ if url_pattern.match(norm_line):
1805
+ line_is_header = False
1806
+ else:
1807
+ line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
1808
+
1809
+ if line_is_header:
1810
+ header_font_size = max(span["size"] for span in spans)
1811
+ is_probably_real_header = (
1812
+ header_font_size >= matched_header_font_size and
1813
+ is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
1814
+ len(line_text.strip()) > 2
1815
+ )
1816
+
1817
+ if (norm_line != matched_header_line_norm and
1818
+ norm_line != heading_norm and
1819
+ is_probably_real_header):
1820
+ if line_text not in heading_norm:
1821
+ print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
1822
+ collecting = False
1823
+ done = True
1824
+ headertoContinue1 = False
1825
+ headertoContinue2=False
1826
+ for page_num, bbox in current_bbox.items():
1827
+ bbox[3] = last_y1s.get(page_num, bbox[3])
1828
+ page_highlights[page_num] = bbox
1829
+
1830
+ if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader :
1831
+ stringtowrite='Not to be billed'
1832
+ else:
1833
+ stringtowrite='To be billed'
1834
+ highlight_boxes(docHighlights, page_highlights,stringtowrite)
1835
+
1836
+ break_collecting = True
1837
+ break
1838
+
1839
+ if break_collecting:
1840
+ break
1841
+
1842
+ collected_lines.append(line_text)
1843
+ valid_spans = [span for span in spans if span.get("bbox")]
1844
+ if valid_spans:
1845
+ x0s = [span["bbox"][0] for span in valid_spans]
1846
+ x1s = [span["bbox"][2] for span in valid_spans]
1847
+ y0s = [span["bbox"][1] for span in valid_spans]
1848
+ y1s = [span["bbox"][3] for span in valid_spans]
1849
+
1850
+ line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1851
+
1852
+ if page_num in current_bbox:
1853
+ cb = current_bbox[page_num]
1854
+ current_bbox[page_num] = [
1855
+ min(cb[0], line_bbox[0]),
1856
+ min(cb[1], line_bbox[1]),
1857
+ max(cb[2], line_bbox[2]),
1858
+ max(cb[3], line_bbox[3])
1859
+ ]
1860
+ else:
1861
+ current_bbox[page_num] = line_bbox
1862
+
1863
+ last_y1s[page_num] = line_bbox[3]
1864
+ i += 1
1865
+
1866
+ if not done:
1867
+ for page_num, bbox in current_bbox.items():
1868
+ bbox[3] = last_y1s.get(page_num, bbox[3])
1869
+ page_highlights[page_num] = bbox
1870
+ if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader :
1871
+ stringtowrite='Not to be billed'
1872
+ else:
1873
+ stringtowrite='To be billed'
1874
+ highlight_boxes(docHighlights, page_highlights,stringtowrite)
1875
+
1876
+ # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
1877
+
1878
+ pdf_bytes = BytesIO()
1879
+ docHighlights.save(pdf_bytes)
1880
+ return pdf_bytes.getvalue(), docHighlights , newjsonList
1881
+
1882
+
1883
+
1884
+
1885
+ top_margin = 70
1886
+ bottom_margin = 50
1887
+ headertoContinue1 = False
1888
+ headertoContinue2=False
1889
+
1890
+ parsed_url = urlparse(pdf_path)
1891
+ filename = os.path.basename(parsed_url.path)
1892
+ filename = unquote(filename) # decode URL-encoded characters
1893
+
1894
+ # Optimized URL handling
1895
+ if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
1896
+ pdf_path = pdf_path.replace('dl=0', 'dl=1')
1897
+
1898
+ # Cache frequently used values
1899
+ response = requests.get(pdf_path)
1900
+ pdf_content = BytesIO(response.content)
1901
+ if not pdf_content:
1902
+ raise ValueError("No valid PDF content found.")
1903
+
1904
+ doc = fitz.open(stream=pdf_content, filetype="pdf")
1905
+ docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
1906
+ most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
1907
+
1908
+ # Precompute regex patterns
1909
+ dot_pattern = re.compile(r'\.{3,}')
1910
+ url_pattern = re.compile(r'https?://\S+|www\.\S+')
1911
+
1912
+ def get_toc_page_numbers(doc, max_pages_to_check=15):
1913
+ toc_pages = []
1914
+ for page_num in range(min(len(doc), max_pages_to_check)):
1915
+ page = doc.load_page(page_num)
1916
+ blocks = page.get_text("dict")["blocks"]
1917
+
1918
+ dot_line_count = 0
1919
+ for block in blocks:
1920
+ for line in block.get("lines", []):
1921
+ line_text = get_spaced_text_from_spans(line["spans"]).strip()
1922
+ if dot_pattern.search(line_text):
1923
+ dot_line_count += 1
1924
+
1925
+ if dot_line_count >= 3:
1926
+ toc_pages.append(page_num)
1927
+
1928
+ return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
1929
+
1930
+ toc_pages = get_toc_page_numbers(doc)
1931
+
1932
+ headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
1933
+ doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1934
+ )
1935
+
1936
+ hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
1937
+ listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
1938
+ print('listofHeaderstoMarkup',listofHeaderstoMarkup)
1939
+ # Precompute all children headers once
1940
+ allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
1941
+ allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
1942
+
1943
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
1944
+ dictionaryNBS={}
1945
+ data_list_JSON = []
1946
+
1947
+ if len(top_3_font_sizes)==3:
1948
+ mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
1949
+ elif len(top_3_font_sizes)==2:
1950
+ mainHeaderFontSize= top_3_font_sizes[0]
1951
+ subHeaderFontSize= top_3_font_sizes[1]
1952
+ subsubheaderFontSize= top_3_font_sizes[1]
1953
+
1954
+ print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
1955
+
1956
+ # Preload all pages to avoid repeated loading
1957
+ # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
1958
+
1959
+ for heading_to_searchDict, paths in listofHeaderstoMarkup:
1960
+ heading_to_search = heading_to_searchDict['text']
1961
+ heading_to_searchPageNum = heading_to_searchDict['page']
1962
+
1963
+ print('headertosearch', heading_to_search)
1964
+
1965
+ # Initialize variables
1966
+ headertoContinue1 = False
1967
+ headertoContinue2 = False
1968
+ matched_header_line = None
1969
+ done = False
1970
+ collecting = False
1971
+ collected_lines = []
1972
+ page_highlights = {}
1973
+ current_bbox = {}
1974
+ last_y1s = {}
1975
+ mainHeader = ''
1976
+ subHeader = ''
1977
+ matched_header_line_norm = heading_to_search
1978
+ break_collecting = False
1979
+ heading_norm = normalize_text(heading_to_search)
1980
+ paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1981
+
1982
+ for page_num in range(heading_to_searchPageNum,len(doc)):
1983
+ if page_num in toc_pages:
1984
+ continue
1985
+ if break_collecting:
1986
+ break
1987
+ page=doc[page_num]
1988
+ page_height = page.rect.height
1989
+ blocks = page.get_text("dict")["blocks"]
1990
+
1991
+ for block in blocks:
1992
+ if break_collecting:
1993
+ break
1994
+
1995
+ lines = block.get("lines", [])
1996
+ i = 0
1997
+ while i < len(lines):
1998
+ if break_collecting:
1999
+ break
2000
+
2001
+ spans = lines[i].get("spans", [])
2002
+ if not spans:
2003
+ i += 1
2004
+ continue
2005
+
2006
+ y0 = spans[0]["bbox"][1]
2007
+ y1 = spans[0]["bbox"][3]
2008
+ if y0 < top_margin or y1 > (page_height - bottom_margin):
2009
+ i += 1
2010
+ continue
2011
+
2012
+ line_text = get_spaced_text_from_spans(spans).lower()
2013
+ line_text_norm = normalize_text(line_text)
2014
+
2015
+ # Combine with next line if available
2016
+ if i + 1 < len(lines):
2017
+ next_spans = lines[i + 1].get("spans", [])
2018
+ next_line_text = get_spaced_text_from_spans(next_spans).lower()
2019
+ combined_line_norm = normalize_text(line_text + " " + next_line_text)
2020
+ else:
2021
+ combined_line_norm = line_text_norm
2022
+
2023
+ # Check if we should continue processing
2024
+ if combined_line_norm and combined_line_norm in paths[0]:
2025
+ print(combined_line_norm)
2026
+ headertoContinue1 = combined_line_norm
2027
+ if combined_line_norm and combined_line_norm in paths[-2]:
2028
+ print(combined_line_norm)
2029
+ headertoContinue2 = combined_line_norm
2030
+ if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2031
+ stringtowrite='Not to be billed'
2032
+ else:
2033
+ stringtowrite='To be billed'
2034
+ # Optimized header matching
2035
+ existsfull = (
2036
+ ( combined_line_norm in allchildrenheaders_set or
2037
+ combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2038
+ )
2039
+
2040
+ # New word-based matching
2041
+ current_line_words = set(combined_line_norm.split())
2042
+ heading_words = set(heading_norm.split())
2043
+ all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2044
+
2045
+ substring_match = (
2046
+ heading_norm in combined_line_norm or
2047
+ combined_line_norm in heading_norm or
2048
+ all_words_match # Include the new word-based matching
2049
+ )
2050
+ # substring_match = (
2051
+ # heading_norm in combined_line_norm or
2052
+ # combined_line_norm in heading_norm
2053
+ # )
2054
+
2055
+ if (substring_match and existsfull and not collecting and
2056
+ len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2057
+
2058
+ # Check header conditions more efficiently
2059
+ header_spans = [
2060
+ span for span in spans
2061
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2062
+ # and span['size'] >= subsubheaderFontSize
2063
+ and span['size'] < mainHeaderFontSize)
2064
+ ]
2065
+ if header_spans:
2066
+ collecting = True
2067
+ matched_header_font_size = max(span["size"] for span in header_spans)
2068
+ print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
2069
+
2070
+ collected_lines.append(line_text)
2071
+ valid_spans = [span for span in spans if span.get("bbox")]
2072
+
2073
+ if valid_spans:
2074
+ x0s = [span["bbox"][0] for span in valid_spans]
2075
+ x1s = [span["bbox"][2] for span in valid_spans]
2076
+ y0s = [span["bbox"][1] for span in valid_spans]
2077
+ y1s = [span["bbox"][3] for span in valid_spans]
2078
+
2079
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2080
+
2081
+ if page_num in current_bbox:
2082
+ cb = current_bbox[page_num]
2083
+ current_bbox[page_num] = [
2084
+ min(cb[0], header_bbox[0]),
2085
+ min(cb[1], header_bbox[1]),
2086
+ max(cb[2], header_bbox[2]),
2087
+ max(cb[3], header_bbox[3])
2088
+ ]
2089
+ else:
2090
+ current_bbox[page_num] = header_bbox
2091
+ last_y1s[page_num] = header_bbox[3]
2092
+ x0, y0, x1, y1 = header_bbox
2093
+
2094
+ zoom = 200
2095
+ left = int(x0)
2096
+ top = int(y0)
2097
+ zoom_str = f"{zoom},{left},{top}"
2098
+ pageNumberFound = page_num + 1
2099
+
2100
+ # Build the query parameters
2101
+ params = {
2102
+ 'pdfLink': pdf_path, # Your PDF link
2103
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
2104
+ }
2105
+
2106
+ # URL encode each parameter
2107
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2108
+
2109
+ # Construct the final encoded link
2110
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2111
+
2112
+ # Correctly construct the final URL with page and zoom
2113
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2114
+
2115
+ # Get current date and time
2116
+ now = datetime.now()
2117
+
2118
+ # Format the output
2119
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2120
+ # Optionally, add the URL to a DataFrame
2121
+
2122
+
2123
+ data_entry = {
2124
+ "NBSLink": final_url,
2125
+ "Subject": heading_to_search,
2126
+ "Page": str(pageNumberFound),
2127
+ "Author": "ADR",
2128
+ "Creation Date": formatted_time,
2129
+ "Layer": "Initial",
2130
+ "Code": stringtowrite,
2131
+ "head above 1": paths[-2],
2132
+ "head above 2": paths[0],
2133
+ "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2134
+ }
2135
+ data_list_JSON.append(data_entry)
2136
+
2137
+ # Convert list to JSON
2138
+ json_output = json.dumps(data_list_JSON, indent=4)
2139
+
2140
+ print("Final URL:", final_url)
2141
+ i += 2
2142
+ continue
2143
+ else:
2144
+ if (substring_match and not collecting and
2145
+ len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
2146
+
2147
+ # Calculate word match percentage
2148
+ word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
2149
+
2150
+ # Check if at least 70% of header words exist in this line
2151
+ meets_word_threshold = word_match_percent >= 100
2152
+
2153
+ # Check header conditions (including word threshold)
2154
+ header_spans = [
2155
+ span for span in spans
2156
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2157
+ # and span['size'] >= subsubheaderFontSize
2158
+ and span['size'] < mainHeaderFontSize)
2159
+ ]
2160
+
2161
+ if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
2162
+ collecting = True
2163
+ matched_header_font_size = max(span["size"] for span in header_spans)
2164
+ print(f"📥 Start collecting after header: {combined_line_norm} "
2165
+ f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)")
2166
+
2167
+ collected_lines.append(line_text)
2168
+ valid_spans = [span for span in spans if span.get("bbox")]
2169
+
2170
+ if valid_spans:
2171
+ x0s = [span["bbox"][0] for span in valid_spans]
2172
+ x1s = [span["bbox"][2] for span in valid_spans]
2173
+ y0s = [span["bbox"][1] for span in valid_spans]
2174
+ y1s = [span["bbox"][3] for span in valid_spans]
2175
+
2176
+ header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2177
+
2178
+ if page_num in current_bbox:
2179
+ cb = current_bbox[page_num]
2180
+ current_bbox[page_num] = [
2181
+ min(cb[0], header_bbox[0]),
2182
+ min(cb[1], header_bbox[1]),
2183
+ max(cb[2], header_bbox[2]),
2184
+ max(cb[3], header_bbox[3])
2185
+ ]
2186
+ else:
2187
+ current_bbox[page_num] = header_bbox
2188
+
2189
+ last_y1s[page_num] = header_bbox[3]
2190
+ x0, y0, x1, y1 = header_bbox
2191
+ zoom = 200
2192
+ left = int(x0)
2193
+ top = int(y0)
2194
+ zoom_str = f"{zoom},{left},{top}"
2195
+ pageNumberFound = page_num + 1
2196
+
2197
+ # Build the query parameters
2198
+ params = {
2199
+ 'pdfLink': pdf_path, # Your PDF link
2200
+ 'keyword': heading_to_search, # Your keyword (could be a string or list)
2201
+ }
2202
+
2203
+ # URL encode each parameter
2204
+ encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
2205
+
2206
+ # Construct the final encoded link
2207
+ encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2208
+
2209
+ # Correctly construct the final URL with page and zoom
2210
+ final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2211
+
2212
+ # Get current date and time
2213
+ now = datetime.now()
2214
+
2215
+ # Format the output
2216
+ formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
2217
+ # Optionally, add the URL to a DataFrame
2218
+
2219
+
2220
+ data_entry = {
2221
+ "NBSLink": final_url,
2222
+ "Subject": heading_to_search,
2223
+ "Page": str(pageNumberFound),
2224
+ "Author": "ADR",
2225
+ "Creation Date": formatted_time,
2226
+ "Layer": "Initial",
2227
+ "Code": stringtowrite,
2228
+ "head above 1": paths[-2],
2229
+ "head above 2": paths[0],
2230
+ "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename
2231
+ }
2232
+ data_list_JSON.append(data_entry)
2233
+
2234
+ # Convert list to JSON
2235
+ json_output = json.dumps(data_list_JSON, indent=4)
2236
+
2237
+ print("Final URL:", final_url)
2238
+ i += 2
2239
+ continue
2240
+ if collecting:
2241
+ norm_line = normalize_text(line_text)
2242
+
2243
+ # Optimized URL check
2244
+ if url_pattern.match(norm_line):
2245
+ line_is_header = False
2246
+ else:
2247
+ line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans)
2248
+
2249
+ if line_is_header:
2250
+ header_font_size = max(span["size"] for span in spans)
2251
+ is_probably_real_header = (
2252
+ header_font_size >= matched_header_font_size and
2253
+ is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
2254
+ len(line_text.strip()) > 2
2255
+ )
2256
+
2257
+ if (norm_line != matched_header_line_norm and
2258
+ norm_line != heading_norm and
2259
+ is_probably_real_header):
2260
+ if line_text not in heading_norm:
2261
+ print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})")
2262
+ collecting = False
2263
+ done = True
2264
+ headertoContinue1 = False
2265
+ headertoContinue2=False
2266
+ for page_num, bbox in current_bbox.items():
2267
+ bbox[3] = last_y1s.get(page_num, bbox[3])
2268
+ page_highlights[page_num] = bbox
2269
+ highlight_boxes(docHighlights, page_highlights,stringtowrite)
2270
+
2271
+ break_collecting = True
2272
+ break
2273
+
2274
+ if break_collecting:
2275
+ break
2276
+
2277
+ collected_lines.append(line_text)
2278
+ valid_spans = [span for span in spans if span.get("bbox")]
2279
+ if valid_spans:
2280
+ x0s = [span["bbox"][0] for span in valid_spans]
2281
+ x1s = [span["bbox"][2] for span in valid_spans]
2282
+ y0s = [span["bbox"][1] for span in valid_spans]
2283
+ y1s = [span["bbox"][3] for span in valid_spans]
2284
+
2285
+ line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
2286
+
2287
+ if page_num in current_bbox:
2288
+ cb = current_bbox[page_num]
2289
+ current_bbox[page_num] = [
2290
+ min(cb[0], line_bbox[0]),
2291
+ min(cb[1], line_bbox[1]),
2292
+ max(cb[2], line_bbox[2]),
2293
+ max(cb[3], line_bbox[3])
2294
+ ]
2295
+ else:
2296
+ current_bbox[page_num] = line_bbox
2297
+
2298
+ last_y1s[page_num] = line_bbox[3]
2299
+ i += 1
2300
+
2301
+ if not done:
2302
+ for page_num, bbox in current_bbox.items():
2303
+ bbox[3] = last_y1s.get(page_num, bbox[3])
2304
+ page_highlights[page_num] = bbox
2305
+ if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2306
+ stringtowrite='Not to be billed'
2307
+ else:
2308
+ stringtowrite='To be billed'
2309
+ highlight_boxes(docHighlights, page_highlights,stringtowrite)
2310
+
2311
+ # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
2312
+
2313
+ pdf_bytes = BytesIO()
2314
+ docHighlights.save(pdf_bytes)
2315
+ print('JSONN',json_output)
2316
+ return pdf_bytes.getvalue(), docHighlights , json_output
2317
+
2318
+
2319
+
2320
+
2321
+ ########################################################################################################################################################
2322
+ ########################################################################################################################################################
2323
+
2324
+
2325
+ def extract_section_under_header_tobebilledOnly(pdf_path):
2326
+ Alltext_tobebilled=''
2327
+ top_margin = 70
2328
+ bottom_margin = 50
2329
+ headertoContinue1 = False
2330
+ headertoContinue2=False
2331
+
2332
+ parsed_url = urlparse(pdf_path)
2333
+ filename = os.path.basename(parsed_url.path)
2334
+ filename = unquote(filename) # decode URL-encoded characters
2335
+
2336
+ # Optimized URL handling
2337
+ if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
2338
+ pdf_path = pdf_path.replace('dl=0', 'dl=1')
2339
+
2340
+ # Cache frequently used values
2341
+ response = requests.get(pdf_path)
2342
+ pdf_content = BytesIO(response.content)
2343
+ if not pdf_content:
2344
+ raise ValueError("No valid PDF content found.")
2345
+
2346
+ doc = fitz.open(stream=pdf_content, filetype="pdf")
2347
+ docHighlights = fitz.open(stream=pdf_content, filetype="pdf")
2348
+ most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
2349
+
2350
+ # Precompute regex patterns
2351
+ dot_pattern = re.compile(r'\.{3,}')
2352
+ url_pattern = re.compile(r'https?://\S+|www\.\S+')
2353
+
2354
+ def get_toc_page_numbers(doc, max_pages_to_check=15):
2355
+ toc_pages = []
2356
+ for page_num in range(min(len(doc), max_pages_to_check)):
2357
+ page = doc.load_page(page_num)
2358
+ blocks = page.get_text("dict")["blocks"]
2359
+
2360
+ dot_line_count = 0
2361
+ for block in blocks:
2362
+ for line in block.get("lines", []):
2363
+ line_text = get_spaced_text_from_spans(line["spans"]).strip()
2364
+ if dot_pattern.search(line_text):
2365
+ dot_line_count += 1
2366
+
2367
+ if dot_line_count >= 3:
2368
+ toc_pages.append(page_num)
2369
+
2370
+ return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages
2371
+
2372
+ toc_pages = get_toc_page_numbers(doc)
2373
+
2374
+ headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers(
2375
+ doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
2376
+ )
2377
+
2378
+ hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
2379
+ listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
2380
+ print('listofHeaderstoMarkup',listofHeaderstoMarkup)
2381
+ # Precompute all children headers once
2382
+ allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
2383
+ allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
2384
+
2385
+ df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"])
2386
+ dictionaryNBS={}
2387
+ data_list_JSON = []
2388
+
2389
+ if len(top_3_font_sizes)==3:
2390
+ mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes
2391
+ elif len(top_3_font_sizes)==2:
2392
+ mainHeaderFontSize= top_3_font_sizes[0]
2393
+ subHeaderFontSize= top_3_font_sizes[1]
2394
+ subsubheaderFontSize= top_3_font_sizes[1]
2395
+
2396
+ print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages)
2397
+
2398
+ # Preload all pages to avoid repeated loading
2399
+ # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
2400
+
2401
+ for heading_to_searchDict, paths in listofHeaderstoMarkup:
2402
+ heading_to_search = heading_to_searchDict['text']
2403
+ heading_to_searchPageNum = heading_to_searchDict['page']
2404
+
2405
+ print('headertosearch', heading_to_search)
2406
+
2407
+ # Initialize variables
2408
+ headertoContinue1 = False
2409
+ headertoContinue2 = False
2410
+ matched_header_line = None
2411
+ done = False
2412
+ collecting = False
2413
+ collected_lines = []
2414
+ page_highlights = {}
2415
+ current_bbox = {}
2416
+ last_y1s = {}
2417
+ mainHeader = ''
2418
+ subHeader = ''
2419
+ matched_header_line_norm = heading_to_search
2420
+ break_collecting = False
2421
+ heading_norm = normalize_text(heading_to_search)
2422
+ paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
2423
+
2424
+ for page_num in range(heading_to_searchPageNum,len(doc)):
2425
+ if page_num in toc_pages:
2426
+ continue
2427
+ if break_collecting:
2428
+ break
2429
+ page=doc[page_num]
2430
+ page_height = page.rect.height
2431
+ blocks = page.get_text("dict")["blocks"]
2432
+
2433
+ for block in blocks:
2434
+ if break_collecting:
2435
+ break
2436
+
2437
+ lines = block.get("lines", [])
2438
+ i = 0
2439
+ while i < len(lines):
2440
+ if break_collecting:
2441
+ break
2442
+
2443
+ spans = lines[i].get("spans", [])
2444
+ if not spans:
2445
+ i += 1
2446
+ continue
2447
+
2448
+ y0 = spans[0]["bbox"][1]
2449
+ y1 = spans[0]["bbox"][3]
2450
+ if y0 < top_margin or y1 > (page_height - bottom_margin):
2451
+ i += 1
2452
+ continue
2453
+
2454
+ line_text = get_spaced_text_from_spans(spans).lower()
2455
+ line_text_norm = normalize_text(line_text)
2456
+
2457
+ # Combine with next line if available
2458
+ if i + 1 < len(lines):
2459
+ next_spans = lines[i + 1].get("spans", [])
2460
+ next_line_text = get_spaced_text_from_spans(next_spans).lower()
2461
+ combined_line_norm = normalize_text(line_text + " " + next_line_text)
2462
+ else:
2463
+ combined_line_norm = line_text_norm
2464
+
2465
+ # Check if we should continue processing
2466
+ if combined_line_norm and combined_line_norm in paths[0]:
2467
+ print(combined_line_norm)
2468
+ headertoContinue1 = combined_line_norm
2469
+ if combined_line_norm and combined_line_norm in paths[-2]:
2470
+ print(combined_line_norm)
2471
+ headertoContinue2 = combined_line_norm
2472
+ if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2473
+ stringtowrite='Not to be billed'
2474
+ else:
2475
+ stringtowrite='To be billed'
2476
+ # Optimized header matching
2477
+ existsfull = (
2478
+ ( combined_line_norm in allchildrenheaders_set or
2479
+ combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
2480
+ )
2481
+
2482
+ # New word-based matching
2483
+ current_line_words = set(combined_line_norm.split())
2484
+ heading_words = set(heading_norm.split())
2485
+ all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
2486
+
2487
+ substring_match = (
2488
+ heading_norm in combined_line_norm or
2489
+ combined_line_norm in heading_norm or
2490
+ all_words_match # Include the new word-based matching
2491
+ )
2492
+ # substring_match = (
2493
+ # heading_norm in combined_line_norm or
2494
+ # combined_line_norm in heading_norm
2495
+ # )
2496
 
2497
+ if (substring_match and existsfull and not collecting and
2498
+ len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
2499
+
2500
+ # Check header conditions more efficiently
2501
+ header_spans = [
2502
+ span for span in spans
2503
+ if (is_header(span, most_common_font_size, most_common_color, most_common_font)
2504
+ # and span['size'] >= subsubheaderFontSize
2505
+ and span['size'] < mainHeaderFontSize)
2506
+ ]
2507
+ if header_spans and stringtowrite.startswith('To'):
2508
+ Alltext_tobebilled+=combined_line_norm
2509
  collecting = True
2510
  matched_header_font_size = max(span["size"] for span in header_spans)
2511
  print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})")
 
2602
  ]
2603
 
2604
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
2605
+ Alltext_tobebilled+=combined_line_norm
2606
  collecting = True
2607
  matched_header_font_size = max(span["size"] for span in header_spans)
2608
  print(f"📥 Start collecting after header: {combined_line_norm} "
 
2757
  pdf_bytes = BytesIO()
2758
  docHighlights.save(pdf_bytes)
2759
  print('JSONN',json_output)
2760
+ return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled
2761
 
2762
 
2763