rawanessam committed on
Commit
cd2c25f
·
verified ·
1 Parent(s): 6ae927d

Update app.py

Browse files

Add 'x' to the hierarchy nodes

Files changed (1) hide show
  1. app.py +78 -75
app.py CHANGED
@@ -98,7 +98,7 @@ def filter_headers_outside_toc(headers, toc_pages):
98
  def headers_with_location(doc, llm_headers):
99
  """
100
  Converts LLM headers into:
101
- [text, font_size, page, y, suggested_level, confidence]
102
  Always include all headers, even if location not found.
103
  """
104
  headersJson = []
@@ -131,7 +131,7 @@ def headers_with_location(doc, llm_headers):
131
  loc["page"],
132
  loc["y"],
133
  h["suggested_level"],
134
-
135
  ]
136
  if entry not in headersJson:
137
  headersJson.append(entry)
@@ -148,10 +148,10 @@ def build_hierarchy_from_llm(headers):
148
  for h in headers:
149
  # print("headerrrrrrrrrrrrrrr", h)
150
 
151
- if len(h) < 5:
152
  continue
153
 
154
- text, size, page, y, level = h
155
 
156
  if level is None:
157
  continue
@@ -164,6 +164,7 @@ def build_hierarchy_from_llm(headers):
164
  node = {
165
  "text": text,
166
  "page": page if page is not None else -1,
 
167
  "y": y if y is not None else -1,
168
  "size": size,
169
  "bold": False,
@@ -1336,6 +1337,8 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1336
  heading_to_search = heading_to_searchDict['text']
1337
  heading_to_searchPageNum = heading_to_searchDict['page']
1338
  paths=heading_to_searchDict['path']
 
 
1339
 
1340
  # Initialize variables
1341
  headertoContinue1 = False
@@ -1368,31 +1371,31 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1368
  page=doc[page_num]
1369
  page_height = page.rect.height
1370
  blocks = page.get_text("dict")["blocks"]
1371
-
1372
  for block in blocks:
1373
  if break_collecting:
1374
  break
1375
-
1376
  lines = block.get("lines", [])
1377
  i = 0
1378
  while i < len(lines):
1379
  if break_collecting:
1380
  break
1381
-
1382
  spans = lines[i].get("spans", [])
1383
  if not spans:
1384
  i += 1
1385
  continue
1386
-
1387
  y0 = spans[0]["bbox"][1]
1388
  y1 = spans[0]["bbox"][3]
1389
  if y0 < top_margin or y1 > (page_height - bottom_margin):
1390
  i += 1
1391
  continue
1392
-
1393
  line_text = get_spaced_text_from_spans(spans).lower()
1394
  line_text_norm = normalize_text(line_text)
1395
-
1396
  # Combine with next line if available
1397
  if i + 1 < len(lines):
1398
  next_spans = lines[i + 1].get("spans", [])
@@ -1427,12 +1430,12 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1427
  ( combined_line_norm in allchildrenheaders_set or
1428
  combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
1429
  )
1430
-
1431
  # New word-based matching
1432
  current_line_words = set(combined_line_norm.split())
1433
  heading_words = set(heading_norm.split())
1434
  all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
1435
-
1436
  substring_match = (
1437
  heading_norm in combined_line_norm or
1438
  combined_line_norm in heading_norm or
@@ -1442,10 +1445,10 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1442
  # heading_norm in combined_line_norm or
1443
  # combined_line_norm in heading_norm
1444
  # )
1445
-
1446
  if (substring_match and existsfull and not collecting and
1447
  len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
1448
-
1449
  # Check header conditions more efficiently
1450
  # header_spans = [
1451
  # span for span in spans
@@ -1458,18 +1461,18 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1458
  # if stringtowrite=='To be billed':
1459
  # Alltexttobebilled+='\n'
1460
  # matched_header_font_size = max(span["size"] for span in header_spans)
1461
-
1462
  # collected_lines.append(line_text)
1463
  valid_spans = [span for span in spans if span.get("bbox")]
1464
-
1465
  if valid_spans:
1466
  x0s = [span["bbox"][0] for span in valid_spans]
1467
  x1s = [span["bbox"][2] for span in valid_spans]
1468
  y0s = [span["bbox"][1] for span in valid_spans]
1469
  y1s = [span["bbox"][3] for span in valid_spans]
1470
-
1471
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1472
-
1473
  if page_num in current_bbox:
1474
  cb = current_bbox[page_num]
1475
  current_bbox[page_num] = [
@@ -1482,36 +1485,36 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1482
  current_bbox[page_num] = header_bbox
1483
  last_y1s[page_num] = header_bbox[3]
1484
  x0, y0, x1, y1 = header_bbox
1485
-
1486
  zoom = 200
1487
  left = int(x0)
1488
  top = int(y0)
1489
  zoom_str = f"{zoom},{left},{top}"
1490
  pageNumberFound = page_num + 1
1491
-
1492
  # Build the query parameters
1493
  params = {
1494
  'pdfLink': pdf_path, # Your PDF link
1495
  'keyword': heading_to_search, # Your keyword (could be a string or list)
1496
  }
1497
-
1498
  # URL encode each parameter
1499
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1500
-
1501
  # Construct the final encoded link
1502
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1503
-
1504
  # Correctly construct the final URL with page and zoom
1505
  # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1506
-
1507
  # Get current date and time
1508
  now = datetime.now()
1509
-
1510
  # Format the output
1511
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1512
  # Optionally, add the URL to a DataFrame
1513
-
1514
-
1515
  data_entry = {
1516
  "PDF Name":filename,
1517
  "NBSLink": zoom_str,
@@ -1540,13 +1543,13 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1540
  else:
1541
  if (substring_match and not collecting and
1542
  len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
1543
-
1544
  # Calculate word match percentage
1545
  word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
1546
-
1547
  # Check that 100% of header words exist in this line (threshold below is >= 100)
1548
  meets_word_threshold = word_match_percent >= 100
1549
-
1550
  # Check header conditions (including word threshold)
1551
  # header_spans = [
1552
  # span for span in spans
@@ -1554,7 +1557,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1554
  # # and span['size'] >= subsubheaderFontSize
1555
  # and span['size'] < mainHeaderFontSize)
1556
  # ]
1557
-
1558
  if (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
1559
  collecting = True
1560
  if stringtowrite=='To be billed':
@@ -1565,15 +1568,15 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1565
 
1566
  collected_lines.append(line_text)
1567
  valid_spans = [span for span in spans if span.get("bbox")]
1568
-
1569
  if valid_spans:
1570
  x0s = [span["bbox"][0] for span in valid_spans]
1571
  x1s = [span["bbox"][2] for span in valid_spans]
1572
  y0s = [span["bbox"][1] for span in valid_spans]
1573
  y1s = [span["bbox"][3] for span in valid_spans]
1574
-
1575
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1576
-
1577
  if page_num in current_bbox:
1578
  cb = current_bbox[page_num]
1579
  current_bbox[page_num] = [
@@ -1584,7 +1587,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1584
  ]
1585
  else:
1586
  current_bbox[page_num] = header_bbox
1587
-
1588
  last_y1s[page_num] = header_bbox[3]
1589
  x0, y0, x1, y1 = header_bbox
1590
  zoom = 200
@@ -1592,29 +1595,29 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1592
  top = int(y0)
1593
  zoom_str = f"{zoom},{left},{top}"
1594
  pageNumberFound = page_num + 1
1595
-
1596
  # Build the query parameters
1597
  params = {
1598
  'pdfLink': pdf_path, # Your PDF link
1599
  'keyword': heading_to_search, # Your keyword (could be a string or list)
1600
  }
1601
-
1602
  # URL encode each parameter
1603
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1604
-
1605
  # Construct the final encoded link
1606
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1607
-
1608
  # Correctly construct the final URL with page and zoom
1609
  # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1610
-
1611
  # Get current date and time
1612
  now = datetime.now()
1613
-
1614
  # Format the output
1615
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1616
  # Optionally, add the URL to a DataFrame
1617
-
1618
  logger.info(f"Logging into table")
1619
  data_entry = {
1620
  "PDF Name":filename,
@@ -1644,7 +1647,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1644
  continue
1645
  if collecting:
1646
  norm_line = normalize_text(line_text)
1647
-
1648
  # Optimized URL check
1649
  if url_pattern.match(norm_line):
1650
  line_is_header = False
@@ -1666,7 +1669,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1666
  # is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
1667
  len(line_text.strip()) > 2
1668
  )
1669
-
1670
  if (norm_line != matched_header_line_norm and
1671
  norm_line != heading_norm and
1672
  is_probably_real_header):
@@ -1679,37 +1682,37 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1679
  bbox[3] = last_y1s.get(page_num, bbox[3])
1680
  page_highlights[page_num] = bbox
1681
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
1682
-
1683
  break_collecting = True
1684
  break
1685
-
1686
- if break_collecting:
1687
- break
1688
-
1689
- collected_lines.append(line_text)
1690
- valid_spans = [span for span in spans if span.get("bbox")]
1691
- if valid_spans:
1692
- x0s = [span["bbox"][0] for span in valid_spans]
1693
- x1s = [span["bbox"][2] for span in valid_spans]
1694
- y0s = [span["bbox"][1] for span in valid_spans]
1695
- y1s = [span["bbox"][3] for span in valid_spans]
1696
-
1697
- line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1698
-
1699
- if page_num in current_bbox:
1700
- cb = current_bbox[page_num]
1701
- current_bbox[page_num] = [
1702
- min(cb[0], line_bbox[0]),
1703
- min(cb[1], line_bbox[1]),
1704
- max(cb[2], line_bbox[2]),
1705
- max(cb[3], line_bbox[3])
1706
- ]
1707
- else:
1708
- current_bbox[page_num] = line_bbox
1709
-
1710
- last_y1s[page_num] = line_bbox[3]
1711
  i += 1
1712
-
1713
  if not done:
1714
  for page_num, bbox in current_bbox.items():
1715
  bbox[3] = last_y1s.get(page_num, bbox[3])
@@ -1810,8 +1813,8 @@ def testFunction(pdf_path, model,LLM_prompt):
1810
  heading_to_search = heading_to_searchDict['text']
1811
  heading_to_searchPageNum = heading_to_searchDict['page']
1812
  paths=heading_to_searchDict['path']
1813
- xloc=heading_to_searchDict['x']
1814
- yloc=heading_to_searchDict['y']
1815
 
1816
  # Initialize variables
1817
  headertoContinue1 = False
@@ -2359,4 +2362,4 @@ iface = gr.Interface(
2359
  )
2360
 
2361
  # Launch with debug=True to see errors in the console
2362
- iface.launch(debug=True)
 
98
  def headers_with_location(doc, llm_headers):
99
  """
100
  Converts LLM headers into:
101
+ [text, font_size, page, y, suggested_level, x]
102
  Always include all headers, even if location not found.
103
  """
104
  headersJson = []
 
131
  loc["page"],
132
  loc["y"],
133
  h["suggested_level"],
134
+ loc.get("x", 0), # Add x coordinate
135
  ]
136
  if entry not in headersJson:
137
  headersJson.append(entry)
 
148
  for h in headers:
149
  # print("headerrrrrrrrrrrrrrr", h)
150
 
151
+ if len(h) < 6: # Changed from 5 to 6 since we added 'x'
152
  continue
153
 
154
+ text, size, page, y, level, x = h # Unpack 'x' here
155
 
156
  if level is None:
157
  continue
 
164
  node = {
165
  "text": text,
166
  "page": page if page is not None else -1,
167
+ "x": x if x is not None else -1, # Add this
168
  "y": y if y is not None else -1,
169
  "size": size,
170
  "bold": False,
 
1337
  heading_to_search = heading_to_searchDict['text']
1338
  heading_to_searchPageNum = heading_to_searchDict['page']
1339
  paths=heading_to_searchDict['path']
1340
+ xloc=heading_to_searchDict.get('x', 0) # Use get() with default
1341
+ yloc=heading_to_searchDict.get('y', 0) # Use get() with default
1342
 
1343
  # Initialize variables
1344
  headertoContinue1 = False
 
1371
  page=doc[page_num]
1372
  page_height = page.rect.height
1373
  blocks = page.get_text("dict")["blocks"]
1374
+
1375
  for block in blocks:
1376
  if break_collecting:
1377
  break
1378
+
1379
  lines = block.get("lines", [])
1380
  i = 0
1381
  while i < len(lines):
1382
  if break_collecting:
1383
  break
1384
+
1385
  spans = lines[i].get("spans", [])
1386
  if not spans:
1387
  i += 1
1388
  continue
1389
+
1390
  y0 = spans[0]["bbox"][1]
1391
  y1 = spans[0]["bbox"][3]
1392
  if y0 < top_margin or y1 > (page_height - bottom_margin):
1393
  i += 1
1394
  continue
1395
+
1396
  line_text = get_spaced_text_from_spans(spans).lower()
1397
  line_text_norm = normalize_text(line_text)
1398
+
1399
  # Combine with next line if available
1400
  if i + 1 < len(lines):
1401
  next_spans = lines[i + 1].get("spans", [])
 
1430
  ( combined_line_norm in allchildrenheaders_set or
1431
  combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm
1432
  )
1433
+
1434
  # New word-based matching
1435
  current_line_words = set(combined_line_norm.split())
1436
  heading_words = set(heading_norm.split())
1437
  all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0
1438
+
1439
  substring_match = (
1440
  heading_norm in combined_line_norm or
1441
  combined_line_norm in heading_norm or
 
1445
  # heading_norm in combined_line_norm or
1446
  # combined_line_norm in heading_norm
1447
  # )
1448
+
1449
  if (substring_match and existsfull and not collecting and
1450
  len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ):
1451
+
1452
  # Check header conditions more efficiently
1453
  # header_spans = [
1454
  # span for span in spans
 
1461
  # if stringtowrite=='To be billed':
1462
  # Alltexttobebilled+='\n'
1463
  # matched_header_font_size = max(span["size"] for span in header_spans)
1464
+
1465
  # collected_lines.append(line_text)
1466
  valid_spans = [span for span in spans if span.get("bbox")]
1467
+
1468
  if valid_spans:
1469
  x0s = [span["bbox"][0] for span in valid_spans]
1470
  x1s = [span["bbox"][2] for span in valid_spans]
1471
  y0s = [span["bbox"][1] for span in valid_spans]
1472
  y1s = [span["bbox"][3] for span in valid_spans]
1473
+
1474
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1475
+
1476
  if page_num in current_bbox:
1477
  cb = current_bbox[page_num]
1478
  current_bbox[page_num] = [
 
1485
  current_bbox[page_num] = header_bbox
1486
  last_y1s[page_num] = header_bbox[3]
1487
  x0, y0, x1, y1 = header_bbox
1488
+
1489
  zoom = 200
1490
  left = int(x0)
1491
  top = int(y0)
1492
  zoom_str = f"{zoom},{left},{top}"
1493
  pageNumberFound = page_num + 1
1494
+
1495
  # Build the query parameters
1496
  params = {
1497
  'pdfLink': pdf_path, # Your PDF link
1498
  'keyword': heading_to_search, # Your keyword (could be a string or list)
1499
  }
1500
+
1501
  # URL encode each parameter
1502
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1503
+
1504
  # Construct the final encoded link
1505
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1506
+
1507
  # Correctly construct the final URL with page and zoom
1508
  # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1509
+
1510
  # Get current date and time
1511
  now = datetime.now()
1512
+
1513
  # Format the output
1514
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1515
  # Optionally, add the URL to a DataFrame
1516
+
1517
+
1518
  data_entry = {
1519
  "PDF Name":filename,
1520
  "NBSLink": zoom_str,
 
1543
  else:
1544
  if (substring_match and not collecting and
1545
  len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ):
1546
+
1547
  # Calculate word match percentage
1548
  word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100
1549
+
1550
  # Check that 100% of header words exist in this line (threshold below is >= 100)
1551
  meets_word_threshold = word_match_percent >= 100
1552
+
1553
  # Check header conditions (including word threshold)
1554
  # header_spans = [
1555
  # span for span in spans
 
1557
  # # and span['size'] >= subsubheaderFontSize
1558
  # and span['size'] < mainHeaderFontSize)
1559
  # ]
1560
+
1561
  if (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'):
1562
  collecting = True
1563
  if stringtowrite=='To be billed':
 
1568
 
1569
  collected_lines.append(line_text)
1570
  valid_spans = [span for span in spans if span.get("bbox")]
1571
+
1572
  if valid_spans:
1573
  x0s = [span["bbox"][0] for span in valid_spans]
1574
  x1s = [span["bbox"][2] for span in valid_spans]
1575
  y0s = [span["bbox"][1] for span in valid_spans]
1576
  y1s = [span["bbox"][3] for span in valid_spans]
1577
+
1578
  header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1579
+
1580
  if page_num in current_bbox:
1581
  cb = current_bbox[page_num]
1582
  current_bbox[page_num] = [
 
1587
  ]
1588
  else:
1589
  current_bbox[page_num] = header_bbox
1590
+
1591
  last_y1s[page_num] = header_bbox[3]
1592
  x0, y0, x1, y1 = header_bbox
1593
  zoom = 200
 
1595
  top = int(y0)
1596
  zoom_str = f"{zoom},{left},{top}"
1597
  pageNumberFound = page_num + 1
1598
+
1599
  # Build the query parameters
1600
  params = {
1601
  'pdfLink': pdf_path, # Your PDF link
1602
  'keyword': heading_to_search, # Your keyword (could be a string or list)
1603
  }
1604
+
1605
  # URL encode each parameter
1606
  encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()}
1607
+
1608
  # Construct the final encoded link
1609
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1610
+
1611
  # Correctly construct the final URL with page and zoom
1612
  # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1613
+
1614
  # Get current date and time
1615
  now = datetime.now()
1616
+
1617
  # Format the output
1618
  formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p")
1619
  # Optionally, add the URL to a DataFrame
1620
+
1621
  logger.info(f"Logging into table")
1622
  data_entry = {
1623
  "PDF Name":filename,
 
1647
  continue
1648
  if collecting:
1649
  norm_line = normalize_text(line_text)
1650
+
1651
  # Optimized URL check
1652
  if url_pattern.match(norm_line):
1653
  line_is_header = False
 
1669
  # is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and
1670
  len(line_text.strip()) > 2
1671
  )
1672
+
1673
  if (norm_line != matched_header_line_norm and
1674
  norm_line != heading_norm and
1675
  is_probably_real_header):
 
1682
  bbox[3] = last_y1s.get(page_num, bbox[3])
1683
  page_highlights[page_num] = bbox
1684
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
1685
+
1686
  break_collecting = True
1687
  break
1688
+
1689
+ if break_collecting:
1690
+ break
1691
+
1692
+ collected_lines.append(line_text)
1693
+ valid_spans = [span for span in spans if span.get("bbox")]
1694
+ if valid_spans:
1695
+ x0s = [span["bbox"][0] for span in valid_spans]
1696
+ x1s = [span["bbox"][2] for span in valid_spans]
1697
+ y0s = [span["bbox"][1] for span in valid_spans]
1698
+ y1s = [span["bbox"][3] for span in valid_spans]
1699
+
1700
+ line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]
1701
+
1702
+ if page_num in current_bbox:
1703
+ cb = current_bbox[page_num]
1704
+ current_bbox[page_num] = [
1705
+ min(cb[0], line_bbox[0]),
1706
+ min(cb[1], line_bbox[1]),
1707
+ max(cb[2], line_bbox[2]),
1708
+ max(cb[3], line_bbox[3])
1709
+ ]
1710
+ else:
1711
+ current_bbox[page_num] = line_bbox
1712
+
1713
+ last_y1s[page_num] = line_bbox[3]
1714
  i += 1
1715
+
1716
  if not done:
1717
  for page_num, bbox in current_bbox.items():
1718
  bbox[3] = last_y1s.get(page_num, bbox[3])
 
1813
  heading_to_search = heading_to_searchDict['text']
1814
  heading_to_searchPageNum = heading_to_searchDict['page']
1815
  paths=heading_to_searchDict['path']
1816
+ xloc=heading_to_searchDict.get('x', 0) # Use get() with default
1817
+ yloc=heading_to_searchDict.get('y', 0) # Use get() with default
1818
 
1819
  # Initialize variables
1820
  headertoContinue1 = False
 
2362
  )
2363
 
2364
  # Launch with debug=True to see errors in the console
2365
+ iface.launch(debug=True)