Spaces:
Runtime error
Runtime error
Update InitialMarkups.py
Browse files - InitialMarkups.py +16 -8
InitialMarkups.py
CHANGED
|
@@ -1570,13 +1570,14 @@ def extract_section_under_header_tobebilled2(pdf_path):
|
|
| 1570 |
break_collecting = False
|
| 1571 |
heading_norm = normalize_text(heading_to_search)
|
| 1572 |
paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
|
| 1573 |
-
|
| 1574 |
for page_num in range(heading_to_searchPageNum,len(doc)):
|
| 1575 |
print(heading_to_search)
|
| 1576 |
if paths[0].strip().lower() != currentgroupname.strip().lower():
|
| 1577 |
-
Alltexttobebilled+=
|
| 1578 |
currentgroupname=paths[0]
|
| 1579 |
print(paths[0])
|
|
|
|
|
|
|
| 1580 |
if page_num in toc_pages:
|
| 1581 |
continue
|
| 1582 |
if break_collecting:
|
|
@@ -1628,9 +1629,11 @@ def extract_section_under_header_tobebilled2(pdf_path):
|
|
| 1628 |
stringtowrite='Not to be billed'
|
| 1629 |
else:
|
| 1630 |
stringtowrite='To be billed'
|
| 1631 |
-
if stringtowrite
|
| 1632 |
-
Alltexttobebilled+= combined_line_norm
|
| 1633 |
-
|
|
|
|
|
|
|
| 1634 |
# Optimized header matching
|
| 1635 |
existsfull = (
|
| 1636 |
( combined_line_norm in allchildrenheaders_set or
|
|
@@ -1664,7 +1667,8 @@ def extract_section_under_header_tobebilled2(pdf_path):
|
|
| 1664 |
]
|
| 1665 |
if header_spans:
|
| 1666 |
collecting = True
|
| 1667 |
-
|
|
|
|
| 1668 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1669 |
|
| 1670 |
collected_lines.append(line_text)
|
|
@@ -1759,7 +1763,10 @@ def extract_section_under_header_tobebilled2(pdf_path):
|
|
| 1759 |
|
| 1760 |
if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
|
| 1761 |
collecting = True
|
| 1762 |
-
|
|
|
|
|
|
|
|
|
|
| 1763 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1764 |
|
| 1765 |
collected_lines.append(line_text)
|
|
@@ -1910,6 +1917,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
|
|
| 1910 |
pdf_bytes = BytesIO()
|
| 1911 |
docHighlights.save(pdf_bytes)
|
| 1912 |
|
| 1913 |
-
return pdf_bytes.getvalue(), docHighlights , json_output
|
|
|
|
| 1914 |
|
| 1915 |
|
|
|
|
| 1570 |
break_collecting = False
|
| 1571 |
heading_norm = normalize_text(heading_to_search)
|
| 1572 |
paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
|
|
|
|
| 1573 |
for page_num in range(heading_to_searchPageNum,len(doc)):
|
| 1574 |
print(heading_to_search)
|
| 1575 |
if paths[0].strip().lower() != currentgroupname.strip().lower():
|
| 1576 |
+
Alltexttobebilled+= paths[0] +'\n'
|
| 1577 |
currentgroupname=paths[0]
|
| 1578 |
print(paths[0])
|
| 1579 |
+
|
| 1580 |
+
|
| 1581 |
if page_num in toc_pages:
|
| 1582 |
continue
|
| 1583 |
if break_collecting:
|
|
|
|
| 1629 |
stringtowrite='Not to be billed'
|
| 1630 |
else:
|
| 1631 |
stringtowrite='To be billed'
|
| 1632 |
+
if stringtowrite=='To be billed':
|
| 1633 |
+
# Alltexttobebilled+= combined_line_norm #################################################
|
| 1634 |
+
if matched_header_line_norm in combined_line_norm:
|
| 1635 |
+
Alltexttobebilled+='\n'
|
| 1636 |
+
Alltexttobebilled+= ' '+combined_line_norm
|
| 1637 |
# Optimized header matching
|
| 1638 |
existsfull = (
|
| 1639 |
( combined_line_norm in allchildrenheaders_set or
|
|
|
|
| 1667 |
]
|
| 1668 |
if header_spans:
|
| 1669 |
collecting = True
|
| 1670 |
+
# if stringtowrite=='To be billed':
|
| 1671 |
+
# Alltexttobebilled+='\n'
|
| 1672 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1673 |
|
| 1674 |
collected_lines.append(line_text)
|
|
|
|
| 1763 |
|
| 1764 |
if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
|
| 1765 |
collecting = True
|
| 1766 |
+
if stringtowrite=='To be billed':
|
| 1767 |
+
Alltexttobebilled+='\n'
|
| 1768 |
+
# if stringtowrite=='To be billed':
|
| 1769 |
+
# Alltexttobebilled+= ' '+ combined_line_norm
|
| 1770 |
matched_header_font_size = max(span["size"] for span in header_spans)
|
| 1771 |
|
| 1772 |
collected_lines.append(line_text)
|
|
|
|
| 1917 |
pdf_bytes = BytesIO()
|
| 1918 |
docHighlights.save(pdf_bytes)
|
| 1919 |
|
| 1920 |
+
return pdf_bytes.getvalue(), docHighlights , json_output, Alltexttobebilled
|
| 1921 |
+
|
| 1922 |
|
| 1923 |
|