Spaces:
Runtime error
Runtime error
Update InitialMarkups.py
Browse files- InitialMarkups.py +29 -7
InitialMarkups.py
CHANGED
|
@@ -145,18 +145,18 @@ def normalize_text(text):
|
|
| 145 |
def get_spaced_text_from_spans(spans):
|
| 146 |
return normalize_text(" ".join(span["text"].strip() for span in spans))
|
| 147 |
|
|
|
|
| 148 |
def is_header(span, most_common_font_size, most_common_color, most_common_font):
|
| 149 |
fontname = span.get("font", "").lower()
|
| 150 |
# is_italic = "italic" in fontname or "oblique" in fontname
|
| 151 |
is_bold = "bold" in fontname or span.get("bold", False)
|
| 152 |
return (
|
| 153 |
(
|
| 154 |
-
|
| 155 |
-
span["font"].lower() != most_common_font.lower()
|
| 156 |
-
|
| 157 |
)
|
| 158 |
)
|
| 159 |
-
|
| 160 |
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
|
| 161 |
for (p, y) in grouped_dict:
|
| 162 |
if pageNum is not None and p != pageNum:
|
|
@@ -653,6 +653,20 @@ def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set you
|
|
| 653 |
)
|
| 654 |
annot1.update()
|
| 655 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
| 657 |
if path is None:
|
| 658 |
path = []
|
|
@@ -729,8 +743,10 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 729 |
)
|
| 730 |
|
| 731 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
|
|
|
| 732 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 733 |
-
|
|
|
|
| 734 |
# Precompute all children headers once
|
| 735 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 736 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
@@ -752,9 +768,12 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 752 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
| 753 |
|
| 754 |
for heading_to_searchDict, paths in listofHeaderstoMarkup:
|
|
|
|
| 755 |
heading_to_search = heading_to_searchDict['text']
|
| 756 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 757 |
-
|
|
|
|
|
|
|
| 758 |
# Initialize variables
|
| 759 |
headertoContinue1 = False
|
| 760 |
headertoContinue2 = False
|
|
@@ -2457,4 +2476,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
|
|
| 2457 |
combined_json_str = json.dumps(jsonCombined, indent=1)
|
| 2458 |
print(combined_json_str)
|
| 2459 |
return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
|
| 2460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
def get_spaced_text_from_spans(spans):
    """Concatenate the stripped text of each span, space-separated, then normalize.

    Each *span* is expected to be a dict with a "text" key (PyMuPDF-style
    span dicts — TODO confirm against caller).
    """
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
|
| 147 |
|
| 148 |
+
|
| 149 |
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Heuristically decide whether a text span is a header.

    A span counts as a header when it is bold AND it stands out from the
    document body, either by a larger font size or by a different font family.

    Args:
        span: span dict with "size", "font" and optionally "bold" keys
              (PyMuPDF-style span dict — TODO confirm against caller).
        most_common_font_size: dominant body-text font size in the document.
        most_common_color: accepted for interface compatibility; currently
            unused by the heuristic.
        most_common_font: dominant body-text font name (compared
            case-insensitively).

    Returns:
        bool: True if the span looks like a header.
    """
    # Fix: reuse the safe .get() lookup everywhere; the original mixed
    # span.get("font", "") with a bare span["font"], which could raise
    # KeyError on spans lacking a "font" key.
    fontname = span.get("font", "").lower()
    # is_italic = "italic" in fontname or "oblique" in fontname
    is_bold = "bold" in fontname or span.get("bold", False)
    stands_out = (
        span["size"] > most_common_font_size
        or fontname != most_common_font.lower()
    )
    return bool(stands_out and is_bold)
|
|
|
|
| 160 |
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
|
| 161 |
for (p, y) in grouped_dict:
|
| 162 |
if pageNum is not None and p != pageNum:
|
|
|
|
| 653 |
)
|
| 654 |
annot1.update()
|
| 655 |
|
| 656 |
+
# def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
| 657 |
+
# if path is None:
|
| 658 |
+
# path = []
|
| 659 |
+
# if output is None:
|
| 660 |
+
# output = []
|
| 661 |
+
# for header in listtoloop:
|
| 662 |
+
# current_path = path + [header['text']]
|
| 663 |
+
# if not header['children']:
|
| 664 |
+
# if header['level'] != 0 and header['level'] != 1:
|
| 665 |
+
# output.append((header, current_path))
|
| 666 |
+
# else:
|
| 667 |
+
# get_leaf_headers_with_paths(header['children'], current_path, output)
|
| 668 |
+
# return output
|
| 669 |
+
|
| 670 |
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
| 671 |
if path is None:
|
| 672 |
path = []
|
|
|
|
| 743 |
)
|
| 744 |
|
| 745 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 746 |
+
print(hierarchy)
|
| 747 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 748 |
+
print(len(listofHeaderstoMarkup))
|
| 749 |
+
|
| 750 |
# Precompute all children headers once
|
| 751 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 752 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
|
|
| 768 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
| 769 |
|
| 770 |
for heading_to_searchDict, paths in listofHeaderstoMarkup:
|
| 771 |
+
|
| 772 |
heading_to_search = heading_to_searchDict['text']
|
| 773 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 774 |
+
if len(heading_to_searchDict['children'])==0:
|
| 775 |
+
continue
|
| 776 |
+
print(paths,heading_to_search)
|
| 777 |
# Initialize variables
|
| 778 |
headertoContinue1 = False
|
| 779 |
headertoContinue2 = False
|
|
|
|
| 2476 |
combined_json_str = json.dumps(jsonCombined, indent=1)
|
| 2477 |
print(combined_json_str)
|
| 2478 |
return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
|
| 2479 |
+
|
| 2480 |
+
|
| 2481 |
+
|
| 2482 |
+
if __name__ == "__main__":
    # Guard the demo invocation so merely importing this module does not
    # kick off a network download and full PDF extraction run.
    # NOTE(review): a Dropbox share link with dl=0 serves an HTML preview
    # page rather than the raw PDF bytes — if extract_section_under_header
    # expects a direct PDF, this should use dl=1. TODO confirm.
    extract_section_under_header('https://www.dropbox.com/scl/fi/vrqetlyh7a18a7a327nng/4460-NBS-Weybridge-Point-2025-08-21.pdf?rlkey=ocrll9lnbbnbrqc2l4lkrwb89&st=4zm04cyk&dl=0')
|