Spaces:
Runtime error
Runtime error
Update InitialMarkups.py
Browse files- InitialMarkups.py +78 -52
InitialMarkups.py
CHANGED
|
@@ -145,18 +145,18 @@ def normalize_text(text):
|
|
| 145 |
def get_spaced_text_from_spans(spans):
    """Join the stripped text of each span with single spaces and normalize it.

    Args:
        spans: Iterable of mappings, each providing a ``"text"`` string.

    Returns:
        Whatever ``normalize_text`` returns for the space-joined text.
    """
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
|
| 147 |
|
| 148 |
-
|
| 149 |
def is_header(span, most_common_font_size, most_common_color, most_common_font):
|
| 150 |
fontname = span.get("font", "").lower()
|
| 151 |
# is_italic = "italic" in fontname or "oblique" in fontname
|
| 152 |
is_bold = "bold" in fontname or span.get("bold", False)
|
| 153 |
return (
|
| 154 |
(
|
| 155 |
-
|
| 156 |
-
span["font"].lower() != most_common_font.lower()
|
| 157 |
-
is_bold
|
| 158 |
)
|
| 159 |
)
|
|
|
|
| 160 |
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
|
| 161 |
for (p, y) in grouped_dict:
|
| 162 |
if pageNum is not None and p != pageNum:
|
|
@@ -293,6 +293,7 @@ def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, mo
|
|
| 293 |
|
| 294 |
return headers, top_3_font_sizes, smallest_font_size, spans
|
| 295 |
|
|
|
|
| 296 |
def is_numbered(text):
    """Return True when ``text``, ignoring surrounding whitespace, starts with a digit."""
    leading_digit = re.match(r'^\d', text.strip())
    return leading_digit is not None
|
| 298 |
|
|
@@ -310,7 +311,35 @@ def clean_toc_entry(toc_text):
|
|
| 310 |
# Remove everything after last sequence of dots/whitespace followed by digits
|
| 311 |
return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
|
| 312 |
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
# Extract headers with margin handling
|
| 315 |
headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
|
| 316 |
doc,
|
|
@@ -319,7 +348,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
|
|
| 319 |
most_common_color=most_common_color,
|
| 320 |
most_common_font=most_common_font,
|
| 321 |
top_margin=top_margin,
|
| 322 |
-
bottom_margin=
|
| 323 |
)
|
| 324 |
|
| 325 |
# Step 1: Collect and filter potential headers
|
|
@@ -329,14 +358,15 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
|
|
| 329 |
# First extract TOC entries to get exact level 0 header texts
|
| 330 |
toc_entries = {}
|
| 331 |
for pno in toc_pages:
|
| 332 |
-
|
|
|
|
| 333 |
toc_text = page.get_text()
|
| 334 |
for line in toc_text.split('\n'):
|
| 335 |
clean_line = line.strip()
|
| 336 |
if clean_line:
|
| 337 |
norm_line = normalize(clean_line)
|
| 338 |
toc_entries[norm_line] = clean_line # Store original text
|
| 339 |
-
|
| 340 |
for h in headers_list:
|
| 341 |
text, size, pageNum, y = h[:4]
|
| 342 |
page = doc.load_page(pageNum)
|
|
@@ -393,8 +423,9 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
|
|
| 393 |
i += 1
|
| 394 |
# Step 2: Identify level 0 headers (largest and in TOC)
|
| 395 |
# max_size = max(h['size'] for h in headers) if headers else 0
|
|
|
|
| 396 |
max_size,subheaderSize,nbsheadersize=top_3_font_sizes
|
| 397 |
-
|
| 398 |
toc_text_match=[]
|
| 399 |
# Improved TOC matching with exact and substring matching
|
| 400 |
toc_matches = []
|
|
@@ -423,6 +454,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
|
|
| 423 |
toc_matches.append(h)
|
| 424 |
toc_text_match.append(h['text'])
|
| 425 |
elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
|
|
|
|
| 426 |
headers.remove(h)
|
| 427 |
continue
|
| 428 |
|
|
@@ -440,7 +472,8 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
|
|
| 440 |
# Update the header text with cleaned version
|
| 441 |
h['text'] = cleaned_text
|
| 442 |
unique_level0.append(h)
|
| 443 |
-
|
|
|
|
| 444 |
# Step 3: Process headers under each level 0 to identify level 1 format
|
| 445 |
|
| 446 |
# First, group headers by their level 0 parent
|
|
@@ -576,7 +609,8 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
|
|
| 576 |
|
| 577 |
enforce_nesting(root)
|
| 578 |
root = [h for h in root if not (h['level'] == 0 and not h['children'])]
|
| 579 |
-
|
|
|
|
| 580 |
|
| 581 |
def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
|
| 582 |
def normalize(text):
|
|
@@ -613,6 +647,16 @@ def print_tree_with_numbers(headers, indent=0):
|
|
| 613 |
f"(Level {header['level']}, p:{header['page']+1}, {size_info})")
|
| 614 |
print_tree_with_numbers(header["children"], indent + 1)
|
| 615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
|
| 617 |
def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
|
| 618 |
for page_num, bbox in highlights.items():
|
|
@@ -653,20 +697,6 @@ def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set you
|
|
| 653 |
)
|
| 654 |
annot1.update()
|
| 655 |
|
| 656 |
-
# def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
| 657 |
-
# if path is None:
|
| 658 |
-
# path = []
|
| 659 |
-
# if output is None:
|
| 660 |
-
# output = []
|
| 661 |
-
# for header in listtoloop:
|
| 662 |
-
# current_path = path + [header['text']]
|
| 663 |
-
# if not header['children']:
|
| 664 |
-
# if header['level'] != 0 and header['level'] != 1:
|
| 665 |
-
# output.append((header, current_path))
|
| 666 |
-
# else:
|
| 667 |
-
# get_leaf_headers_with_paths(header['children'], current_path, output)
|
| 668 |
-
# return output
|
| 669 |
-
|
| 670 |
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
| 671 |
if path is None:
|
| 672 |
path = []
|
|
@@ -680,7 +710,6 @@ def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
|
| 680 |
else:
|
| 681 |
get_leaf_headers_with_paths(header['children'], current_path, output)
|
| 682 |
return output
|
| 683 |
-
|
| 684 |
# Add this helper function at the top of your code
|
| 685 |
def words_match_ratio(text1, text2):
|
| 686 |
words1 = set(text1.split())
|
|
@@ -743,10 +772,8 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 743 |
)
|
| 744 |
|
| 745 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
| 746 |
-
print(hierarchy)
|
| 747 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 748 |
-
|
| 749 |
-
|
| 750 |
# Precompute all children headers once
|
| 751 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 752 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
@@ -768,12 +795,9 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 768 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
| 769 |
|
| 770 |
for heading_to_searchDict, paths in listofHeaderstoMarkup:
|
| 771 |
-
|
| 772 |
heading_to_search = heading_to_searchDict['text']
|
| 773 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 774 |
-
|
| 775 |
-
continue
|
| 776 |
-
print(paths,heading_to_search)
|
| 777 |
# Initialize variables
|
| 778 |
headertoContinue1 = False
|
| 779 |
headertoContinue2 = False
|
|
@@ -833,13 +857,12 @@ def extract_section_under_header(multiplePDF_Paths):
|
|
| 833 |
combined_line_norm = line_text_norm
|
| 834 |
|
| 835 |
# Check if we should continue processing
|
| 836 |
-
|
| 837 |
|
| 838 |
-
|
| 839 |
-
|
| 840 |
|
| 841 |
-
|
| 842 |
-
print('paths[-2].lower()',paths[-2].lower())
|
| 843 |
if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 844 |
stringtowrite='Not to be billed'
|
| 845 |
else:
|
|
@@ -1267,13 +1290,13 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
|
|
| 1267 |
else:
|
| 1268 |
combined_line_norm = line_text_norm
|
| 1269 |
|
| 1270 |
-
#
|
| 1271 |
-
|
| 1272 |
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
|
| 1276 |
-
|
| 1277 |
if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 1278 |
# if any(word in paths[-2].lower() for word in keywordstoSkip):
|
| 1279 |
stringtowrite='Not to be billed'
|
|
@@ -1701,12 +1724,12 @@ def extract_section_under_header_tobebilled2(pdf_path):
|
|
| 1701 |
combined_line_norm = line_text_norm
|
| 1702 |
|
| 1703 |
# Check if we should continue processing
|
| 1704 |
-
|
| 1705 |
|
| 1706 |
-
|
| 1707 |
-
|
| 1708 |
|
| 1709 |
-
|
| 1710 |
# if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 1711 |
last_path = paths[-2].lower()
|
| 1712 |
# if any(word in paths[-2].lower() for word in keywordstoSkip):
|
|
@@ -2154,12 +2177,12 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
|
|
| 2154 |
combined_line_norm = line_text_norm
|
| 2155 |
|
| 2156 |
# Check if we should continue processing
|
| 2157 |
-
|
| 2158 |
|
| 2159 |
-
|
| 2160 |
-
|
| 2161 |
|
| 2162 |
-
|
| 2163 |
# if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 2164 |
last_path = paths[-2].lower()
|
| 2165 |
# if any(word in paths[-2].lower() for word in keywordstoSkip):
|
|
@@ -2476,4 +2499,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
|
|
| 2476 |
combined_json_str = json.dumps(jsonCombined, indent=1)
|
| 2477 |
print(combined_json_str)
|
| 2478 |
return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
|
| 2479 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
def get_spaced_text_from_spans(spans):
    """Join the stripped text of each span with single spaces and normalize it.

    Args:
        spans: Iterable of mappings, each providing a ``"text"`` string.

    Returns:
        Whatever ``normalize_text`` returns for the space-joined text.
    """
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
|
| 147 |
|
|
|
|
| 148 |
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Decide whether a text span looks like a header rather than body text.

    A span qualifies when it is larger than the most common font size, or
    when its font name differs (case-insensitively) from the most common
    font.

    Args:
        span: Mapping describing one text span. ``"size"`` is required;
            ``"font"`` is optional and a missing font is treated as ``""``.
        most_common_font_size: Font size regarded as body text.
        most_common_color: Accepted for call compatibility; not used here.
        most_common_font: Font name regarded as body text.

    Returns:
        bool: True when the span differs from body text by size or by font.

    Raises:
        KeyError: If ``span`` has no ``"size"`` entry.
    """
    # Read the font once, with a default, and reuse it for the comparison so
    # a span without a "font" key is handled instead of raising KeyError.
    fontname = span.get("font", "").lower()
    is_larger = span["size"] > most_common_font_size
    uses_other_font = fontname != most_common_font.lower()
    # No separate "bold and larger" test is needed: any span that is larger
    # than the body size already satisfies is_larger.
    return bool(is_larger or uses_other_font)
|
| 159 |
+
|
| 160 |
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
|
| 161 |
for (p, y) in grouped_dict:
|
| 162 |
if pageNum is not None and p != pageNum:
|
|
|
|
| 293 |
|
| 294 |
return headers, top_3_font_sizes, smallest_font_size, spans
|
| 295 |
|
| 296 |
+
|
| 297 |
def is_numbered(text):
    """Return True when ``text``, ignoring surrounding whitespace, starts with a digit."""
    leading_digit = re.match(r'^\d', text.strip())
    return leading_digit is not None
|
| 299 |
|
|
|
|
| 311 |
# Remove everything after last sequence of dots/whitespace followed by digits
|
| 312 |
return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
|
| 313 |
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def enforce_level_hierarchy(headers):
    """
    Prune level 2 headers that are not direct children of a level 1 header.

    The tree is edited in place: the top-level list and every node's
    'children' list keep only the nodes allowed under their parent. A
    removed node takes its whole subtree with it.

    Args:
        headers: List of header dicts, each with 'level' and 'children' keys.

    Returns:
        The same ``headers`` list object, after pruning.
    """
    def _prune(siblings, level_above=-1):
        survivors = []
        for entry in siblings:
            # An orphaned level 2 header is dropped together with its
            # subtree, so its children are never visited.
            if entry['level'] == 2 and level_above != 1:
                continue
            _prune(entry['children'], entry['level'])
            survivors.append(entry)
        # Slice-assign so every existing reference to this list sees the
        # pruned contents.
        siblings[:] = survivors

    _prune(headers)
    return headers
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
|
| 343 |
# Extract headers with margin handling
|
| 344 |
headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
|
| 345 |
doc,
|
|
|
|
| 348 |
most_common_color=most_common_color,
|
| 349 |
most_common_font=most_common_font,
|
| 350 |
top_margin=top_margin,
|
| 351 |
+
bottom_margin=50
|
| 352 |
)
|
| 353 |
|
| 354 |
# Step 1: Collect and filter potential headers
|
|
|
|
| 358 |
# First extract TOC entries to get exact level 0 header texts
|
| 359 |
toc_entries = {}
|
| 360 |
for pno in toc_pages:
|
| 361 |
+
print(pno)
|
| 362 |
+
page = doc[pno]
|
| 363 |
toc_text = page.get_text()
|
| 364 |
for line in toc_text.split('\n'):
|
| 365 |
clean_line = line.strip()
|
| 366 |
if clean_line:
|
| 367 |
norm_line = normalize(clean_line)
|
| 368 |
toc_entries[norm_line] = clean_line # Store original text
|
| 369 |
+
print(toc_pages)
|
| 370 |
for h in headers_list:
|
| 371 |
text, size, pageNum, y = h[:4]
|
| 372 |
page = doc.load_page(pageNum)
|
|
|
|
| 423 |
i += 1
|
| 424 |
# Step 2: Identify level 0 headers (largest and in TOC)
|
| 425 |
# max_size = max(h['size'] for h in headers) if headers else 0
|
| 426 |
+
print(top_3_font_sizes)
|
| 427 |
max_size,subheaderSize,nbsheadersize=top_3_font_sizes
|
| 428 |
+
print(max_size)
|
| 429 |
toc_text_match=[]
|
| 430 |
# Improved TOC matching with exact and substring matching
|
| 431 |
toc_matches = []
|
|
|
|
| 454 |
toc_matches.append(h)
|
| 455 |
toc_text_match.append(h['text'])
|
| 456 |
elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
|
| 457 |
+
print(h['text'],matching_toc_texts)
|
| 458 |
headers.remove(h)
|
| 459 |
continue
|
| 460 |
|
|
|
|
| 472 |
# Update the header text with cleaned version
|
| 473 |
h['text'] = cleaned_text
|
| 474 |
unique_level0.append(h)
|
| 475 |
+
print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")
|
| 476 |
+
|
| 477 |
# Step 3: Process headers under each level 0 to identify level 1 format
|
| 478 |
|
| 479 |
# First, group headers by their level 0 parent
|
|
|
|
| 609 |
|
| 610 |
enforce_nesting(root)
|
| 611 |
root = [h for h in root if not (h['level'] == 0 and not h['children'])]
|
| 612 |
+
header_tree = enforce_level_hierarchy(root)
|
| 613 |
+
return header_tree
|
| 614 |
|
| 615 |
def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
|
| 616 |
def normalize(text):
|
|
|
|
| 647 |
f"(Level {header['level']}, p:{header['page']+1}, {size_info})")
|
| 648 |
print_tree_with_numbers(header["children"], indent + 1)
|
| 649 |
|
| 650 |
+
def process_document_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=50):
    """Build the header tree for a document, then re-level, number and print it.

    Steps, in order: build the hierarchy with ``build_header_hierarchy``,
    pass it to ``adjust_levels_if_level0_not_in_toc`` (its return value is
    ignored), pass it to ``assign_numbers_to_headers``, and print it with
    ``print_tree_with_numbers``. Progress messages go to stdout.

    Args:
        doc: Document object forwarded to the helper functions.
        toc_pages: Table-of-contents pages, forwarded to the helper functions.
        most_common_font_size: Forwarded to ``build_header_hierarchy``.
        most_common_color: Forwarded to ``build_header_hierarchy``.
        most_common_font: Forwarded to ``build_header_hierarchy``.
        top_margin: Top margin forwarded to ``build_header_hierarchy``.
        bottom_margin: Bottom margin forwarded to ``build_header_hierarchy``.
            NOTE(review): the visible part of ``build_header_hierarchy``
            appears to pass a literal ``bottom_margin=50`` to
            ``extract_headers``, so this value may not take effect; confirm.

    Returns:
        The header tree object returned by ``build_header_hierarchy``.
    """
    print(f"Processing with margins - top:{top_margin}pt, bottom:{bottom_margin}pt")
    tree = build_header_hierarchy(
        doc,
        toc_pages,
        most_common_font_size,
        most_common_color,
        most_common_font,
        top_margin=top_margin,
        bottom_margin=bottom_margin,
    )
    adjust_levels_if_level0_not_in_toc(doc, toc_pages, tree)

    print("Assigning numbers...")
    assign_numbers_to_headers(tree)

    print("Document structure (excluding margins):")
    print_tree_with_numbers(tree)
    return tree
|
| 659 |
+
|
| 660 |
|
| 661 |
def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
|
| 662 |
for page_num, bbox in highlights.items():
|
|
|
|
| 697 |
)
|
| 698 |
annot1.update()
|
| 699 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
|
| 701 |
if path is None:
|
| 702 |
path = []
|
|
|
|
| 710 |
else:
|
| 711 |
get_leaf_headers_with_paths(header['children'], current_path, output)
|
| 712 |
return output
|
|
|
|
| 713 |
# Add this helper function at the top of your code
|
| 714 |
def words_match_ratio(text1, text2):
|
| 715 |
words1 = set(text1.split())
|
|
|
|
| 772 |
)
|
| 773 |
|
| 774 |
hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
|
|
|
|
| 775 |
listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
|
| 776 |
+
|
|
|
|
| 777 |
# Precompute all children headers once
|
| 778 |
allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
|
| 779 |
allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
|
|
|
|
| 795 |
# pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
|
| 796 |
|
| 797 |
for heading_to_searchDict, paths in listofHeaderstoMarkup:
|
|
|
|
| 798 |
heading_to_search = heading_to_searchDict['text']
|
| 799 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 800 |
+
|
|
|
|
|
|
|
| 801 |
# Initialize variables
|
| 802 |
headertoContinue1 = False
|
| 803 |
headertoContinue2 = False
|
|
|
|
| 857 |
combined_line_norm = line_text_norm
|
| 858 |
|
| 859 |
# Check if we should continue processing
|
| 860 |
+
if combined_line_norm and combined_line_norm in paths[0]:
|
| 861 |
|
| 862 |
+
headertoContinue1 = combined_line_norm
|
| 863 |
+
if combined_line_norm and combined_line_norm in paths[-2]:
|
| 864 |
|
| 865 |
+
headertoContinue2 = combined_line_norm
|
|
|
|
| 866 |
if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 867 |
stringtowrite='Not to be billed'
|
| 868 |
else:
|
|
|
|
| 1290 |
else:
|
| 1291 |
combined_line_norm = line_text_norm
|
| 1292 |
|
| 1293 |
+
# Check if we should continue processing
|
| 1294 |
+
if combined_line_norm and combined_line_norm in paths[0]:
|
| 1295 |
|
| 1296 |
+
headertoContinue1 = combined_line_norm
|
| 1297 |
+
if combined_line_norm and combined_line_norm in paths[-2]:
|
| 1298 |
|
| 1299 |
+
headertoContinue2 = combined_line_norm
|
| 1300 |
if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 1301 |
# if any(word in paths[-2].lower() for word in keywordstoSkip):
|
| 1302 |
stringtowrite='Not to be billed'
|
|
|
|
| 1724 |
combined_line_norm = line_text_norm
|
| 1725 |
|
| 1726 |
# Check if we should continue processing
|
| 1727 |
+
if combined_line_norm and combined_line_norm in paths[0]:
|
| 1728 |
|
| 1729 |
+
headertoContinue1 = combined_line_norm
|
| 1730 |
+
if combined_line_norm and combined_line_norm in paths[-2]:
|
| 1731 |
|
| 1732 |
+
headertoContinue2 = combined_line_norm
|
| 1733 |
# if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 1734 |
last_path = paths[-2].lower()
|
| 1735 |
# if any(word in paths[-2].lower() for word in keywordstoSkip):
|
|
|
|
| 2177 |
combined_line_norm = line_text_norm
|
| 2178 |
|
| 2179 |
# Check if we should continue processing
|
| 2180 |
+
if combined_line_norm and combined_line_norm in paths[0]:
|
| 2181 |
|
| 2182 |
+
headertoContinue1 = combined_line_norm
|
| 2183 |
+
if combined_line_norm and combined_line_norm in paths[-2]:
|
| 2184 |
|
| 2185 |
+
headertoContinue2 = combined_line_norm
|
| 2186 |
# if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
|
| 2187 |
last_path = paths[-2].lower()
|
| 2188 |
# if any(word in paths[-2].lower() for word in keywordstoSkip):
|
|
|
|
| 2499 |
combined_json_str = json.dumps(jsonCombined, indent=1)
|
| 2500 |
print(combined_json_str)
|
| 2501 |
return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
|
| 2502 |
+
|
| 2503 |
+
|
| 2504 |
+
|
| 2505 |
+
|