Marthee committed on
Commit
7031d55
·
verified ·
1 Parent(s): 6acfc68

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +78 -52
InitialMarkups.py CHANGED
@@ -145,18 +145,18 @@ def normalize_text(text):
145
  def get_spaced_text_from_spans(spans):
146
  return normalize_text(" ".join(span["text"].strip() for span in spans))
147
 
148
-
149
  def is_header(span, most_common_font_size, most_common_color, most_common_font):
150
  fontname = span.get("font", "").lower()
151
  # is_italic = "italic" in fontname or "oblique" in fontname
152
  is_bold = "bold" in fontname or span.get("bold", False)
153
  return (
154
  (
155
- ( span["size"] > most_common_font_size or
156
- span["font"].lower() != most_common_font.lower()) and
157
- is_bold
158
  )
159
  )
 
160
  def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
161
  for (p, y) in grouped_dict:
162
  if pageNum is not None and p != pageNum:
@@ -293,6 +293,7 @@ def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, mo
293
 
294
  return headers, top_3_font_sizes, smallest_font_size, spans
295
 
 
296
  def is_numbered(text):
297
  return bool(re.match(r'^\d', text.strip()))
298
 
@@ -310,7 +311,35 @@ def clean_toc_entry(toc_text):
310
  # Remove everything after last sequence of dots/whitespace followed by digits
311
  return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
312
 
313
- def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=85):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  # Extract headers with margin handling
315
  headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
316
  doc,
@@ -319,7 +348,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
319
  most_common_color=most_common_color,
320
  most_common_font=most_common_font,
321
  top_margin=top_margin,
322
- bottom_margin=bottom_margin
323
  )
324
 
325
  # Step 1: Collect and filter potential headers
@@ -329,14 +358,15 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
329
  # First extract TOC entries to get exact level 0 header texts
330
  toc_entries = {}
331
  for pno in toc_pages:
332
- page = doc.load_page(pno)
 
333
  toc_text = page.get_text()
334
  for line in toc_text.split('\n'):
335
  clean_line = line.strip()
336
  if clean_line:
337
  norm_line = normalize(clean_line)
338
  toc_entries[norm_line] = clean_line # Store original text
339
-
340
  for h in headers_list:
341
  text, size, pageNum, y = h[:4]
342
  page = doc.load_page(pageNum)
@@ -393,8 +423,9 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
393
  i += 1
394
  # Step 2: Identify level 0 headers (largest and in TOC)
395
  # max_size = max(h['size'] for h in headers) if headers else 0
 
396
  max_size,subheaderSize,nbsheadersize=top_3_font_sizes
397
-
398
  toc_text_match=[]
399
  # Improved TOC matching with exact and substring matching
400
  toc_matches = []
@@ -423,6 +454,7 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
423
  toc_matches.append(h)
424
  toc_text_match.append(h['text'])
425
  elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
 
426
  headers.remove(h)
427
  continue
428
 
@@ -440,7 +472,8 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
440
  # Update the header text with cleaned version
441
  h['text'] = cleaned_text
442
  unique_level0.append(h)
443
-
 
444
  # Step 3: Process headers under each level 0 to identify level 1 format
445
 
446
  # First, group headers by their level 0 parent
@@ -576,7 +609,8 @@ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_co
576
 
577
  enforce_nesting(root)
578
  root = [h for h in root if not (h['level'] == 0 and not h['children'])]
579
- return root
 
580
 
581
  def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
582
  def normalize(text):
@@ -613,6 +647,16 @@ def print_tree_with_numbers(headers, indent=0):
613
  f"(Level {header['level']}, p:{header['page']+1}, {size_info})")
614
  print_tree_with_numbers(header["children"], indent + 1)
615
 
 
 
 
 
 
 
 
 
 
 
616
 
617
  def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
618
  for page_num, bbox in highlights.items():
@@ -653,20 +697,6 @@ def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set you
653
  )
654
  annot1.update()
655
 
656
- # def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
657
- # if path is None:
658
- # path = []
659
- # if output is None:
660
- # output = []
661
- # for header in listtoloop:
662
- # current_path = path + [header['text']]
663
- # if not header['children']:
664
- # if header['level'] != 0 and header['level'] != 1:
665
- # output.append((header, current_path))
666
- # else:
667
- # get_leaf_headers_with_paths(header['children'], current_path, output)
668
- # return output
669
-
670
  def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
671
  if path is None:
672
  path = []
@@ -680,7 +710,6 @@ def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
680
  else:
681
  get_leaf_headers_with_paths(header['children'], current_path, output)
682
  return output
683
-
684
  # Add this helper function at the top of your code
685
  def words_match_ratio(text1, text2):
686
  words1 = set(text1.split())
@@ -743,10 +772,8 @@ def extract_section_under_header(multiplePDF_Paths):
743
  )
744
 
745
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
746
- print(hierarchy)
747
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
748
- print(len(listofHeaderstoMarkup))
749
-
750
  # Precompute all children headers once
751
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
752
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -768,12 +795,9 @@ def extract_section_under_header(multiplePDF_Paths):
768
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
769
 
770
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
771
-
772
  heading_to_search = heading_to_searchDict['text']
773
  heading_to_searchPageNum = heading_to_searchDict['page']
774
- if len(heading_to_searchDict['children'])==0:
775
- continue
776
- print(paths,heading_to_search)
777
  # Initialize variables
778
  headertoContinue1 = False
779
  headertoContinue2 = False
@@ -833,13 +857,12 @@ def extract_section_under_header(multiplePDF_Paths):
833
  combined_line_norm = line_text_norm
834
 
835
  # Check if we should continue processing
836
- # if combined_line_norm and combined_line_norm in paths[0]:
837
 
838
- # headertoContinue1 = combined_line_norm
839
- # if combined_line_norm and combined_line_norm in paths[-2]:
840
 
841
- # headertoContinue2 = combined_line_norm
842
- print('paths[-2].lower()',paths[-2].lower())
843
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
844
  stringtowrite='Not to be billed'
845
  else:
@@ -1267,13 +1290,13 @@ def extract_section_under_header_tobebilledOnly(pdf_path):
1267
  else:
1268
  combined_line_norm = line_text_norm
1269
 
1270
- # # Check if we should continue processing
1271
- # if combined_line_norm and combined_line_norm in paths[0]:
1272
 
1273
- # headertoContinue1 = combined_line_norm
1274
- # if combined_line_norm and combined_line_norm in paths[-2]:
1275
 
1276
- # headertoContinue2 = combined_line_norm
1277
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1278
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
1279
  stringtowrite='Not to be billed'
@@ -1701,12 +1724,12 @@ def extract_section_under_header_tobebilled2(pdf_path):
1701
  combined_line_norm = line_text_norm
1702
 
1703
  # Check if we should continue processing
1704
- # if combined_line_norm and combined_line_norm in paths[0]:
1705
 
1706
- # headertoContinue1 = combined_line_norm
1707
- # if combined_line_norm and combined_line_norm in paths[-2]:
1708
 
1709
- # headertoContinue2 = combined_line_norm
1710
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1711
  last_path = paths[-2].lower()
1712
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
@@ -2154,12 +2177,12 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2154
  combined_line_norm = line_text_norm
2155
 
2156
  # Check if we should continue processing
2157
- # if combined_line_norm and combined_line_norm in paths[0]:
2158
 
2159
- # headertoContinue1 = combined_line_norm
2160
- # if combined_line_norm and combined_line_norm in paths[-2]:
2161
 
2162
- # headertoContinue2 = combined_line_norm
2163
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2164
  last_path = paths[-2].lower()
2165
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
@@ -2476,4 +2499,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2476
  combined_json_str = json.dumps(jsonCombined, indent=1)
2477
  print(combined_json_str)
2478
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2479
-
 
 
 
 
145
def get_spaced_text_from_spans(spans):
    """Join the stripped text of each span with single spaces and normalize.

    Relies on the module-level ``normalize_text`` helper for the final
    whitespace normalization pass.
    """
    pieces = [span["text"].strip() for span in spans]
    return normalize_text(" ".join(pieces))
147
 
 
148
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Decide whether a PDF text span looks like a header.

    A span counts as a header when it is larger than the document's body
    text or is set in a different font family than the body text.

    Parameters
    ----------
    span : dict
        Span dict with at least ``"size"`` and ``"font"`` keys.
    most_common_font_size : float
        The dominant (body-text) font size in the document.
    most_common_color : Any
        Unused; kept so existing call sites keep working.
    most_common_font : str
        The dominant (body-text) font name; compared case-insensitively.

    Returns
    -------
    bool
        True when the span differs from body text in size or font.

    Note: the original expression also OR-ed in
    ``(is_bold and span["size"] > most_common_font_size)``, which is
    subsumed by the first disjunct, so it and the dead ``is_bold`` /
    ``fontname`` locals were removed — behavior is unchanged.
    """
    return (
        span["size"] > most_common_font_size
        or span["font"].lower() != most_common_font.lower()
    )
159
+
160
  def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
161
  for (p, y) in grouped_dict:
162
  if pageNum is not None and p != pageNum:
 
293
 
294
  return headers, top_3_font_sizes, smallest_font_size, spans
295
 
296
+
297
def is_numbered(text):
    """Return True when *text*, after stripping whitespace, starts with a digit."""
    stripped = text.strip()
    return re.match(r'^\d', stripped) is not None
299
 
 
311
  # Remove everything after last sequence of dots/whitespace followed by digits
312
  return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
313
 
314
+
315
+
316
+
317
+
318
def enforce_level_hierarchy(headers):
    """
    Ensure level 2 headers only exist under level 1 headers
    and clean up any orphaned headers.

    Mutates the tree in place and returns it. Children of a removed
    orphan are discarded along with it.
    """
    def prune(nodes, parent_level=-1):
        # Keep a node unless it is a level-2 header whose parent is not
        # level 1; slice-assign so the caller's list object is updated.
        nodes[:] = [n for n in nodes if n['level'] != 2 or parent_level == 1]
        # Descend only into the survivors.
        for child in nodes:
            prune(child['children'], child['level'])

    prune(headers)
    return headers
339
+
340
+
341
+
342
+ def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
343
  # Extract headers with margin handling
344
  headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
345
  doc,
 
348
  most_common_color=most_common_color,
349
  most_common_font=most_common_font,
350
  top_margin=top_margin,
351
+ bottom_margin=50
352
  )
353
 
354
  # Step 1: Collect and filter potential headers
 
358
  # First extract TOC entries to get exact level 0 header texts
359
  toc_entries = {}
360
  for pno in toc_pages:
361
+ print(pno)
362
+ page = doc[pno]
363
  toc_text = page.get_text()
364
  for line in toc_text.split('\n'):
365
  clean_line = line.strip()
366
  if clean_line:
367
  norm_line = normalize(clean_line)
368
  toc_entries[norm_line] = clean_line # Store original text
369
+ print(toc_pages)
370
  for h in headers_list:
371
  text, size, pageNum, y = h[:4]
372
  page = doc.load_page(pageNum)
 
423
  i += 1
424
  # Step 2: Identify level 0 headers (largest and in TOC)
425
  # max_size = max(h['size'] for h in headers) if headers else 0
426
+ print(top_3_font_sizes)
427
  max_size,subheaderSize,nbsheadersize=top_3_font_sizes
428
+ print(max_size)
429
  toc_text_match=[]
430
  # Improved TOC matching with exact and substring matching
431
  toc_matches = []
 
454
  toc_matches.append(h)
455
  toc_text_match.append(h['text'])
456
  elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75:
457
+ print(h['text'],matching_toc_texts)
458
  headers.remove(h)
459
  continue
460
 
 
472
  # Update the header text with cleaned version
473
  h['text'] = cleaned_text
474
  unique_level0.append(h)
475
+ print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")
476
+
477
  # Step 3: Process headers under each level 0 to identify level 1 format
478
 
479
  # First, group headers by their level 0 parent
 
609
 
610
  enforce_nesting(root)
611
  root = [h for h in root if not (h['level'] == 0 and not h['children'])]
612
+ header_tree = enforce_level_hierarchy(root)
613
+ return header_tree
614
 
615
  def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
616
  def normalize(text):
 
647
  f"(Level {header['level']}, p:{header['page']+1}, {size_info})")
648
  print_tree_with_numbers(header["children"], indent + 1)
649
 
650
def process_document_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=50):
    """Build, adjust, number and print the document's header tree.

    Thin orchestration wrapper: builds the hierarchy, re-levels headers
    when the level-0 entries are missing from the TOC, assigns section
    numbers, prints the resulting tree, and returns it.
    """
    print(f"Processing with margins - top:{top_margin}pt, bottom:{bottom_margin}pt")
    tree = build_header_hierarchy(
        doc, toc_pages, most_common_font_size, most_common_color,
        most_common_font, top_margin, bottom_margin,
    )
    adjust_levels_if_level0_not_in_toc(doc, toc_pages, tree)
    print("Assigning numbers...")
    assign_numbers_to_headers(tree)
    print("Document structure (excluding margins):")
    print_tree_with_numbers(tree)
    return tree
659
+
660
 
661
  def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here
662
  for page_num, bbox in highlights.items():
 
697
  )
698
  annot1.update()
699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
701
  if path is None:
702
  path = []
 
710
  else:
711
  get_leaf_headers_with_paths(header['children'], current_path, output)
712
  return output
 
713
  # Add this helper function at the top of your code
714
  def words_match_ratio(text1, text2):
715
  words1 = set(text1.split())
 
772
  )
773
 
774
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
 
775
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
776
+
 
777
  # Precompute all children headers once
778
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
779
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
795
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
796
 
797
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
 
798
  heading_to_search = heading_to_searchDict['text']
799
  heading_to_searchPageNum = heading_to_searchDict['page']
800
+
 
 
801
  # Initialize variables
802
  headertoContinue1 = False
803
  headertoContinue2 = False
 
857
  combined_line_norm = line_text_norm
858
 
859
  # Check if we should continue processing
860
+ if combined_line_norm and combined_line_norm in paths[0]:
861
 
862
+ headertoContinue1 = combined_line_norm
863
+ if combined_line_norm and combined_line_norm in paths[-2]:
864
 
865
+ headertoContinue2 = combined_line_norm
 
866
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
867
  stringtowrite='Not to be billed'
868
  else:
 
1290
  else:
1291
  combined_line_norm = line_text_norm
1292
 
1293
+ # Check if we should continue processing
1294
+ if combined_line_norm and combined_line_norm in paths[0]:
1295
 
1296
+ headertoContinue1 = combined_line_norm
1297
+ if combined_line_norm and combined_line_norm in paths[-2]:
1298
 
1299
+ headertoContinue2 = combined_line_norm
1300
  if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1301
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
1302
  stringtowrite='Not to be billed'
 
1724
  combined_line_norm = line_text_norm
1725
 
1726
  # Check if we should continue processing
1727
+ if combined_line_norm and combined_line_norm in paths[0]:
1728
 
1729
+ headertoContinue1 = combined_line_norm
1730
+ if combined_line_norm and combined_line_norm in paths[-2]:
1731
 
1732
+ headertoContinue2 = combined_line_norm
1733
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
1734
  last_path = paths[-2].lower()
1735
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
 
2177
  combined_line_norm = line_text_norm
2178
 
2179
  # Check if we should continue processing
2180
+ if combined_line_norm and combined_line_norm in paths[0]:
2181
 
2182
+ headertoContinue1 = combined_line_norm
2183
+ if combined_line_norm and combined_line_norm in paths[-2]:
2184
 
2185
+ headertoContinue2 = combined_line_norm
2186
  # if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() :
2187
  last_path = paths[-2].lower()
2188
  # if any(word in paths[-2].lower() for word in keywordstoSkip):
 
2499
  combined_json_str = json.dumps(jsonCombined, indent=1)
2500
  print(combined_json_str)
2501
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2502
+
2503
+
2504
+
2505
+