Marthee committed on
Commit
afa2728
·
verified ·
1 Parent(s): 1e83b54

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +29 -7
InitialMarkups.py CHANGED
@@ -145,18 +145,18 @@ def normalize_text(text):
145
  def get_spaced_text_from_spans(spans):
146
  return normalize_text(" ".join(span["text"].strip() for span in spans))
147
 
 
148
  def is_header(span, most_common_font_size, most_common_color, most_common_font):
149
  fontname = span.get("font", "").lower()
150
  # is_italic = "italic" in fontname or "oblique" in fontname
151
  is_bold = "bold" in fontname or span.get("bold", False)
152
  return (
153
  (
154
- span["size"] > most_common_font_size or
155
- span["font"].lower() != most_common_font.lower() or
156
- (is_bold and span["size"] > most_common_font_size )
157
  )
158
  )
159
-
160
  def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
161
  for (p, y) in grouped_dict:
162
  if pageNum is not None and p != pageNum:
@@ -653,6 +653,20 @@ def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set you
653
  )
654
  annot1.update()
655
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
657
  if path is None:
658
  path = []
@@ -729,8 +743,10 @@ def extract_section_under_header(multiplePDF_Paths):
729
  )
730
 
731
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
 
732
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
733
-
 
734
  # Precompute all children headers once
735
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
736
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
@@ -752,9 +768,12 @@ def extract_section_under_header(multiplePDF_Paths):
752
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
753
 
754
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
 
755
  heading_to_search = heading_to_searchDict['text']
756
  heading_to_searchPageNum = heading_to_searchDict['page']
757
-
 
 
758
  # Initialize variables
759
  headertoContinue1 = False
760
  headertoContinue2 = False
@@ -2457,4 +2476,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths):
2457
  combined_json_str = json.dumps(jsonCombined, indent=1)
2458
  print(combined_json_str)
2459
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2460
-
 
 
 
 
145
  def get_spaced_text_from_spans(spans):
146
  return normalize_text(" ".join(span["text"].strip() for span in spans))
147
 
148
+
149
  def is_header(span, most_common_font_size, most_common_color, most_common_font):
150
  fontname = span.get("font", "").lower()
151
  # is_italic = "italic" in fontname or "oblique" in fontname
152
  is_bold = "bold" in fontname or span.get("bold", False)
153
  return (
154
  (
155
+ ( span["size"] > most_common_font_size or
156
+ span["font"].lower() != most_common_font.lower()) and
157
+ is_bold
158
  )
159
  )
 
160
  def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
161
  for (p, y) in grouped_dict:
162
  if pageNum is not None and p != pageNum:
 
653
  )
654
  annot1.update()
655
 
656
+ # def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
657
+ # if path is None:
658
+ # path = []
659
+ # if output is None:
660
+ # output = []
661
+ # for header in listtoloop:
662
+ # current_path = path + [header['text']]
663
+ # if not header['children']:
664
+ # if header['level'] != 0 and header['level'] != 1:
665
+ # output.append((header, current_path))
666
+ # else:
667
+ # get_leaf_headers_with_paths(header['children'], current_path, output)
668
+ # return output
669
+
670
  def get_leaf_headers_with_paths(listtoloop, path=None, output=None):
671
  if path is None:
672
  path = []
 
743
  )
744
 
745
  hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
746
+ print(hierarchy)
747
  listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy)
748
+ print(len(listofHeaderstoMarkup))
749
+
750
  # Precompute all children headers once
751
  allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup]
752
  allchildrenheaders_set = set(allchildrenheaders) # For faster lookups
 
768
  # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages]
769
 
770
  for heading_to_searchDict, paths in listofHeaderstoMarkup:
771
+
772
  heading_to_search = heading_to_searchDict['text']
773
  heading_to_searchPageNum = heading_to_searchDict['page']
774
+ if len(heading_to_searchDict['children'])==0:
775
+ continue
776
+ print(paths,heading_to_search)
777
  # Initialize variables
778
  headertoContinue1 = False
779
  headertoContinue2 = False
 
2476
  combined_json_str = json.dumps(jsonCombined, indent=1)
2477
  print(combined_json_str)
2478
  return pdf_bytes.getvalue(), docHighlights , combined_json_str, Alltexttobebilled , filenames
2479
+
2480
+
2481
+
2482
+ extract_section_under_header('https://www.dropbox.com/scl/fi/vrqetlyh7a18a7a327nng/4460-NBS-Weybridge-Point-2025-08-21.pdf?rlkey=ocrll9lnbbnbrqc2l4lkrwb89&st=4zm04cyk&dl=0')