Marthee committed on
Commit
dca685a
·
verified ·
1 Parent(s): 3d03a5a

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +16 -8
InitialMarkups.py CHANGED
@@ -1570,13 +1570,14 @@ def extract_section_under_header_tobebilled2(pdf_path):
1570
  break_collecting = False
1571
  heading_norm = normalize_text(heading_to_search)
1572
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
1573
-
1574
  for page_num in range(heading_to_searchPageNum,len(doc)):
1575
  print(heading_to_search)
1576
  if paths[0].strip().lower() != currentgroupname.strip().lower():
1577
- Alltexttobebilled+=' \n'+ paths[0] +'\n'
1578
  currentgroupname=paths[0]
1579
  print(paths[0])
 
 
1580
  if page_num in toc_pages:
1581
  continue
1582
  if break_collecting:
@@ -1628,9 +1629,11 @@ def extract_section_under_header_tobebilled2(pdf_path):
1628
  stringtowrite='Not to be billed'
1629
  else:
1630
  stringtowrite='To be billed'
1631
- if stringtowrite!='To be billed':
1632
- Alltexttobebilled+= combined_line_norm #################################################
1633
-
 
 
1634
  # Optimized header matching
1635
  existsfull = (
1636
  ( combined_line_norm in allchildrenheaders_set or
@@ -1664,7 +1667,8 @@ def extract_section_under_header_tobebilled2(pdf_path):
1664
  ]
1665
  if header_spans:
1666
  collecting = True
1667
- Alltexttobebilled+= ' '+ combined_line_norm
 
1668
  matched_header_font_size = max(span["size"] for span in header_spans)
1669
 
1670
  collected_lines.append(line_text)
@@ -1759,7 +1763,10 @@ def extract_section_under_header_tobebilled2(pdf_path):
1759
 
1760
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
1761
  collecting = True
1762
- Alltexttobebilled+= ' '+ combined_line_norm
 
 
 
1763
  matched_header_font_size = max(span["size"] for span in header_spans)
1764
 
1765
  collected_lines.append(line_text)
@@ -1910,6 +1917,7 @@ def extract_section_under_header_tobebilled2(pdf_path):
1910
  pdf_bytes = BytesIO()
1911
  docHighlights.save(pdf_bytes)
1912
 
1913
- return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled
 
1914
 
1915
 
 
1570
  break_collecting = False
1571
  heading_norm = normalize_text(heading_to_search)
1572
  paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else []
 
1573
  for page_num in range(heading_to_searchPageNum,len(doc)):
1574
  print(heading_to_search)
1575
  if paths[0].strip().lower() != currentgroupname.strip().lower():
1576
+ Alltexttobebilled+= paths[0] +'\n'
1577
  currentgroupname=paths[0]
1578
  print(paths[0])
1579
+
1580
+
1581
  if page_num in toc_pages:
1582
  continue
1583
  if break_collecting:
 
1629
  stringtowrite='Not to be billed'
1630
  else:
1631
  stringtowrite='To be billed'
1632
+ if stringtowrite=='To be billed':
1633
+ # Alltexttobebilled+= combined_line_norm #################################################
1634
+ if matched_header_line_norm in combined_line_norm:
1635
+ Alltexttobebilled+='\n'
1636
+ Alltexttobebilled+= ' '+combined_line_norm
1637
  # Optimized header matching
1638
  existsfull = (
1639
  ( combined_line_norm in allchildrenheaders_set or
 
1667
  ]
1668
  if header_spans:
1669
  collecting = True
1670
+ # if stringtowrite=='To be billed':
1671
+ # Alltexttobebilled+='\n'
1672
  matched_header_font_size = max(span["size"] for span in header_spans)
1673
 
1674
  collected_lines.append(line_text)
 
1763
 
1764
  if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ):
1765
  collecting = True
1766
+ if stringtowrite=='To be billed':
1767
+ Alltexttobebilled+='\n'
1768
+ # if stringtowrite=='To be billed':
1769
+ # Alltexttobebilled+= ' '+ combined_line_norm
1770
  matched_header_font_size = max(span["size"] for span in header_spans)
1771
 
1772
  collected_lines.append(line_text)
 
1917
  pdf_bytes = BytesIO()
1918
  docHighlights.save(pdf_bytes)
1919
 
1920
+ return pdf_bytes.getvalue(), docHighlights , json_output, Alltexttobebilled
1921
+
1922
 
1923