Marthee committed on
Commit
4221eea
·
verified ·
1 Parent(s): 9e1773c

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +16 -3
InitialMarkups.py CHANGED
@@ -997,12 +997,25 @@ def extract_section_under_header(multiplePDF_Paths):
997
  i += 2
998
  continue
999
  if collecting:
 
 
 
 
1000
  try:
1001
- if dateparse(line_text, fuzzy=True):
1002
- collecting=False
1003
- break
 
 
 
 
 
 
 
1004
  except (ParserError, ValueError, OverflowError):
 
1005
  pass
 
1006
  norm_line = normalize_text(line_text)
1007
 
1008
  # Optimized URL check
 
997
  i += 2
998
  continue
999
  if collecting:
1000
+ # ----------------------------------------------------
1001
+ # ADD THIS BLOCK IN ITS PLACE
1002
+ # ----------------------------------------------------
1003
+ # NEW: Check if the line is a date, and if so, stop collecting
1004
  try:
1005
+ # Parse 'line_text'; dateparse raises (caught below) when the line contains no date
1006
+ dateparse(line_text, fuzzy=True)
1007
+
1008
+ # --- Date Found: Stop Collecting ---
1009
+ print(f"🛑 Stop at date: '{line_text}'")
1010
+ collecting = False
1011
+ done = True # Mark this header as finished
1012
+ break_collecting = True # Signal outer loops to stop
1013
+ break # Break this 'while' loop
1014
+
1015
  except (ParserError, ValueError, OverflowError):
1016
+ # No date found, continue normally to process this line
1017
  pass
1018
+ # ----------------------------------------------------
1019
  norm_line = normalize_text(line_text)
1020
 
1021
  # Optimized URL check