iammraat committed on
Commit
03ee172
·
verified ·
1 Parent(s): 1016bb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -2
app.py CHANGED
@@ -29,9 +29,17 @@ def remove_page_numbers(text):
29
  def clean_sentence_start(sentence):
30
  """
31
  Removes numbering like "1.", "2)", "3.4" from the start of a sentence.
 
32
  """
33
- # Regex: Start of line, optional space, digits/letters, dot/paren, space
34
- return re.sub(r'^\s*(?:\d+|[a-zA-Z])[\.\)]\s+', '', sentence)
 
 
 
 
 
 
 
35
 
36
  def clean_text_block(raw_text):
37
  """
 
29
  def clean_sentence_start(sentence):
30
  """
31
  Removes numbering like "1.", "2)", "3.4" from the start of a sentence.
32
+ Also removes standalone footnote numbers (e.g. "1 The...")
33
  """
34
+ # Regex 1: Start of line, optional space, digits/letters, dot/paren, space
35
+ # Matches "1.", "2)", "A.", "b)"
36
+ sentence = re.sub(r'^\s*(?:\d+|[a-zA-Z])[\.\)]\s+', '', sentence)
37
+
38
+ # Regex 2: Standalone numbers (1-3 digits) followed by space and Capital letter
39
+ # Matches "1 The", "12 It", but avoids years like "1999 The" (4 digits)
40
+ sentence = re.sub(r'^\s*\d{1,3}\s+(?=[A-Z])', '', sentence)
41
+
42
+ return sentence
43
 
44
  def clean_text_block(raw_text):
45
  """