marcos-banik committed on
Commit
55e8561
·
1 Parent(s): c66cdec

🚧 extract_page_numbers

Browse files
Files changed (2) hide show
  1. app.py +2 -0
  2. tools.py +29 -0
app.py CHANGED
@@ -18,6 +18,7 @@ from tools import (
18
  extract_nth_table_in_wikipedia_section,
19
  wikipedia_featured_articles_title,
20
  transcribe_audio_with_whisper,
 
21
  )
22
 
23
  # (Keep Constants as is)
@@ -76,6 +77,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
76
  extract_nth_table_in_wikipedia_section,
77
  wikipedia_featured_articles_title,
78
  transcribe_audio_with_whisper,
 
79
  ],
80
  verbosity_level=2,
81
  additional_authorized_imports=authorized_imports,
 
18
  extract_nth_table_in_wikipedia_section,
19
  wikipedia_featured_articles_title,
20
  transcribe_audio_with_whisper,
21
+ extract_page_numbers,
22
  )
23
 
24
  # (Keep Constants as is)
 
77
  extract_nth_table_in_wikipedia_section,
78
  wikipedia_featured_articles_title,
79
  transcribe_audio_with_whisper,
80
+ extract_page_numbers,
81
  ],
82
  verbosity_level=2,
83
  additional_authorized_imports=authorized_imports,
tools.py CHANGED
@@ -211,3 +211,32 @@ def transcribe_audio_with_whisper(filename: str) -> str:
211
  with open(filename, "rb") as f:
212
  audio_bytes = f.read()
213
  return asr_pipeline(audio_bytes)["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  with open(filename, "rb") as f:
212
  audio_bytes = f.read()
213
  return asr_pipeline(audio_bytes)["text"]
214
+
215
+
216
+ @tool
217
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.

    Matching is case-insensitive for both the 'page'/'pages' keyword and the
    'and' connector, and the final item of a list may be preceded by an
    Oxford comma (e.g. "pages 1, 2, and 3").

    Args:
        text (str): Input text that may mention "page 1", "pages 10, 20 and 30", etc.

    Returns:
        str: A comma delimited list of unique page numbers in ascending order, or an empty string if no page reference is found.
    """
    # Capture the whole number list that follows "page"/"pages": a first
    # number, any ", N" continuations, and an optional trailing "and N"
    # (optionally preceded by a comma).
    matches = re.findall(
        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:(?:\s*,)?\s+and\s+[0-9]+)?)",
        text,
        flags=re.IGNORECASE,
    )

    # Pull the digit runs straight out of each captured span instead of
    # splitting on separators: splitting on a lowercase "and" silently
    # dropped spans such as "10 AND 20" that the case-insensitive match
    # above had accepted.
    pages = {
        int(number)
        for span in matches
        for number in re.findall(r"[0-9]+", span)
    }

    # Joining an empty set yields "", which is the "nothing found" result.
    return ", ".join(str(page) for page in sorted(pages))