Spaces:
Runtime error
Runtime error
Commit
·
55e8561
1
Parent(s):
c66cdec
🚧 extract_page_numbers
Browse files
app.py
CHANGED
|
@@ -18,6 +18,7 @@ from tools import (
|
|
| 18 |
extract_nth_table_in_wikipedia_section,
|
| 19 |
wikipedia_featured_articles_title,
|
| 20 |
transcribe_audio_with_whisper,
|
|
|
|
| 21 |
)
|
| 22 |
|
| 23 |
# (Keep Constants as is)
|
|
@@ -76,6 +77,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 76 |
extract_nth_table_in_wikipedia_section,
|
| 77 |
wikipedia_featured_articles_title,
|
| 78 |
transcribe_audio_with_whisper,
|
|
|
|
| 79 |
],
|
| 80 |
verbosity_level=2,
|
| 81 |
additional_authorized_imports=authorized_imports,
|
|
|
|
| 18 |
extract_nth_table_in_wikipedia_section,
|
| 19 |
wikipedia_featured_articles_title,
|
| 20 |
transcribe_audio_with_whisper,
|
| 21 |
+
extract_page_numbers,
|
| 22 |
)
|
| 23 |
|
| 24 |
# (Keep Constants as is)
|
|
|
|
| 77 |
extract_nth_table_in_wikipedia_section,
|
| 78 |
wikipedia_featured_articles_title,
|
| 79 |
transcribe_audio_with_whisper,
|
| 80 |
+
extract_page_numbers,
|
| 81 |
],
|
| 82 |
verbosity_level=2,
|
| 83 |
additional_authorized_imports=authorized_imports,
|
tools.py
CHANGED
|
@@ -211,3 +211,32 @@ def transcribe_audio_with_whisper(filename: str) -> str:
|
|
| 211 |
with open(filename, "rb") as f:
|
| 212 |
audio_bytes = f.read()
|
| 213 |
return asr_pipeline(audio_bytes)["text"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
with open(filename, "rb") as f:
|
| 212 |
audio_bytes = f.read()
|
| 213 |
return asr_pipeline(audio_bytes)["text"]
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
@tool
|
| 217 |
+
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.

    A reference is the word "page"/"pages" (case-insensitive) followed by one or
    more integers separated by ",", "and", or ", and" (Oxford comma). Numbers
    that do not directly follow such a keyword are ignored.

    Args:
        text (str): Input text that may mention "page 1", "pages 10, 20 and 30", "pages 4, 5, and 6", etc.

    Returns:
        str: A comma delimited list of unique page numbers in ascending order, or an empty string when none are found.
    """
    # Nothing to scan: same result as "no page numbers found".
    if not text:
        return ""

    # One number, then any count of "<separator><number>" where the separator is
    # ",", ", and" or " and". Accepting ", and" keeps the final item of an
    # Oxford-comma list ("pages 10, 20, and 30") from being dropped.
    page_list_pattern = re.compile(
        r"\bpages?\s+("
        r"[0-9]+"
        r"(?:(?:\s*,\s*(?:and\s+)?|\s+and\s+)[0-9]+)*"
        r")",
        flags=re.IGNORECASE,
    )

    # Each captured list contains only digits and separators, so pulling every
    # digit run out of it yields exactly the referenced page numbers.
    pages = {
        int(number)
        for page_list in page_list_pattern.findall(text)
        for number in re.findall(r"[0-9]+", page_list)
    }

    # The set removes duplicates; sorting gives ascending numeric order.
    return ", ".join(str(page) for page in sorted(pages))
|