Final_Assignment_Template

Runtime error

marcos-banik committed on Jun 21, 2025

Commit

55e8561

1 Parent(s): c66cdec

🚧 extract_page_numbers

Files changed (2) hide show

app.py CHANGED Viewed

@@ -18,6 +18,7 @@ from tools import (
     extract_nth_table_in_wikipedia_section,
     wikipedia_featured_articles_title,
     transcribe_audio_with_whisper,
 )
 # (Keep Constants as is)
@@ -76,6 +77,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 extract_nth_table_in_wikipedia_section,
                 wikipedia_featured_articles_title,
                 transcribe_audio_with_whisper,
             ],
             verbosity_level=2,
             additional_authorized_imports=authorized_imports,

     extract_nth_table_in_wikipedia_section,
     wikipedia_featured_articles_title,
     transcribe_audio_with_whisper,
+    extract_page_numbers,
 )
 # (Keep Constants as is)
                 extract_nth_table_in_wikipedia_section,
                 wikipedia_featured_articles_title,
                 transcribe_audio_with_whisper,
+                extract_page_numbers,
             ],
             verbosity_level=2,
             additional_authorized_imports=authorized_imports,

tools.py CHANGED Viewed

@@ -211,3 +211,32 @@ def transcribe_audio_with_whisper(filename: str) -> str:
     with open(filename, "rb") as f:
         audio_bytes = f.read()
     return asr_pipeline(audio_bytes)["text"]

     with open(filename, "rb") as f:
         audio_bytes = f.read()
     return asr_pipeline(audio_bytes)["text"]
+@tool
+def extract_page_numbers(text: str) -> str:
+    """
+    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.
+    Args:
+        text (str): Input text that may mention "page 1", "pages 10, 20 and 30", etc.
+    Returns:
+        str: A comma delimited list of unique page numbers in ascending order.
+    """
+    matches = re.findall(
+        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:\s+and\s+[0-9]+)?)",
+        text,
+        flags=re.IGNORECASE,
+    )
+    pages = set()
+    for match in matches:
+        for num in re.split(r"(?:,|\band\b)", match):
+            num = num.strip()
+            if num.isdigit():
+                pages.add(int(num))
+    if not pages:
+        return ""
+    return ", ".join(str(p) for p in sorted(pages))