Final_Assignment_Template

Runtime error

marcos-banik committed on Jun 21, 2025

Commit

79818ec

1 Parent(s): b04256b

🚧 extract_text_from_pdf

Files changed (3) hide show

app.py CHANGED Viewed

@@ -21,6 +21,7 @@ from tools import (
     extract_page_numbers,
     fetch_raw_html,
     extract_links,
 )
 # (Keep Constants as is)
@@ -82,6 +83,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 extract_page_numbers,
                 fetch_raw_html,
                 extract_links,
             ],
             verbosity_level=2,
             additional_authorized_imports=authorized_imports,

     extract_page_numbers,
     fetch_raw_html,
     extract_links,
+    extract_text_from_pdf,
 )
 # (Keep Constants as is)
                 extract_page_numbers,
                 fetch_raw_html,
                 extract_links,
+                extract_text_from_pdf,
             ],
             verbosity_level=2,
             additional_authorized_imports=authorized_imports,

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
 gradio
 pandas
 requests
 smolagents[toolkit]
 torch

 gradio
 pandas
+pdfminer.six
 requests
 smolagents[toolkit]
 torch

tools.py CHANGED Viewed

@@ -5,6 +5,7 @@ from smolagents import tool
 import torch
 import spaces
 from transformers import pipeline
 @tool
@@ -271,3 +272,22 @@ def extract_links(html: str) -> list[str]:
     """
     soup = BeautifulSoup(html, "html.parser")
     return list({tag["href"] for tag in soup.find_all("a", href=True)})

 import torch
 import spaces
 from transformers import pipeline
+from pdfminer.high_level import extract_text
 @tool
     """
     soup = BeautifulSoup(html, "html.parser")
     return list({tag["href"] for tag in soup.find_all("a", href=True)})
+@tool
+def extract_text_from_pdf(pdf_path: str) -> str:
+    """
+    Extract all readable text from a PDF file.
+    Args:
+        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").
+    Returns:
+        str: Complete extracted text from the PDF.
+             Returns an empty string if extraction fails or file isn't found.
+    """
+    try:
+        text = extract_text(pdf_path)
+        # Fall back to "" when the extracted text is falsy (e.g. empty or None).
+        return text or ""
+    except Exception:
+        # Deliberate best-effort: any failure raised by extract_text is reported
+        # to the caller as an empty string instead of propagating, matching the
+        # "Returns" contract in the docstring above.
+        return ""