Spaces:
Runtime error
Runtime error
Commit ·
79818ec
1
Parent(s): b04256b
🚧 extract_text_from_pdf
Browse files- app.py +2 -0
- requirements.txt +1 -0
- tools.py +20 -0
app.py
CHANGED
|
@@ -21,6 +21,7 @@ from tools import (
|
|
| 21 |
extract_page_numbers,
|
| 22 |
fetch_raw_html,
|
| 23 |
extract_links,
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# (Keep Constants as is)
|
|
@@ -82,6 +83,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 82 |
extract_page_numbers,
|
| 83 |
fetch_raw_html,
|
| 84 |
extract_links,
|
|
|
|
| 85 |
],
|
| 86 |
verbosity_level=2,
|
| 87 |
additional_authorized_imports=authorized_imports,
|
|
|
|
| 21 |
extract_page_numbers,
|
| 22 |
fetch_raw_html,
|
| 23 |
extract_links,
|
| 24 |
+
extract_text_from_pdf,
|
| 25 |
)
|
| 26 |
|
| 27 |
# (Keep Constants as is)
|
|
|
|
| 83 |
extract_page_numbers,
|
| 84 |
fetch_raw_html,
|
| 85 |
extract_links,
|
| 86 |
+
extract_text_from_pdf,
|
| 87 |
],
|
| 88 |
verbosity_level=2,
|
| 89 |
additional_authorized_imports=authorized_imports,
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
gradio
|
| 2 |
pandas
|
|
|
|
| 3 |
requests
|
| 4 |
smolagents[toolkit]
|
| 5 |
torch
|
|
|
|
| 1 |
gradio
|
| 2 |
pandas
|
| 3 |
+
pdfminer.six
|
| 4 |
requests
|
| 5 |
smolagents[toolkit]
|
| 6 |
torch
|
tools.py
CHANGED
|
@@ -5,6 +5,7 @@ from smolagents import tool
|
|
| 5 |
import torch
|
| 6 |
import spaces
|
| 7 |
from transformers import pipeline
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
@tool
|
|
@@ -271,3 +272,22 @@ def extract_links(html: str) -> list[str]:
|
|
| 271 |
"""
|
| 272 |
soup = BeautifulSoup(html, "html.parser")
|
| 273 |
return list({tag["href"] for tag in soup.find_all("a", href=True)})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import torch
|
| 6 |
import spaces
|
| 7 |
from transformers import pipeline
|
| 8 |
+
from pdfminer.high_level import extract_text
|
| 9 |
|
| 10 |
|
| 11 |
@tool
|
|
|
|
| 272 |
"""
|
| 273 |
soup = BeautifulSoup(html, "html.parser")
|
| 274 |
return list({tag["href"] for tag in soup.find_all("a", href=True)})
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all readable text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").

    Returns:
        str: Complete extracted text from the PDF.
        Returns an empty string if extraction fails or file isn't found
        (the failure is logged so it can be told apart from an empty PDF).
    """
    # Local import keeps the tool self-contained and avoids touching the
    # module-level import block.
    import logging

    try:
        # Only the extraction call is guarded; nothing else here can raise.
        text = extract_text(pdf_path)
    except Exception:
        # Deliberate best-effort: the caller always gets a string back, but
        # the reason for the failure (missing file, corrupt or encrypted PDF,
        # ...) is recorded with its traceback instead of being dropped.
        logging.getLogger(__name__).exception(
            "Failed to extract text from PDF %r", pdf_path
        )
        return ""

    # Normalise a falsy result (e.g. None) to an empty string.
    return text or ""
|