marcos-banik committed on
Commit
79818ec
·
1 Parent(s): b04256b

🚧 extract_text_from_pdf

Browse files
Files changed (3) hide show
  1. app.py +2 -0
  2. requirements.txt +1 -0
  3. tools.py +20 -0
app.py CHANGED
@@ -21,6 +21,7 @@ from tools import (
21
  extract_page_numbers,
22
  fetch_raw_html,
23
  extract_links,
 
24
  )
25
 
26
  # (Keep Constants as is)
@@ -82,6 +83,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
82
  extract_page_numbers,
83
  fetch_raw_html,
84
  extract_links,
 
85
  ],
86
  verbosity_level=2,
87
  additional_authorized_imports=authorized_imports,
 
21
  extract_page_numbers,
22
  fetch_raw_html,
23
  extract_links,
24
+ extract_text_from_pdf,
25
  )
26
 
27
  # (Keep Constants as is)
 
83
  extract_page_numbers,
84
  fetch_raw_html,
85
  extract_links,
86
+ extract_text_from_pdf,
87
  ],
88
  verbosity_level=2,
89
  additional_authorized_imports=authorized_imports,
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  gradio
2
  pandas
 
3
  requests
4
  smolagents[toolkit]
5
  torch
 
1
  gradio
2
  pandas
3
+ pdfminer.six
4
  requests
5
  smolagents[toolkit]
6
  torch
tools.py CHANGED
@@ -5,6 +5,7 @@ from smolagents import tool
5
  import torch
6
  import spaces
7
  from transformers import pipeline
 
8
 
9
 
10
  @tool
@@ -271,3 +272,22 @@ def extract_links(html: str) -> list[str]:
271
  """
272
  soup = BeautifulSoup(html, "html.parser")
273
  return list({tag["href"] for tag in soup.find_all("a", href=True)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import torch
6
  import spaces
7
  from transformers import pipeline
8
+ from pdfminer.high_level import extract_text
9
 
10
 
11
  @tool
 
272
  """
273
  soup = BeautifulSoup(html, "html.parser")
274
  return list({tag["href"] for tag in soup.find_all("a", href=True)})
275
+
276
+
277
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all readable text from a PDF file.

    This is a best-effort helper: a failure is never raised to the caller.
    Instead the error is logged (with traceback) and an empty string is
    returned, so the cause of an empty result can still be found in the logs.

    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").

    Returns:
        str: Complete extracted text from the PDF.
            Returns an empty string if extraction fails or file isn't found.
    """
    # Local import keeps this block self-contained; it does not rely on a
    # module-level `import logging` being present in the file.
    import logging

    try:
        # Only the extraction call is guarded, so unrelated errors are not hidden.
        text = extract_text(pdf_path)
    except Exception:
        # The catch-all is deliberate (documented contract: return "" on any
        # failure), but the failure is recorded instead of silently dropped.
        logging.getLogger(__name__).exception(
            "Failed to extract text from PDF %r", pdf_path
        )
        return ""

    # Normalise a falsy result (e.g. None) to an empty string.
    return text or ""