inank committed on
Commit
6d3d769
·
verified ·
1 Parent(s): 677d71b

feat: add PDF extractor tool

Browse files
Files changed (1) hide show
  1. tools/pdf_extractor.py +30 -0
tools/pdf_extractor.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import tool
2
+ import PyPDF2
3
+
4
+
5
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts all text content from a PDF file.

    The text of every page is prefixed with a '--- Page N ---' header
    (numbered from 1) and the pages are separated by a blank line. Failures
    are reported as a string starting with 'Error' instead of being raised.

    Args:
        pdf_path: The file path to the PDF file to extract text from (e.g., '/tmp/document.pdf')

    Returns:
        The extracted text content from the PDF file
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extraction stays inside the `with` block so the reader can
            # still read from the open file handle.
            page_sections = [
                f"--- Page {page_number} ---\n{page.extract_text()}"
                for page_number, page in enumerate(pdf_reader.pages, start=1)
            ]
    except FileNotFoundError:
        return f"Error: PDF file not found at path: {pdf_path}"
    except Exception as e:
        # Deliberately broad: this function reports every failure as a
        # returned message rather than propagating the exception.
        return f"Error extracting text from PDF: {str(e)}"

    return "\n\n".join(page_sections)