First_agent_template / tools /pdf_extractor.py
inank's picture
fix: restore pdf extractor
9a4c8eb verified
raw
history blame contribute delete
968 Bytes
from smolagents import tool
import PyPDF2
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts all text content from a PDF file.

    Args:
        pdf_path: The file path to the PDF file to extract text from (e.g., '/tmp/document.pdf')

    Returns:
        The extracted text content from the PDF file
    """
    # NOTE(review): smolagents' @tool is understood to derive the tool
    # description and argument schema from the docstring above, so its
    # wording is kept unchanged -- confirm before rephrasing it.
    #
    # Failures are reported as "Error: ..." strings rather than raised, so
    # the caller always receives a str.
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extraction stays inside the `with` block because the reader was
            # built from this open file handle. Each page's text is prefixed
            # with a 1-based page header.
            page_sections = [
                f"--- Page {page_number} ---\n{page.extract_text()}"
                for page_number, page in enumerate(pdf_reader.pages, start=1)
            ]
        # Pages are separated by a blank line; a PDF with no pages yields "".
        return "\n\n".join(page_sections)
    except FileNotFoundError:
        return f"Error: PDF file not found at path: {pdf_path}"
    except Exception as e:
        # Catch-all: any other failure while opening or parsing the file is
        # turned into an error message instead of propagating.
        return f"Error extracting text from PDF: {e}"