| | from smolagents import Tool |
| |
|
class SimpleTool(Tool):
    """Tool that extracts the text of every PDF file found in a folder.

    The text of each PDF is placed under a ``### File: <name>`` header and
    the per-file sections are joined with blank lines into one string.
    Files that cannot be read are reported inline instead of aborting the
    whole run.
    """

    name = "pdf_extraction"
    description = """Reads and extracts the text from all PDF files in the given folder and returns the combined text."""
    inputs = {
        "path": {
            "type": "string",
            "description": "Folder location of PDF files",
            "default": "pdfs",
            "nullable": True,
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        """Initialise the tool and resolve the PDF reader class.

        Raises:
            ImportError: If the `pypdf` package is not installed.
        """
        super().__init__(*args, **kwargs)
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError(
                "You must install package `pypdf` to run this tool: for instance, run `pip install pypdf`."
            ) from err
        # Stored on the instance so `forward` does not need to import pypdf again.
        self.reader_class = PdfReader

    def forward(self, path: str = "pdfs") -> str:
        """Extract and combine the text of all PDF files in ``path``.

        Args:
            path: Folder containing the PDF files. ``None`` falls back to
                the default folder ``"pdfs"`` (the input is declared nullable).

        Returns:
            One section per PDF (``### File: <name>`` followed by its text or
            an error note), separated by blank lines; or a single explanatory
            message when the folder is missing, is not a folder, or contains
            no PDF files.
        """
        # Imported locally: the module never imports `os` at the top level,
        # and `__init__` already uses a function-local import for pypdf.
        import os

        if path is None:
            path = "pdfs"

        if not os.path.exists(path):
            return f"Error: The folder '{path}' does not exist."
        if not os.path.isdir(path):
            # `os.listdir` would raise on a regular file; report it instead.
            return f"Error: '{path}' is not a folder."

        # Sorted for deterministic output; suffix match is case-insensitive
        # so files named e.g. `REPORT.PDF` are not silently skipped.
        pdf_files = sorted(
            entry for entry in os.listdir(path) if entry.lower().endswith(".pdf")
        )
        if not pdf_files:
            return f"No PDF files found in the folder '{path}'."

        sections = []
        for pdf_file in pdf_files:
            pdf_path = os.path.join(path, pdf_file)
            try:
                reader = self.reader_class(pdf_path)
                # `or ""` guards against readers returning None for a page;
                # the newline keeps text of adjacent pages from being fused.
                file_text = "\n".join(
                    page.extract_text() or "" for page in reader.pages
                )
                sections.append(f"### File: {pdf_file}\n{file_text.strip()}")
            except Exception as e:
                # Best effort: one unreadable file must not hide the others.
                sections.append(f"### File: {pdf_file}\nError reading file: {e}")

        return "\n\n".join(sections)