Spaces:
Runtime error
Runtime error
| from typing import Optional | |
| from llama_index.core.readers.base import BaseReader | |
| from llama_index.core.schema import Document | |
| from fastapi import UploadFile | |
| from typing import List | |
| from PyPDF2 import PdfReader | |
| from llama_parse import LlamaParse | |
class Reader(BaseReader):
    """Reader that converts an uploaded PDF into per-page ``Document`` objects."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every non-empty page of an uploaded PDF.

        Args:
            file: The uploaded PDF file.

        Returns:
            One ``Document`` per page whose extracted text is not empty or
            whitespace-only, in page order. Each document's metadata stores
            the 1-based page number under the ``"page"`` key.

        Raises:
            RuntimeError: If reading or parsing the upload fails for any
                reason. The original exception is chained as ``__cause__``.
        """
        # Imported locally so the block does not depend on a file-level import.
        from io import BytesIO

        try:
            # Rewind first so the whole upload is read even if a caller has
            # already consumed part of the stream.
            await file.seek(0)
            file_content = await file.read()

            # Parse from an in-memory buffer: it is always seekable and is
            # independent of the cursor position of the underlying temp file.
            reader = PdfReader(BytesIO(file_content))

            documents = []
            for page_num, page in enumerate(reader.pages, start=1):
                # extract_text() may return None; treat that as empty text.
                text = (page.extract_text() or "").strip()
                if text:  # Skip pages with no extractable text
                    documents.append(
                        Document(text=text, metadata={"page": page_num})
                    )
            return documents
        except Exception as e:
            print(f"Error reading PDF file: {e}")
            # Chain the cause so the original traceback is preserved.
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e