Spaces:
Runtime error
Runtime error
| from typing import Optional, List | |
| from fastapi import UploadFile | |
| from fastapi.responses import JSONResponse | |
| from llama_index.core.readers.base import BaseReader | |
| from llama_index.core.schema import Document | |
| from PyPDF2 import PdfReader | |
class Reader(BaseReader):
    """PDF reader that turns an uploaded file into one ``Document`` per page."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text of every page of an uploaded PDF.

        Args:
            file: The uploaded PDF. Its content is read into memory in full.

        Returns:
            One ``Document`` per page, in page order, holding the stripped
            page text and ``{"page": <1-based page number>}`` as metadata.
            Pages without extractable text are kept as empty-text documents
            so the list stays aligned with the page numbering.

            NOTE(review): on any failure this returns a ``JSONResponse``
            with status 500 instead of raising, which contradicts the
            annotated return type. The behavior is kept for backward
            compatibility; callers should check the result type.
        """
        # Function-scope import so the block does not depend on a change to
        # the file's top-level import list.
        from io import BytesIO

        try:
            file_content = await file.read()
            # Parse the bytes that were just read instead of the upload's
            # underlying stream: the read above already advanced that
            # stream, and the buffered content was previously discarded.
            reader = PdfReader(BytesIO(file_content))
            total_pages = reader.pages
            print("Total pages: ", len(total_pages))

            # ``extract_text`` may yield nothing for a page; normalising to
            # "" keeps blank pages in the result as empty documents.
            documents = [
                Document(
                    text=(page.extract_text() or "").strip(),
                    metadata={"page": page_num},
                )
                for page_num, page in enumerate(total_pages, start=1)
            ]
            print("Number of documents: ", len(documents))
            return documents
        except Exception as e:
            return JSONResponse(
                status_code=500, content=f"Failed to process the uploaded file: {e}"
            )