Spaces:
Runtime error
Runtime error
| from typing import Optional | |
| from llama_index.core.readers.base import BaseReader | |
| from llama_index.core.schema import Document | |
| from fastapi import UploadFile | |
| from typing import List | |
| from PyPDF2 import PdfReader | |
| from io import BytesIO | |
| import fitz # PyMuPDF | |
class Reader(BaseReader):
    """Reader that converts an uploaded PDF into a list of ``Document`` objects."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract text and embedded images from an uploaded PDF.

        For every page, one ``Document`` holding the stripped page text is
        emitted (pages whose text is empty are skipped), followed by one
        ``Document`` per embedded image found on that page. Page and image
        numbers in the metadata are 1-based.

        Args:
            file: The uploaded file; its full content is read into memory and
                parsed as a PDF.

        Returns:
            Text and image documents in page order.

        Raises:
            RuntimeError: If reading or parsing the file fails for any reason.
                The original exception is attached as ``__cause__``.
        """
        try:
            file_content = await file.read()

            # The context manager guarantees the PyMuPDF document is closed
            # even when extraction fails part-way through.
            with fitz.open(stream=file_content, filetype="pdf") as pdf_document:
                pages: List[Document] = []

                for page_num, page in enumerate(pdf_document, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages.append(
                            Document(text=text, metadata={"page": page_num})
                        )

                    images = page.get_images(full=True)
                    for img_index, img in enumerate(images, start=1):
                        xref = img[0]  # first item of the image entry is its xref
                        base_image = pdf_document.extract_image(xref)
                        # The bytes are copied into an independent buffer, so
                        # they remain usable after the PDF has been closed.
                        image_stream = BytesIO(base_image["image"])

                        # NOTE(review): a BytesIO object in metadata is not
                        # JSON-serializable; confirm that downstream consumers
                        # (indexing, persistence) can cope with it.
                        pages.append(
                            Document(
                                text=f"Image {img_index} on page {page_num}",
                                metadata={
                                    "page": page_num,
                                    "image_index": img_index,
                                    "image": image_stream,
                                },
                            )
                        )

                return pages
        except Exception as e:
            print(f"Error reading PDF file: {e}")
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e