Spaces:
Build error
Build error
| import PyPDF2 | |
| from datetime import datetime | |
| from typing import Dict, Any | |
| from io import BytesIO | |
class PDFProcessor:
    """Handles PDF text extraction and metadata creation for the RAG system."""

    def extract_text(self, file: BytesIO) -> str:
        """
        Extract text from a PDF file.

        Args:
            file: Streamlit uploaded file (BytesIO object) containing the PDF.

        Returns:
            The text of every page, in page order, separated by newlines and
            with leading/trailing whitespace stripped. Pages for which
            extraction yields nothing contribute an empty string.

        Raises:
            Exception: If the PDF cannot be opened or a page cannot be read.
                The underlying error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text() may return None/"" for a page; normalise to ""
            # so the join never sees a non-string.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            return "\n".join(page_texts).strip()
        except Exception as e:
            # Chain explicitly so the original traceback is preserved as the
            # cause rather than only as implicit context.
            raise Exception(f"Failed to extract text from PDF: {str(e)}") from e

    def create_document_metadata(self, file: BytesIO, document_type: str) -> Dict[str, Any]:
        """
        Create metadata for a document.

        Args:
            file: Streamlit uploaded file (BytesIO object). It must expose a
                ``name`` attribute; a plain ``io.BytesIO`` does not have one.
            document_type: Category of the document (e.g., 'Research Paper').

        Returns:
            Dictionary with the keys ``'filename'``, ``'document_type'`` and
            ``'ingestion_timestamp'`` (ISO 8601 string of the current naive
            local time from ``datetime.now()``).

        Raises:
            Exception: If the metadata cannot be built, e.g. ``file`` has no
                ``name`` attribute. The underlying error is attached as
                ``__cause__``.
        """
        try:
            return {
                'filename': file.name,
                'document_type': document_type,
                'ingestion_timestamp': datetime.now().isoformat(),
            }
        except Exception as e:
            raise Exception(f"Failed to create metadata: {str(e)}") from e