from datetime import datetime
from io import BytesIO
from typing import Any, Dict

import PyPDF2


class PDFProcessor:
    """Handles PDF text extraction and metadata creation for the RAG system."""

    def extract_text(self, file: BytesIO) -> str:
        """
        Extract text from a PDF file.

        The text of every page is concatenated in page order, separated by
        newlines, and leading/trailing whitespace is stripped from the result.
        Pages for which no text can be extracted contribute an empty string.

        Args:
            file: Streamlit uploaded file (BytesIO object).

        Returns:
            Extracted text as a string (empty if no page yields text).

        Raises:
            Exception: If the PDF cannot be read or a page cannot be processed.
                The underlying error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``extract_text()`` may return a falsy value; normalise it to ""
            # so joining never fails on such pages.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            return "\n".join(page_texts).strip()
        except Exception as e:
            # Keep the generic exception type callers already catch, but chain
            # the original error so the root cause stays in the traceback.
            raise Exception(f"Failed to extract text from PDF: {e}") from e

    def create_document_metadata(self, file: BytesIO, document_type: str) -> Dict[str, Any]:
        """
        Create metadata for a document.

        Args:
            file: Streamlit uploaded file (BytesIO object). It must expose a
                ``name`` attribute; a plain ``io.BytesIO`` does not have one.
            document_type: Category of the document (e.g., 'Research Paper').

        Returns:
            Dictionary containing metadata with the keys ``filename``,
            ``document_type`` and ``ingestion_timestamp`` (ISO 8601 string
            produced by ``datetime.now()``).

        Raises:
            Exception: If the metadata cannot be built, e.g. ``file`` has no
                ``name`` attribute. The underlying error is chained as
                ``__cause__``.
        """
        try:
            return {
                'filename': file.name,
                'document_type': document_type,
                'ingestion_timestamp': datetime.now().isoformat(),
            }
        except Exception as e:
            raise Exception(f"Failed to create metadata: {e}") from e