Rag / pdf_processor.py
andrewammann's picture
Create pdf_processor.py
562637f verified
import PyPDF2
from datetime import datetime
from typing import Dict, Any
from io import BytesIO
class PDFProcessor:
    """Handles PDF text extraction and metadata creation for the RAG system."""

    def extract_text(self, file: BytesIO) -> str:
        """
        Extract text from a PDF file.

        The text of every page is concatenated in page order, separated by
        newlines; leading and trailing whitespace is stripped from the result.

        Args:
            file: Streamlit uploaded file (BytesIO object).

        Returns:
            Extracted text as a string.

        Raises:
            Exception: If the PDF cannot be read or parsed. The underlying
                error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file)
            # A page may yield no text (falsy result); treat it as empty so
            # the join below never sees a non-string value.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            return "\n".join(page_texts).strip()
        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise Exception(f"Failed to extract text from PDF: {e}") from e

    def create_document_metadata(self, file: BytesIO, document_type: str) -> Dict[str, Any]:
        """
        Create metadata for a document.

        Args:
            file: Streamlit uploaded file (BytesIO object). It must expose a
                ``name`` attribute; a plain ``BytesIO`` does not have one.
            document_type: Category of the document (e.g., 'Research Paper').

        Returns:
            Dictionary containing metadata with the keys ``filename``,
            ``document_type`` and ``ingestion_timestamp`` (ISO 8601 string of
            the current local time, without timezone information).

        Raises:
            Exception: If the metadata cannot be built, e.g. ``file`` has no
                ``name`` attribute. The underlying error is attached as
                ``__cause__``.
        """
        try:
            return {
                'filename': file.name,
                'document_type': document_type,
                'ingestion_timestamp': datetime.now().isoformat(),
            }
        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise Exception(f"Failed to create metadata: {e}") from e