Spaces:

andrewammann
/

Rag

Build error

App Files Files Community

andrewammann committed on Sep 15, 2025

Commit

562637f

verified ·

1 Parent(s): cd896f7

Create pdf_processor.py

Browse files

Files changed (1) hide show

pdf_processor.py +47 -0

pdf_processor.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import PyPDF2
+from datetime import datetime
+from typing import Dict, Any
+from io import BytesIO
class PDFProcessor:
    """Handles PDF text extraction and metadata creation for the RAG system."""

    def extract_text(self, file: BytesIO) -> str:
        """Extract the text of every page of a PDF.

        Args:
            file: Binary file-like object holding the PDF (e.g. a Streamlit
                uploaded file).

        Returns:
            The per-page texts joined by newlines, with leading and trailing
            whitespace stripped. A page that yields no text contributes an
            empty line.

        Raises:
            Exception: If the PDF cannot be opened or a page cannot be
                processed. The underlying error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` keeps the join safe when a page yields a falsy result
            # (e.g. no extractable text).
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            # Joining once avoids the quadratic cost of repeated `+=`; the
            # final strip() makes it equivalent to appending "\n" per page.
            return "\n".join(page_texts).strip()
        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise Exception(f"Failed to extract text from PDF: {str(e)}") from e

    def create_document_metadata(self, file: BytesIO, document_type: str) -> Dict[str, Any]:
        """Create the metadata record for an ingested document.

        Args:
            file: Uploaded file object. It must expose a ``name`` attribute;
                a plain ``BytesIO`` does not have one unless it is assigned.
            document_type: Category of the document (e.g. 'Research Paper').

        Returns:
            Dictionary with the keys ``filename``, ``document_type`` and
            ``ingestion_timestamp`` (ISO 8601 string).

        Raises:
            Exception: If the metadata cannot be built, for example because
                ``file`` has no ``name``. The underlying error is attached
                as ``__cause__``.
        """
        try:
            filename = file.name
        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise Exception(f"Failed to create metadata: {str(e)}") from e

        return {
            'filename': filename,
            'document_type': document_type,
            # NOTE(review): datetime.now() is timezone-naive local time;
            # consider a UTC-aware timestamp if records are compared across
            # hosts. Left unchanged to keep the stored format stable.
            'ingestion_timestamp': datetime.now().isoformat(),
        }