andrewammann committed on
Commit
562637f
·
verified ·
1 Parent(s): cd896f7

Create pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +47 -0
pdf_processor.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from datetime import datetime
3
+ from typing import Dict, Any
4
+ from io import BytesIO
5
+
6
class PDFProcessor:
    """Handles PDF text extraction and metadata creation for the RAG system."""

    def extract_text(self, file: BytesIO) -> str:
        """
        Extract text from a PDF file.

        Args:
            file: Streamlit uploaded file (BytesIO object).

        Returns:
            The text of every page joined with newlines, with leading and
            trailing whitespace stripped. A page that yields no text
            contributes an empty line.

        Raises:
            Exception: If the PDF cannot be read or a page cannot be
                processed. The underlying error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file)
            # Guard against a falsy page result (e.g. None) so that the join
            # below only ever sees strings.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            return "\n".join(page_texts).strip()
        except Exception as e:
            # Keep the generic type and message callers already rely on, but
            # chain the root cause so the original traceback is not lost.
            raise Exception(f"Failed to extract text from PDF: {e}") from e

    def create_document_metadata(self, file: BytesIO, document_type: str) -> Dict[str, Any]:
        """
        Create metadata for a document.

        Args:
            file: Streamlit uploaded file (BytesIO object). It must expose a
                ``name`` attribute; a plain ``io.BytesIO`` does not have one
                unless it is assigned explicitly.
            document_type: Category of the document (e.g., 'Research Paper').

        Returns:
            Dictionary with the keys ``filename``, ``document_type`` and
            ``ingestion_timestamp`` (ISO 8601 string).

        Raises:
            Exception: If the metadata cannot be built, for example when
                ``file`` has no ``name`` attribute. The underlying error is
                chained as ``__cause__``.
        """
        try:
            return {
                'filename': file.name,
                'document_type': document_type,
                # datetime.now() without a tz is naive local time, so the
                # ISO string carries no UTC offset.
                'ingestion_timestamp': datetime.now().isoformat(),
            }
        except Exception as e:
            raise Exception(f"Failed to create metadata: {e}") from e