organicoder committed on
Commit
bd61a63
·
verified ·
1 Parent(s): a84f182

Delete pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +0 -120
pdf_processor.py DELETED
@@ -1,120 +0,0 @@
1
- import PyPDF2
2
- import os
3
- from typing import List, Optional
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_openai import OpenAIEmbeddings
6
- from langchain.vectorstores import FAISS
7
- import pickle
8
-
9
- class PDFProcessor:
10
- """Process PDF files and create searchable vector database"""
11
-
12
- def __init__(self, pdf_path: str = "Health Tech Hub Copenhagen.pdf"):
13
- self.pdf_path = pdf_path
14
- self.vector_store = None
15
- self.text_chunks = []
16
-
17
- def extract_text_from_pdf(self) -> str:
18
- """Extract text content from PDF file"""
19
- if not os.path.exists(self.pdf_path):
20
- raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
21
-
22
- text = ""
23
- try:
24
- with open(self.pdf_path, 'rb') as file:
25
- pdf_reader = PyPDF2.PdfReader(file)
26
- for page_num in range(len(pdf_reader.pages)):
27
- page = pdf_reader.pages[page_num]
28
- text += page.extract_text() + "\n"
29
-
30
- print(f"✅ Successfully extracted text from {self.pdf_path}")
31
- return text
32
-
33
- except Exception as e:
34
- print(f"❌ Error extracting text from PDF: {e}")
35
- raise
36
-
37
- def split_text_into_chunks(self, text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
38
- """Split text into smaller chunks for better processing"""
39
- text_splitter = RecursiveCharacterTextSplitter(
40
- chunk_size=chunk_size,
41
- chunk_overlap=chunk_overlap,
42
- length_function=len,
43
- )
44
-
45
- chunks = text_splitter.split_text(text)
46
- self.text_chunks = chunks
47
- print(f"✅ Split text into {len(chunks)} chunks")
48
- return chunks
49
-
50
- def create_vector_store(self, chunks: List[str]) -> FAISS:
51
- """Create a vector store from text chunks"""
52
- try:
53
- embeddings = OpenAIEmbeddings()
54
- vector_store = FAISS.from_texts(chunks, embeddings)
55
- self.vector_store = vector_store
56
- print("✅ Vector store created successfully")
57
- return vector_store
58
-
59
- except Exception as e:
60
- print(f"❌ Error creating vector store: {e}")
61
- raise
62
-
63
- def search_similar_content(self, query: str, k: int = 3) -> List[str]:
64
- """Search for similar content in the PDF"""
65
- if not self.vector_store:
66
- raise ValueError("Vector store not initialized. Call process_pdf() first.")
67
-
68
- try:
69
- results = self.vector_store.similarity_search(query, k=k)
70
- return [doc.page_content for doc in results]
71
-
72
- except Exception as e:
73
- print(f"❌ Error searching content: {e}")
74
- return []
75
-
76
- def process_pdf(self) -> bool:
77
- """Complete PDF processing pipeline"""
78
- try:
79
- print(f"🔄 Processing PDF: {self.pdf_path}")
80
-
81
- # Extract text
82
- text = self.extract_text_from_pdf()
83
-
84
- # Split into chunks
85
- chunks = self.split_text_into_chunks(text)
86
-
87
- # Create vector store
88
- self.create_vector_store(chunks)
89
-
90
- print("✅ PDF processing completed successfully")
91
- return True
92
-
93
- except Exception as e:
94
- print(f"❌ PDF processing failed: {e}")
95
- return False
96
-
97
- def save_vector_store(self, filepath: str = "vector_store.pkl"):
98
- """Save vector store to file"""
99
- if self.vector_store:
100
- try:
101
- with open(filepath, 'wb') as f:
102
- pickle.dump(self.vector_store, f)
103
- print(f"✅ Vector store saved to {filepath}")
104
- except Exception as e:
105
- print(f"❌ Error saving vector store: {e}")
106
-
107
- def load_vector_store(self, filepath: str = "vector_store.pkl") -> bool:
108
- """Load vector store from file"""
109
- try:
110
- if os.path.exists(filepath):
111
- with open(filepath, 'rb') as f:
112
- self.vector_store = pickle.load(f)
113
- print(f"✅ Vector store loaded from {filepath}")
114
- return True
115
- else:
116
- print(f"⚠️ Vector store file not found: {filepath}")
117
- return False
118
- except Exception as e:
119
- print(f"❌ Error loading vector store: {e}")
120
- return False