Adieee5 committed on
Commit
e9ce2a7
·
verified ·
1 Parent(s): f53c091

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +25 -0
  2. app.py +332 -0
  3. config.py +22 -0
  4. pdf_processor.py +98 -0
  5. rag_engine.py +90 -0
  6. requirements.txt +7 -0
  7. vector_db/.DS_Store +0 -0
  8. vector_store.py +115 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Stream Python stdout/stderr straight to the container log (no buffering)
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Install compilers needed to build native wheels (e.g. for faiss/PyMuPDF);
# --no-install-recommends keeps the image small, and clearing the apt lists
# avoids baking the package index into the layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies in their own layer, before copying the code,
# so dependency installation is cached across source-only changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create runtime directories for uploaded PDFs and the FAISS index
RUN mkdir -p uploads vector_db

# Gradio's default port, matched by app.py's launch(server_port=7860)
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import uuid
4
+ import tempfile
5
+ from typing import List, Tuple, Optional
6
+ from config import Config
7
+ from pdf_processor import PDFProcessor
8
+ from vector_store import VectorStore
9
+ from rag_engine import RAGEngine
10
+
11
+ # Initialize components
12
# Module-level singletons: every Gradio callback below closes over these
# three objects rather than constructing them per request.

# Splits extracted PDF text into overlapping chunks (sizes from Config).
pdf_processor = PDFProcessor(
    chunk_size=Config.CHUNK_SIZE,
    chunk_overlap=Config.CHUNK_OVERLAP
)

# Embeds chunks and persists a FAISS index under Config.VECTOR_DB_PATH.
vector_store = VectorStore(
    model_name=Config.EMBEDDING_MODEL,
    vector_db_path=Config.VECTOR_DB_PATH
)

# Answers questions by retrieving from vector_store and calling the LLM.
rag_engine = RAGEngine(vector_store)
23
+
24
def upload_and_process_pdfs(files: List[tempfile._TemporaryFileWrapper]) -> str:
    """Index every uploaded PDF into the vector store.

    Returns a human-readable status string for the upload textbox; errors
    are reported in the string rather than raised (Gradio renders it).
    """
    if not files:
        return "❌ No files uploaded."

    try:
        processed_names = []
        chunk_total = 0

        for upload in files:
            if upload is None:
                continue

            path = upload.name
            name = os.path.basename(path)

            # Anything that is not a PDF is silently skipped.
            if not name.lower().endswith('.pdf'):
                continue

            # Extract, chunk, and embed this document.
            doc_chunks = pdf_processor.extract_text_from_pdf(path)
            vector_store.add_documents(doc_chunks)

            processed_names.append(name)
            chunk_total += len(doc_chunks)

        if not processed_names:
            return "❌ No valid PDF files found."

        stats = vector_store.get_stats()
        return (
            f"✅ Successfully processed {len(processed_names)} PDF(s):\n"
            f"📄 Files: {', '.join(processed_names)}\n"
            f"📊 Total chunks created: {chunk_total}\n"
            f"🗃️ Database now contains {stats['total_documents']} total documents"
        )

    except Exception as e:
        return f"❌ Error processing files: {str(e)}"
64
+
65
def get_database_stats() -> str:
    """Render the vector store's current statistics as Markdown."""
    info = vector_store.get_stats()
    return (
        "📊 **Database Statistics**\n\n"
        f"📄 Total Documents: {info['total_documents']}\n"
        f"🔍 Index Size: {info['index_size']}\n"
        f"📏 Vector Dimension: {info.get('dimension', 'N/A')}"
    )
72
+
73
def clear_database() -> str:
    """Wipe the entire vector database; return a status string for the UI."""
    try:
        vector_store.clear_index()
    except Exception as err:
        return f"❌ Error clearing database: {str(err)}"
    return "✅ Database cleared successfully!"
80
+
81
def respond(message: str, chat_history: List[dict]) -> Tuple[str, List[dict]]:
    """Handle one chat turn in Gradio's messages format.

    Appends the user message and the assistant's RAG answer (or an error
    string) to chat_history, and returns "" to clear the input box.
    """
    if not message.strip():
        return "", chat_history

    try:
        # Run the RAG pipeline.
        result = rag_engine.generate_answer(message, top_k=Config.TOP_K)

        answer = result['answer']
        found_sources = result.get('sources', [])

        # Append up to three source citations under the answer.
        if found_sources:
            answer += "\n\n**📚 Sources:**\n"
            for idx, src in enumerate(found_sources[:3], 1):
                answer += f"{idx}. 📄 **{src['source_file']}** (Page {src['page_number']})\n"
                answer += f" 📝 _{src['content_preview']}_\n"
    except Exception as exc:
        # Errors are shown to the user as the assistant's reply.
        answer = f"❌ Error: {str(exc)}"

    chat_history.append({"role": "user", "content": message})
    chat_history.append({"role": "assistant", "content": answer})

    return "", chat_history
116
+
117
def create_interface():
    """Create the Gradio interface.

    Builds a three-tab Blocks app — document management, chat, and system
    information — and wires all event handlers. Returns the un-launched
    Blocks object; the caller is responsible for .launch().
    """

    with gr.Blocks(title="PDF RAG System") as interface:

        # Header
        gr.Markdown("# 🤖 PDF RAG Assistant")
        gr.Markdown("Upload PDFs and ask intelligent questions about their content using AI")

        with gr.Tabs():

            # Tab 1: Document Management
            with gr.Tab("📁 Document Management"):

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("## 📤 Upload PDF Documents")
                        gr.Markdown("Drag and drop your PDF files or click to browse")

                        file_upload = gr.File(
                            file_count="multiple",
                            file_types=[".pdf"],
                            label="Select PDF files to upload"
                        )

                        upload_btn = gr.Button(
                            "🚀 Process PDFs",
                            variant="primary",
                            size="lg"
                        )

                        upload_status = gr.Textbox(
                            label="📊 Upload Status",
                            interactive=False,
                            max_lines=8
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("## 🗄️ Database Management")

                        # Rendered once at build time; the handlers below
                        # refresh it after uploads / clears.
                        stats_display = gr.Markdown(get_database_stats())

                        with gr.Row():
                            refresh_btn = gr.Button("🔄 Refresh", size="sm", variant="secondary")
                            clear_btn = gr.Button("🗑️ Clear Database", size="sm", variant="stop")

                        clear_status = gr.Textbox(
                            label="🔧 Database Status",
                            interactive=False,
                            max_lines=3
                        )

                # Event handlers for document management
                def update_stats_display():
                    # Thin wrapper so .then() re-queries the stats lazily
                    # instead of capturing a stale string.
                    return get_database_stats()

                # Upload, then refresh the stats panel.
                upload_btn.click(
                    fn=upload_and_process_pdfs,
                    inputs=[file_upload],
                    outputs=[upload_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                refresh_btn.click(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                # Clear the database, then refresh the stats panel.
                clear_btn.click(
                    fn=clear_database,
                    outputs=[clear_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

            # Tab 2: Chat Interface
            with gr.Tab("💬 AI Assistant"):

                gr.Markdown("## 🤖 Ask questions about your uploaded documents")
                gr.Markdown("**💡 Tips:** Upload PDFs first, then ask specific questions about their content for detailed answers with source references.")

                # Create chat interface with messages format (list of
                # {"role": ..., "content": ...} dicts), seeded with a greeting.
                chatbot = gr.Chatbot(
                    height=500,
                    show_label=False,
                    type="messages",
                    value=[{
                        "role": "assistant",
                        "content": "👋 **Welcome to PDF RAG Assistant!**\n\nI'm here to help you analyze and understand your PDF documents. \n\n📋 **Getting started:**\n1. Upload PDFs in the 'Document Management' tab\n2. Come back here and ask me questions\n3. I'll provide detailed answers with source references\n\n🚀 **Ready to get started?**"
                    }]
                )

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="💭 Ask a question about your documents...",
                        label="Your Question",
                        lines=2,
                        scale=4
                    )
                    send_btn = gr.Button(
                        "📨 Send",
                        variant="primary",
                        size="lg",
                        scale=1
                    )

                clear_chat_btn = gr.Button(
                    "🧹 Clear Chat",
                    variant="secondary",
                    size="sm"
                )

                # Event handlers for chat: button click and Enter both send.
                send_btn.click(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                msg_input.submit(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                # Reset the history to a fresh greeting message.
                clear_chat_btn.click(
                    fn=lambda: [{
                        "role": "assistant",
                        "content": "👋 **Welcome back!**\n\nI'm ready to help you with your PDF documents again. What would you like to know?"
                    }],
                    outputs=[chatbot]
                )

            # Tab 3: System Information (static, built once at startup)
            with gr.Tab("ℹ️ System Information"):

                gr.Markdown("# ⚙️ System Configuration & Information")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## 🔧 Current Settings")

                        settings_info = f"""
**🧠 Embedding Model:** `{Config.EMBEDDING_MODEL}`

**📝 Chunk Size:** {Config.CHUNK_SIZE} characters

**🔗 Chunk Overlap:** {Config.CHUNK_OVERLAP} characters

**🎯 Search Results:** Top {Config.TOP_K} most relevant chunks

**📁 Max File Size:** 16MB per PDF
"""
                        gr.Markdown(settings_info)

                    with gr.Column():
                        gr.Markdown("## 🚀 Key Features")

                        features_info = """
✅ Multiple PDF upload and processing

✅ Intelligent text chunking

✅ Vector similarity search using FAISS

✅ AI-powered Q&A with Google Gemini

✅ Source attribution with page numbers

✅ Persistent vector database storage

✅ Real-time chat interface

✅ Responsive modern UI
"""
                        gr.Markdown(features_info)

                gr.Markdown("## 🛠️ Technology Stack")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("**🖥️ Framework:** Gradio 4.44+")
                        gr.Markdown("**📄 PDF Processing:** PyMuPDF")
                    with gr.Column():
                        gr.Markdown("**🧮 Embeddings:** Sentence Transformers")
                        gr.Markdown("**🗃️ Vector Database:** FAISS")
                    with gr.Column():
                        gr.Markdown("**🤖 Language Model:** Google Gemini 1.5")

                gr.Markdown("## 📝 Quick Start Guide")

                guide_info = """
**1.** Upload Documents - Go to 'Document Management' tab and upload your PDF files

**2.** Process & Index - Wait for the system to extract text and create embeddings

**3.** Ask Questions - Switch to 'AI Assistant' tab and start asking questions

**4.** Get Intelligent Answers - Receive detailed responses with source references and page numbers
"""
                gr.Markdown(guide_info)

    return interface
323
+
324
if __name__ == "__main__":
    # Build the UI and bind to all interfaces so the app is reachable from
    # outside the Docker container on the exposed port (7860).
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
config.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv()
5
+
6
class Config:
    """Central application settings, read once at import time."""

    # Google Gemini API key; must be supplied via the environment or .env.
    GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

    # NOTE(review): placeholder default — override SECRET_KEY in deployment.
    SECRET_KEY = os.environ.get('SECRET_KEY', 'your-secret-key-here')

    UPLOAD_FOLDER = 'uploads'                # where uploaded PDFs land
    VECTOR_DB_PATH = 'vector_db'             # FAISS index + metadata directory
    MAX_CONTENT_LENGTH = 16 * 1024 * 1024    # 16 MB upload cap

    # Retrieval settings
    EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    CHUNK_SIZE = 1000      # characters per chunk
    CHUNK_OVERLAP = 200    # characters shared between consecutive chunks

    TOP_K = 5              # chunks retrieved per query

    ALLOWED_EXTENSIONS = {'pdf'}

# Ensure the runtime directories exist at import time. These calls must sit
# at module level (after the class statement): the name `Config` is only
# bound once the class body has finished executing.
os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
os.makedirs(Config.VECTOR_DB_PATH, exist_ok=True)
pdf_processor.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import os
3
+ import re
4
+ from typing import List, Dict, Any
5
+ from dataclasses import dataclass
6
+
7
@dataclass
class DocumentChunk:
    """One chunk of extracted PDF text plus provenance for citations."""
    content: str              # the chunk's cleaned text
    metadata: Dict[str, Any]  # filename / page_number / chunk_length
    page_number: int          # 1-based page the chunk came from
    source_file: str          # basename of the originating PDF
13
+
14
class PDFProcessor:
    """Extracts text from PDFs and splits it into overlapping chunks."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Target chunk length, and how many trailing characters of each
        # chunk carry over into the next one to preserve context.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> List[DocumentChunk]:
        """Extract text from PDF and return chunks with metadata.

        Raises:
            Exception: wrapping any PyMuPDF/IO failure, with the path and
                the original cause chained via ``from``.
        """
        chunks: List[DocumentChunk] = []

        try:
            doc = fitz.open(pdf_path)
            try:
                filename = os.path.basename(pdf_path)

                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    text = page.get_text()

                    # Skip blank pages entirely.
                    if text.strip():
                        cleaned_text = self._clean_text(text)
                        page_chunks = self._create_chunks(cleaned_text, page_num + 1, filename)
                        chunks.extend(page_chunks)
            finally:
                # BUGFIX: always release the document handle, even when a
                # page fails mid-loop (the original leaked it on error).
                doc.close()

            return chunks

        except Exception as e:
            raise Exception(f"Error processing PDF {pdf_path}: {str(e)}") from e

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text."""
        # Collapse all whitespace runs, drop characters outside the basic
        # word/punctuation set, and squash repeated punctuation (e.g. "!!!").
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\[\]{}"-]', '', text)
        text = re.sub(r'([.,!?;:]){2,}', r'\1', text)

        return text.strip()

    def _create_chunks(self, text: str, page_number: int, filename: str) -> List[DocumentChunk]:
        """Split text into overlapping, sentence-aligned chunks."""
        chunks = []
        # Split on sentence-ending punctuation so chunk boundaries fall
        # between sentences rather than inside them.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        current_chunk = ""
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > self.chunk_size and current_chunk:
                chunks.append(self._make_chunk(current_chunk, page_number, filename))

                # Seed the next chunk with the tail of this one for context.
                overlap_text = self._get_overlap_text(current_chunk)
                current_chunk = overlap_text + " " + sentence
                current_length = len(current_chunk)
            else:
                current_chunk += " " + sentence if current_chunk else sentence
                current_length = len(current_chunk)

        # Flush whatever remains as the final chunk.
        if current_chunk.strip():
            chunks.append(self._make_chunk(current_chunk, page_number, filename))

        return chunks

    def _make_chunk(self, text: str, page_number: int, filename: str) -> DocumentChunk:
        """Wrap chunk text in a DocumentChunk with standard metadata."""
        return DocumentChunk(
            content=text.strip(),
            metadata={
                'filename': filename,
                'page_number': page_number,
                'chunk_length': len(text)
            },
            page_number=page_number,
            source_file=filename
        )

    def _get_overlap_text(self, text: str) -> str:
        """Get overlap text from the end of current chunk."""
        if len(text) <= self.chunk_overlap:
            return text
        return text[-self.chunk_overlap:]
rag_engine.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from typing import List, Dict, Any
3
+ from vector_store import VectorStore
4
+ from config import Config
5
+
6
class RAGEngine:
    """Retrieval-augmented generation: fetches the most similar chunks from
    the vector store, then asks Gemini to answer strictly from that context.
    """

    def __init__(self, vector_store: VectorStore):
        # Retrieval backend that supplies context chunks for each query.
        self.vector_store = vector_store

        # Configure the Gemini client; the key comes from the environment.
        genai.configure(api_key=Config.GEMINI_API_KEY) # type: ignore
        self.model = genai.GenerativeModel('gemini-2.0-flash-lite') # type: ignore

    def generate_answer(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """Generate answer using RAG pipeline.

        Returns a dict with 'answer', 'sources', and 'context_used' keys
        (plus 'query' on success, or 'error' on failure). Never raises —
        failures are reported inside the returned dict so the UI can render
        them directly.
        """
        try:

            search_results = self.vector_store.search(query, top_k)

            # Nothing indexed, or nothing similar enough to the query.
            if not search_results:
                return {
                    'answer': "I couldn't find any relevant information in the uploaded documents to answer your question.",
                    'sources': [],
                    'context_used': ""
                }


            context_parts = []
            sources = []

            for i, result in enumerate(search_results):
                # Label each chunk so the model can cite "[Context N]".
                context_parts.append(f"[Context {i+1}]: {result['content']}")
                sources.append({
                    'source_file': result['source_file'],
                    'page_number': result['page_number'],
                    'similarity_score': result['similarity_score'],
                    # Truncated preview used for display in the chat UI.
                    'content_preview': result['content'][:200] + "..." if len(result['content']) > 200 else result['content']
                })

            context = "\n\n".join(context_parts)
            prompt = self._create_prompt(query, context)
            response = self.model.generate_content(prompt)

            return {
                'answer': response.text,
                'sources': sources,
                'context_used': context,
                'query': query
            }

        except Exception as e:
            # Deliberately broad: the caller renders this dict in the UI,
            # so any failure becomes a visible error message, not a crash.
            return {
                'answer': f"An error occurred while generating the answer: {str(e)}",
                'sources': [],
                'context_used': "",
                'error': str(e)
            }

    def _create_prompt(self, query: str, context: str) -> str:
        """Create a prompt for the language model."""
        prompt = f"""You are an AI assistant that answers questions based on provided document context.

Instructions:
1. Answer the question using ONLY the information provided in the context below
2. If the context doesn't contain enough information to answer the question, say so clearly
3. Be concise but comprehensive in your answer
4. If you reference specific information, mention which context section it comes from
5. Do not make up information that's not in the provided context

Context from documents:
{context}

Question: {query}

Answer:"""

        return prompt

    def get_conversation_response(self, query: str) -> str:
        """Get a simple text response for conversation interface."""
        result = self.generate_answer(query)

        answer = result['answer']
        sources = result.get('sources', [])

        # Append at most the top three citations to keep replies short.
        if sources:
            answer += "\n\n**Sources:**\n"
            for i, source in enumerate(sources[:3], 1):
                answer += f"{i}. {source['source_file']} (Page {source['page_number']})\n"

        return answer
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
gradio                  # web UI framework (app.py)
PyMuPDF                 # PDF text extraction, imported as fitz (pdf_processor.py)
sentence-transformers   # embedding model (vector_store.py)
faiss-cpu               # vector similarity index (vector_store.py)
google-generativeai     # Gemini LLM client (rag_engine.py)
python-dotenv           # .env loading (config.py)
numpy                   # embedding array handling (vector_store.py)
vector_db/.DS_Store ADDED
Binary file (6.15 kB). View file
 
vector_store.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import pickle
4
+ import os
5
+ from typing import List, Dict, Any, Tuple
6
+ from sentence_transformers import SentenceTransformer
7
+ from pdf_processor import DocumentChunk
8
+
9
class VectorStore:
    """FAISS-backed store of chunk embeddings with on-disk persistence.

    Uses IndexFlatIP over L2-normalized vectors, so the inner product
    equals cosine similarity. The index and its parallel metadata list are
    persisted under ``vector_db_path`` and reloaded on construction.
    """

    def __init__(self, model_name: str, vector_db_path: str):
        # Sentence-transformers model used for both documents and queries.
        self.model = SentenceTransformer(model_name)
        self.vector_db_path = vector_db_path
        self.index_path = os.path.join(vector_db_path, 'faiss_index.bin')
        self.metadata_path = os.path.join(vector_db_path, 'metadata.pkl')

        self.index = None   # faiss.IndexFlatIP, created lazily on first add
        self.metadata = []  # one dict per stored chunk, parallel to index rows
        self.load_index()

    def load_index(self):
        """Load existing FAISS index and metadata, or start empty."""
        try:
            if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
                self.index = faiss.read_index(self.index_path)
                # NOTE(review): pickle is acceptable only because this file
                # is written by save_index() below; never load untrusted pickles.
                with open(self.metadata_path, 'rb') as f:
                    self.metadata = pickle.load(f)

                print(f"Loaded existing index with {len(self.metadata)} documents")
            else:
                print("No existing index found. Will create new one.")
        except Exception as e:
            # Corrupt files fall back to an empty store rather than crashing.
            print(f"Error loading index: {e}")
            self.index = None
            self.metadata = []

    def add_documents(self, chunks: List[DocumentChunk]):
        """Embed chunks, add them to the index, and persist to disk."""
        if not chunks:
            return

        texts = [chunk.content for chunk in chunks]
        embeddings = self.model.encode(texts, convert_to_tensor=False)
        embeddings = np.array(embeddings).astype('float32')

        if self.index is None:
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatIP(dimension)

        # BUGFIX: normalize EVERY batch, not only the one that creates the
        # index — IndexFlatIP scores raw inner products, so unnormalized
        # batches would get incomparable similarity scores at search time.
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)  # type: ignore

        # Keep the metadata list aligned row-for-row with the index.
        for chunk in chunks:
            self.metadata.append({
                'content': chunk.content,
                'metadata': chunk.metadata,
                'page_number': chunk.page_number,
                'source_file': chunk.source_file
            })

        self.save_index()
        print(f"Added {len(chunks)} chunks to vector store")

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to top_k metadata dicts with 'similarity_score' added."""
        if self.index is None or len(self.metadata) == 0:
            return []

        query_embedding = self.model.encode([query], convert_to_tensor=False)
        query_embedding = np.array(query_embedding).astype('float32')
        # Normalize the query so inner product == cosine similarity.
        faiss.normalize_L2(query_embedding)
        scores, indices = self.index.search(query_embedding, min(top_k, len(self.metadata)))  # type: ignore

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx != -1:  # FAISS pads with -1 when fewer results exist
                result = self.metadata[idx].copy()
                result['similarity_score'] = float(score)
                results.append(result)

        return results

    def save_index(self):
        """Save FAISS index and metadata to disk (best effort, never raises)."""
        try:
            if self.index is not None:
                faiss.write_index(self.index, self.index_path)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.metadata, f)

        except Exception as e:
            print(f"Error saving index: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the vector store."""
        if self.index is None:
            return {'total_documents': 0, 'index_size': 0}

        return {
            'total_documents': len(self.metadata),
            'index_size': self.index.ntotal,
            'dimension': self.index.d
        }

    def clear_index(self):
        """Drop the in-memory index/metadata and delete the on-disk files."""
        self.index = None
        self.metadata = []
        if os.path.exists(self.index_path):
            os.remove(self.index_path)
        if os.path.exists(self.metadata_path):
            os.remove(self.metadata_path)

        print("Index cleared successfully")