datbkpro committed on
Commit
96b3a3f
·
verified ·
1 Parent(s): 2298900

Update core/rag_system.py

Browse files
Files changed (1) hide show
  1. core/rag_system.py +84 -81
core/rag_system.py CHANGED
@@ -2,9 +2,11 @@ import numpy as np
2
  import faiss
3
  from typing import List, Dict, Optional
4
  from sentence_transformers import SentenceTransformer
5
- from models.schemas import RAGSearchResult
6
- from config.settings import settings
7
- from core.multilingual_manager import MultilingualManager
 
 
8
 
9
  class EnhancedRAGSystem:
10
  def __init__(self):
@@ -13,13 +15,12 @@ class EnhancedRAGSystem:
13
  self.embeddings: Optional[np.ndarray] = None
14
  self.index: Optional[faiss.Index] = None
15
 
16
- # Multilingual support
17
- self.multilingual_manager = MultilingualManager()
18
- self.current_dimension = settings.EMBEDDING_DIMENSION
19
 
20
- self._initialize_sample_data() # SỬA TÊN HÀM
21
 
22
- def _initialize_sample_data(self): # SỬA TÊN HÀM
23
  """Khởi tạo dữ liệu mẫu"""
24
  # Vietnamese sample data
25
  vietnamese_data = [
@@ -43,7 +44,7 @@ class EnhancedRAGSystem:
43
  "The United States has diverse climate zones from tropical to arctic"
44
  ]
45
 
46
- # Vietnamese metadata - SỬA LỖI SYNTAX
47
  vietnamese_metadatas = [
48
  {"type": "nutrition", "source": "sample", "language": "vi"},
49
  {"type": "nutrition", "source": "sample", "language": "vi"},
@@ -54,7 +55,7 @@ class EnhancedRAGSystem:
54
  {"type": "geography", "source": "sample", "language": "vi"}
55
  ]
56
 
57
- # English metadata - SỬA LỖI SYNTAX
58
  english_metadatas = [
59
  {"type": "nutrition", "source": "sample", "language": "en"},
60
  {"type": "nutrition", "source": "sample", "language": "en"},
@@ -69,6 +70,15 @@ class EnhancedRAGSystem:
69
  self.add_documents(vietnamese_data, vietnamese_metadatas)
70
  self.add_documents(english_data, english_metadatas)
71
 
 
 
 
 
 
 
 
 
 
72
  def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
73
  """Thêm documents vào database - ĐÃ SỬA LỖI"""
74
  print(f"🔄 RAG System: Bắt đầu thêm {len(documents)} documents...")
@@ -97,11 +107,11 @@ class EnhancedRAGSystem:
97
  valid_metadatas = []
98
 
99
  for i, doc in enumerate(documents):
100
- if doc and isinstance(doc, str) and len(doc.strip()) > 5: # At least 5 characters
101
  valid_documents.append(doc.strip())
102
  valid_metadatas.append(metadatas[i] if i < len(metadatas) else {})
103
  else:
104
- print(f"⚠️ Bỏ qua document {i}: không hợp lệ")
105
 
106
  print(f"📊 Documents hợp lệ: {len(valid_documents)}/{len(documents)}")
107
 
@@ -110,30 +120,36 @@ class EnhancedRAGSystem:
110
  return
111
 
112
  # Create embeddings
 
 
 
 
 
 
 
113
  new_embeddings_list = []
114
  successful_embeddings = 0
115
 
116
  for i, doc in enumerate(valid_documents):
117
  try:
118
- language = valid_metadatas[i].get('language', 'vi')
119
- embedding_model = self.multilingual_manager.get_embedding_model(language)
120
-
121
- if embedding_model is None:
122
- print(f"⚠️ Không có embedding model cho document {i}")
123
- continue
124
-
125
- # Create embedding
126
  doc_embedding = embedding_model.encode([doc])
127
  new_embeddings_list.append(doc_embedding[0])
128
  successful_embeddings += 1
129
 
 
 
 
130
  except Exception as e:
131
  print(f"❌ Lỗi embedding document {i}: {e}")
 
 
132
 
133
  print(f"📊 Embeddings thành công: {successful_embeddings}/{len(valid_documents)}")
134
 
135
  if not new_embeddings_list:
136
- print("❌ Không tạo được embeddings nào")
 
137
  return
138
 
139
  # Convert to numpy array
@@ -142,6 +158,7 @@ class EnhancedRAGSystem:
142
  print(f"✅ Embedding matrix shape: {new_embeddings.shape}")
143
  except Exception as e:
144
  print(f"❌ Lỗi tạo embedding matrix: {e}")
 
145
  return
146
 
147
  # Handle existing embeddings
@@ -159,19 +176,26 @@ class EnhancedRAGSystem:
159
  # Check dimension compatibility
160
  if self.embeddings.shape[1] != new_embeddings.shape[1]:
161
  print(f"⚠️ Dimension mismatch: {self.embeddings.shape[1]} vs {new_embeddings.shape[1]}")
162
- print("🔄 Tạo system mới do dimension không khớp")
163
- self.embeddings = new_embeddings
164
- self.documents = valid_documents
165
- self.metadatas = valid_metadatas
166
- else:
167
- # Compatible dimensions, append
168
- self.embeddings = np.vstack([self.embeddings, new_embeddings])
169
- self.documents.extend(valid_documents)
170
- self.metadatas.extend(valid_metadatas)
171
- print(" Đã thêm vào system hiện có")
 
 
 
 
 
 
172
 
173
  except Exception as e:
174
  print(f"❌ Lỗi khi thêm vào system: {e}")
 
175
  return
176
 
177
  # Update FAISS index
@@ -181,9 +205,16 @@ class EnhancedRAGSystem:
181
  print(f"🎉 THÀNH CÔNG: Đã thêm {new_doc_count - old_doc_count} documents mới")
182
  print(f"📊 Tổng documents: {new_doc_count}")
183
 
 
 
 
 
 
 
184
  def _update_faiss_index(self):
185
  """Cập nhật FAISS index với embeddings hiện tại"""
186
  if self.embeddings is None or len(self.embeddings) == 0:
 
187
  return
188
 
189
  try:
@@ -198,23 +229,20 @@ class EnhancedRAGSystem:
198
  except Exception as e:
199
  print(f"❌ Lỗi cập nhật FAISS index: {e}")
200
 
201
- def semantic_search(self, query: str, top_k: int = None) -> List[RAGSearchResult]:
202
- """Tìm kiếm ngữ nghĩa với model phù hợp theo ngôn ngữ"""
203
  if top_k is None:
204
- top_k = settings.TOP_K_RESULTS
205
 
206
  if not self.documents or self.index is None:
207
  return self._fallback_keyword_search(query, top_k)
208
 
209
- # Detect query language and get appropriate model
210
- query_language = self.multilingual_manager.detect_language(query)
211
- embedding_model = self.multilingual_manager.get_embedding_model(query_language)
212
-
213
  if embedding_model is None:
214
  return self._fallback_keyword_search(query, top_k)
215
 
216
  try:
217
- # Encode query with appropriate model
218
  query_embedding = embedding_model.encode([query])
219
 
220
  # Normalize query embedding for cosine similarity
@@ -229,52 +257,27 @@ class EnhancedRAGSystem:
229
  results = []
230
  for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
231
  if idx < len(self.documents):
232
- results.append(RAGSearchResult(
233
- id=str(idx),
234
- text=self.documents[idx],
235
- similarity=float(similarity),
236
- metadata=self.metadatas[idx] if idx < len(self.metadatas) else {}
237
- ))
238
 
239
- # Filter results by language relevance
240
- filtered_results = self._filter_by_language_relevance(results, query_language)
241
-
242
- print(f"🔍 Tìm kiếm '{query[:50]}...' (ngôn ngữ: {query_language}) - Tìm thấy {len(filtered_results)} kết quả")
243
- return filtered_results
244
 
245
  except Exception as e:
246
  print(f"❌ Lỗi tìm kiếm ngữ nghĩa: {e}")
247
  return self._fallback_keyword_search(query, top_k)
248
 
249
- def _filter_by_language_relevance(self, results: List[RAGSearchResult], query_language: str) -> List[RAGSearchResult]:
250
- """Lọc kết quả theo độ liên quan ngôn ngữ"""
251
- if not results:
252
- return results
253
-
254
- # Boost scores for documents in the same language
255
- for result in results:
256
- doc_language = result.metadata.get('language', 'vi')
257
- if doc_language == query_language:
258
- # Boost similarity score for same language documents
259
- result.similarity = min(result.similarity * 1.2, 1.0)
260
-
261
- # Re-sort by updated similarity scores
262
- results.sort(key=lambda x: x.similarity, reverse=True)
263
- return results
264
-
265
- def _fallback_keyword_search(self, query: str, top_k: int) -> List[RAGSearchResult]:
266
  """Tìm kiếm dự phòng dựa trên từ khóa"""
267
  query_lower = query.lower()
268
  results = []
269
 
270
  for i, doc in enumerate(self.documents):
271
  score = 0
272
- doc_language = self.metadatas[i].get('language', 'vi') if i < len(self.metadatas) else 'vi'
273
- query_language = self.multilingual_manager.detect_language(query)
274
-
275
- # Language matching bonus
276
- if doc_language == query_language:
277
- score += 0.5
278
 
279
  # Keyword matching
280
  for word in query_lower.split():
@@ -282,18 +285,18 @@ class EnhancedRAGSystem:
282
  score += 1
283
 
284
  if score > 0:
285
- results.append(RAGSearchResult(
286
- id=str(i),
287
- text=doc,
288
- similarity=min(score / 5, 1.0),
289
- metadata=self.metadatas[i] if i < len(self.metadatas) else {}
290
- ))
291
 
292
- results.sort(key=lambda x: x.similarity, reverse=True)
293
  return results[:top_k]
294
 
295
  def get_collection_stats(self) -> Dict:
296
- """Lấy thống kê collection với thông tin đa ngôn ngữ"""
297
  language_stats = {}
298
  for metadata in self.metadatas:
299
  lang = metadata.get('language', 'unknown')
 
2
  import faiss
3
  from typing import List, Dict, Optional
4
  from sentence_transformers import SentenceTransformer
5
+ import os
6
+ import json
7
+ import pandas as pd
8
+ from typing import List
9
+ import traceback
10
 
11
  class EnhancedRAGSystem:
12
  def __init__(self):
 
15
  self.embeddings: Optional[np.ndarray] = None
16
  self.index: Optional[faiss.Index] = None
17
 
18
+ # Multilingual support - simplified for now
19
+ self.current_dimension = 384 # Default dimension
 
20
 
21
+ self._initialize_sample_data()
22
 
23
+ def _initialize_sample_data(self):
24
  """Khởi tạo dữ liệu mẫu"""
25
  # Vietnamese sample data
26
  vietnamese_data = [
 
44
  "The United States has diverse climate zones from tropical to arctic"
45
  ]
46
 
47
+ # Vietnamese metadata
48
  vietnamese_metadatas = [
49
  {"type": "nutrition", "source": "sample", "language": "vi"},
50
  {"type": "nutrition", "source": "sample", "language": "vi"},
 
55
  {"type": "geography", "source": "sample", "language": "vi"}
56
  ]
57
 
58
+ # English metadata
59
  english_metadatas = [
60
  {"type": "nutrition", "source": "sample", "language": "en"},
61
  {"type": "nutrition", "source": "sample", "language": "en"},
 
70
  self.add_documents(vietnamese_data, vietnamese_metadatas)
71
  self.add_documents(english_data, english_metadatas)
72
 
73
+ def _get_embedding_model(self):
74
+ """Lấy embedding model - simplified version"""
75
+ try:
76
+ # Sử dụng model nhỏ để tiết kiệm bộ nhớ
77
+ return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
78
+ except Exception as e:
79
+ print(f"❌ Lỗi load embedding model: {e}")
80
+ return None
81
+
82
  def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
83
  """Thêm documents vào database - ĐÃ SỬA LỖI"""
84
  print(f"🔄 RAG System: Bắt đầu thêm {len(documents)} documents...")
 
107
  valid_metadatas = []
108
 
109
  for i, doc in enumerate(documents):
110
+ if doc and isinstance(doc, str) and len(doc.strip()) > 3: # Giảm độ dài tối thiểu
111
  valid_documents.append(doc.strip())
112
  valid_metadatas.append(metadatas[i] if i < len(metadatas) else {})
113
  else:
114
+ print(f"⚠️ Bỏ qua document {i}: không hợp lệ - '{doc}'")
115
 
116
  print(f"📊 Documents hợp lệ: {len(valid_documents)}/{len(documents)}")
117
 
 
120
  return
121
 
122
  # Create embeddings
123
+ embedding_model = self._get_embedding_model()
124
+ if embedding_model is None:
125
+ print("❌ Không thể tạo embedding model")
126
+ # Vẫn thêm documents không có embedding
127
+ self._add_documents_without_embeddings(valid_documents, valid_metadatas)
128
+ return
129
+
130
  new_embeddings_list = []
131
  successful_embeddings = 0
132
 
133
  for i, doc in enumerate(valid_documents):
134
  try:
135
+ # Create embedding - sử dụng model duy nhất
 
 
 
 
 
 
 
136
  doc_embedding = embedding_model.encode([doc])
137
  new_embeddings_list.append(doc_embedding[0])
138
  successful_embeddings += 1
139
 
140
+ if i % 10 == 0: # Log tiến độ
141
+ print(f"📊 Đã embedding {i+1}/{len(valid_documents)} documents")
142
+
143
  except Exception as e:
144
  print(f"❌ Lỗi embedding document {i}: {e}")
145
+ # Thêm document không có embedding
146
+ new_embeddings_list.append(np.zeros(self.current_dimension))
147
 
148
  print(f"📊 Embeddings thành công: {successful_embeddings}/{len(valid_documents)}")
149
 
150
  if not new_embeddings_list:
151
+ print("❌ Không tạo được embeddings nào, thêm documents không embedding")
152
+ self._add_documents_without_embeddings(valid_documents, valid_metadatas)
153
  return
154
 
155
  # Convert to numpy array
 
158
  print(f"✅ Embedding matrix shape: {new_embeddings.shape}")
159
  except Exception as e:
160
  print(f"❌ Lỗi tạo embedding matrix: {e}")
161
+ self._add_documents_without_embeddings(valid_documents, valid_metadatas)
162
  return
163
 
164
  # Handle existing embeddings
 
176
  # Check dimension compatibility
177
  if self.embeddings.shape[1] != new_embeddings.shape[1]:
178
  print(f"⚠️ Dimension mismatch: {self.embeddings.shape[1]} vs {new_embeddings.shape[1]}")
179
+ # Resize embeddings để phù hợp
180
+ if self.embeddings.shape[1] < new_embeddings.shape[1]:
181
+ # Pad existing embeddings
182
+ pad_width = new_embeddings.shape[1] - self.embeddings.shape[1]
183
+ self.embeddings = np.pad(self.embeddings, ((0,0), (0,pad_width)))
184
+ else:
185
+ # Truncate new embeddings
186
+ new_embeddings = new_embeddings[:, :self.embeddings.shape[1]]
187
+
188
+ print("🔄 Đã điều chỉnh dimension")
189
+
190
+ # Compatible dimensions, append
191
+ self.embeddings = np.vstack([self.embeddings, new_embeddings])
192
+ self.documents.extend(valid_documents)
193
+ self.metadatas.extend(valid_metadatas)
194
+ print("✅ Đã thêm vào system hiện có")
195
 
196
  except Exception as e:
197
  print(f"❌ Lỗi khi thêm vào system: {e}")
198
+ self._add_documents_without_embeddings(valid_documents, valid_metadatas)
199
  return
200
 
201
  # Update FAISS index
 
205
  print(f"🎉 THÀNH CÔNG: Đã thêm {new_doc_count - old_doc_count} documents mới")
206
  print(f"📊 Tổng documents: {new_doc_count}")
207
 
208
+ def _add_documents_without_embeddings(self, documents: List[str], metadatas: List[Dict]):
209
+ """Thêm documents không có embeddings (fallback)"""
210
+ self.documents.extend(documents)
211
+ self.metadatas.extend(metadatas)
212
+ print(f"✅ Đã thêm {len(documents)} documents không có embeddings")
213
+
214
  def _update_faiss_index(self):
215
  """Cập nhật FAISS index với embeddings hiện tại"""
216
  if self.embeddings is None or len(self.embeddings) == 0:
217
+ print("⚠️ Không có embeddings để cập nhật index")
218
  return
219
 
220
  try:
 
229
  except Exception as e:
230
  print(f"❌ Lỗi cập nhật FAISS index: {e}")
231
 
232
+ def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
233
+ """Tìm kiếm ngữ nghĩa - simplified version"""
234
  if top_k is None:
235
+ top_k = 5
236
 
237
  if not self.documents or self.index is None:
238
  return self._fallback_keyword_search(query, top_k)
239
 
240
+ embedding_model = self._get_embedding_model()
 
 
 
241
  if embedding_model is None:
242
  return self._fallback_keyword_search(query, top_k)
243
 
244
  try:
245
+ # Encode query
246
  query_embedding = embedding_model.encode([query])
247
 
248
  # Normalize query embedding for cosine similarity
 
257
  results = []
258
  for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
259
  if idx < len(self.documents):
260
+ results.append({
261
+ "id": str(idx),
262
+ "text": self.documents[idx],
263
+ "similarity": float(similarity),
264
+ "metadata": self.metadatas[idx] if idx < len(self.metadatas) else {}
265
+ })
266
 
267
+ print(f"🔍 Tìm kiếm '{query[:50]}...' - Tìm thấy {len(results)} kết quả")
268
+ return results
 
 
 
269
 
270
  except Exception as e:
271
  print(f"❌ Lỗi tìm kiếm ngữ nghĩa: {e}")
272
  return self._fallback_keyword_search(query, top_k)
273
 
274
+ def _fallback_keyword_search(self, query: str, top_k: int) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  """Tìm kiếm dự phòng dựa trên từ khóa"""
276
  query_lower = query.lower()
277
  results = []
278
 
279
  for i, doc in enumerate(self.documents):
280
  score = 0
 
 
 
 
 
 
281
 
282
  # Keyword matching
283
  for word in query_lower.split():
 
285
  score += 1
286
 
287
  if score > 0:
288
+ results.append({
289
+ "id": str(i),
290
+ "text": doc,
291
+ "similarity": min(score / 5, 1.0),
292
+ "metadata": self.metadatas[i] if i < len(self.metadatas) else {}
293
+ })
294
 
295
+ results.sort(key=lambda x: x["similarity"], reverse=True)
296
  return results[:top_k]
297
 
298
  def get_collection_stats(self) -> Dict:
299
+ """Lấy thống kê collection"""
300
  language_stats = {}
301
  for metadata in self.metadatas:
302
  lang = metadata.get('language', 'unknown')