Update core/rag_system.py
Browse files- core/rag_system.py +84 -81
core/rag_system.py
CHANGED
|
@@ -2,9 +2,11 @@ import numpy as np
|
|
| 2 |
import faiss
|
| 3 |
from typing import List, Dict, Optional
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class EnhancedRAGSystem:
|
| 10 |
def __init__(self):
|
|
@@ -13,13 +15,12 @@ class EnhancedRAGSystem:
|
|
| 13 |
self.embeddings: Optional[np.ndarray] = None
|
| 14 |
self.index: Optional[faiss.Index] = None
|
| 15 |
|
| 16 |
-
# Multilingual support
|
| 17 |
-
self.
|
| 18 |
-
self.current_dimension = settings.EMBEDDING_DIMENSION
|
| 19 |
|
| 20 |
-
self._initialize_sample_data()
|
| 21 |
|
| 22 |
-
def _initialize_sample_data(self):
|
| 23 |
"""Khởi tạo dữ liệu mẫu"""
|
| 24 |
# Vietnamese sample data
|
| 25 |
vietnamese_data = [
|
|
@@ -43,7 +44,7 @@ class EnhancedRAGSystem:
|
|
| 43 |
"The United States has diverse climate zones from tropical to arctic"
|
| 44 |
]
|
| 45 |
|
| 46 |
-
# Vietnamese metadata
|
| 47 |
vietnamese_metadatas = [
|
| 48 |
{"type": "nutrition", "source": "sample", "language": "vi"},
|
| 49 |
{"type": "nutrition", "source": "sample", "language": "vi"},
|
|
@@ -54,7 +55,7 @@ class EnhancedRAGSystem:
|
|
| 54 |
{"type": "geography", "source": "sample", "language": "vi"}
|
| 55 |
]
|
| 56 |
|
| 57 |
-
# English metadata
|
| 58 |
english_metadatas = [
|
| 59 |
{"type": "nutrition", "source": "sample", "language": "en"},
|
| 60 |
{"type": "nutrition", "source": "sample", "language": "en"},
|
|
@@ -69,6 +70,15 @@ class EnhancedRAGSystem:
|
|
| 69 |
self.add_documents(vietnamese_data, vietnamese_metadatas)
|
| 70 |
self.add_documents(english_data, english_metadatas)
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
|
| 73 |
"""Thêm documents vào database - ĐÃ SỬA LỖI"""
|
| 74 |
print(f"🔄 RAG System: Bắt đầu thêm {len(documents)} documents...")
|
|
@@ -97,11 +107,11 @@ class EnhancedRAGSystem:
|
|
| 97 |
valid_metadatas = []
|
| 98 |
|
| 99 |
for i, doc in enumerate(documents):
|
| 100 |
-
if doc and isinstance(doc, str) and len(doc.strip()) >
|
| 101 |
valid_documents.append(doc.strip())
|
| 102 |
valid_metadatas.append(metadatas[i] if i < len(metadatas) else {})
|
| 103 |
else:
|
| 104 |
-
print(f"⚠️ Bỏ qua document {i}: không hợp lệ")
|
| 105 |
|
| 106 |
print(f"📊 Documents hợp lệ: {len(valid_documents)}/{len(documents)}")
|
| 107 |
|
|
@@ -110,30 +120,36 @@ class EnhancedRAGSystem:
|
|
| 110 |
return
|
| 111 |
|
| 112 |
# Create embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
new_embeddings_list = []
|
| 114 |
successful_embeddings = 0
|
| 115 |
|
| 116 |
for i, doc in enumerate(valid_documents):
|
| 117 |
try:
|
| 118 |
-
|
| 119 |
-
embedding_model = self.multilingual_manager.get_embedding_model(language)
|
| 120 |
-
|
| 121 |
-
if embedding_model is None:
|
| 122 |
-
print(f"⚠️ Không có embedding model cho document {i}")
|
| 123 |
-
continue
|
| 124 |
-
|
| 125 |
-
# Create embedding
|
| 126 |
doc_embedding = embedding_model.encode([doc])
|
| 127 |
new_embeddings_list.append(doc_embedding[0])
|
| 128 |
successful_embeddings += 1
|
| 129 |
|
|
|
|
|
|
|
|
|
|
| 130 |
except Exception as e:
|
| 131 |
print(f"❌ Lỗi embedding document {i}: {e}")
|
|
|
|
|
|
|
| 132 |
|
| 133 |
print(f"📊 Embeddings thành công: {successful_embeddings}/{len(valid_documents)}")
|
| 134 |
|
| 135 |
if not new_embeddings_list:
|
| 136 |
-
print("❌ Không tạo được embeddings nào")
|
|
|
|
| 137 |
return
|
| 138 |
|
| 139 |
# Convert to numpy array
|
|
@@ -142,6 +158,7 @@ class EnhancedRAGSystem:
|
|
| 142 |
print(f"✅ Embedding matrix shape: {new_embeddings.shape}")
|
| 143 |
except Exception as e:
|
| 144 |
print(f"❌ Lỗi tạo embedding matrix: {e}")
|
|
|
|
| 145 |
return
|
| 146 |
|
| 147 |
# Handle existing embeddings
|
|
@@ -159,19 +176,26 @@ class EnhancedRAGSystem:
|
|
| 159 |
# Check dimension compatibility
|
| 160 |
if self.embeddings.shape[1] != new_embeddings.shape[1]:
|
| 161 |
print(f"⚠️ Dimension mismatch: {self.embeddings.shape[1]} vs {new_embeddings.shape[1]}")
|
| 162 |
-
|
| 163 |
-
self.embeddings
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
except Exception as e:
|
| 174 |
print(f"❌ Lỗi khi thêm vào system: {e}")
|
|
|
|
| 175 |
return
|
| 176 |
|
| 177 |
# Update FAISS index
|
|
@@ -181,9 +205,16 @@ class EnhancedRAGSystem:
|
|
| 181 |
print(f"🎉 THÀNH CÔNG: Đã thêm {new_doc_count - old_doc_count} documents mới")
|
| 182 |
print(f"📊 Tổng documents: {new_doc_count}")
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
def _update_faiss_index(self):
|
| 185 |
"""Cập nhật FAISS index với embeddings hiện tại"""
|
| 186 |
if self.embeddings is None or len(self.embeddings) == 0:
|
|
|
|
| 187 |
return
|
| 188 |
|
| 189 |
try:
|
|
@@ -198,23 +229,20 @@ class EnhancedRAGSystem:
|
|
| 198 |
except Exception as e:
|
| 199 |
print(f"❌ Lỗi cập nhật FAISS index: {e}")
|
| 200 |
|
| 201 |
-
def semantic_search(self, query: str, top_k: int =
|
| 202 |
-
"""Tìm kiếm ngữ nghĩa
|
| 203 |
if top_k is None:
|
| 204 |
-
top_k =
|
| 205 |
|
| 206 |
if not self.documents or self.index is None:
|
| 207 |
return self._fallback_keyword_search(query, top_k)
|
| 208 |
|
| 209 |
-
|
| 210 |
-
query_language = self.multilingual_manager.detect_language(query)
|
| 211 |
-
embedding_model = self.multilingual_manager.get_embedding_model(query_language)
|
| 212 |
-
|
| 213 |
if embedding_model is None:
|
| 214 |
return self._fallback_keyword_search(query, top_k)
|
| 215 |
|
| 216 |
try:
|
| 217 |
-
# Encode query
|
| 218 |
query_embedding = embedding_model.encode([query])
|
| 219 |
|
| 220 |
# Normalize query embedding for cosine similarity
|
|
@@ -229,52 +257,27 @@ class EnhancedRAGSystem:
|
|
| 229 |
results = []
|
| 230 |
for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
|
| 231 |
if idx < len(self.documents):
|
| 232 |
-
results.append(
|
| 233 |
-
id
|
| 234 |
-
text
|
| 235 |
-
similarity
|
| 236 |
-
metadata
|
| 237 |
-
)
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
print(f"🔍 Tìm kiếm '{query[:50]}...' (ngôn ngữ: {query_language}) - Tìm thấy {len(filtered_results)} kết quả")
|
| 243 |
-
return filtered_results
|
| 244 |
|
| 245 |
except Exception as e:
|
| 246 |
print(f"❌ Lỗi tìm kiếm ngữ nghĩa: {e}")
|
| 247 |
return self._fallback_keyword_search(query, top_k)
|
| 248 |
|
| 249 |
-
def
|
| 250 |
-
"""Lọc kết quả theo độ liên quan ngôn ngữ"""
|
| 251 |
-
if not results:
|
| 252 |
-
return results
|
| 253 |
-
|
| 254 |
-
# Boost scores for documents in the same language
|
| 255 |
-
for result in results:
|
| 256 |
-
doc_language = result.metadata.get('language', 'vi')
|
| 257 |
-
if doc_language == query_language:
|
| 258 |
-
# Boost similarity score for same language documents
|
| 259 |
-
result.similarity = min(result.similarity * 1.2, 1.0)
|
| 260 |
-
|
| 261 |
-
# Re-sort by updated similarity scores
|
| 262 |
-
results.sort(key=lambda x: x.similarity, reverse=True)
|
| 263 |
-
return results
|
| 264 |
-
|
| 265 |
-
def _fallback_keyword_search(self, query: str, top_k: int) -> List[RAGSearchResult]:
|
| 266 |
"""Tìm kiếm dự phòng dựa trên từ khóa"""
|
| 267 |
query_lower = query.lower()
|
| 268 |
results = []
|
| 269 |
|
| 270 |
for i, doc in enumerate(self.documents):
|
| 271 |
score = 0
|
| 272 |
-
doc_language = self.metadatas[i].get('language', 'vi') if i < len(self.metadatas) else 'vi'
|
| 273 |
-
query_language = self.multilingual_manager.detect_language(query)
|
| 274 |
-
|
| 275 |
-
# Language matching bonus
|
| 276 |
-
if doc_language == query_language:
|
| 277 |
-
score += 0.5
|
| 278 |
|
| 279 |
# Keyword matching
|
| 280 |
for word in query_lower.split():
|
|
@@ -282,18 +285,18 @@ class EnhancedRAGSystem:
|
|
| 282 |
score += 1
|
| 283 |
|
| 284 |
if score > 0:
|
| 285 |
-
results.append(
|
| 286 |
-
id
|
| 287 |
-
text
|
| 288 |
-
similarity
|
| 289 |
-
metadata
|
| 290 |
-
)
|
| 291 |
|
| 292 |
-
results.sort(key=lambda x: x
|
| 293 |
return results[:top_k]
|
| 294 |
|
| 295 |
def get_collection_stats(self) -> Dict:
|
| 296 |
-
"""Lấy thống kê collection
|
| 297 |
language_stats = {}
|
| 298 |
for metadata in self.metadatas:
|
| 299 |
lang = metadata.get('language', 'unknown')
|
|
|
|
| 2 |
import faiss
|
| 3 |
from typing import List, Dict, Optional
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from typing import List
|
| 9 |
+
import traceback
|
| 10 |
|
| 11 |
class EnhancedRAGSystem:
|
| 12 |
def __init__(self):
|
|
|
|
| 15 |
self.embeddings: Optional[np.ndarray] = None
|
| 16 |
self.index: Optional[faiss.Index] = None
|
| 17 |
|
| 18 |
+
# Multilingual support - simplified for now
|
| 19 |
+
self.current_dimension = 384 # Default dimension
|
|
|
|
| 20 |
|
| 21 |
+
self._initialize_sample_data()
|
| 22 |
|
| 23 |
+
def _initialize_sample_data(self):
|
| 24 |
"""Khởi tạo dữ liệu mẫu"""
|
| 25 |
# Vietnamese sample data
|
| 26 |
vietnamese_data = [
|
|
|
|
| 44 |
"The United States has diverse climate zones from tropical to arctic"
|
| 45 |
]
|
| 46 |
|
| 47 |
+
# Vietnamese metadata
|
| 48 |
vietnamese_metadatas = [
|
| 49 |
{"type": "nutrition", "source": "sample", "language": "vi"},
|
| 50 |
{"type": "nutrition", "source": "sample", "language": "vi"},
|
|
|
|
| 55 |
{"type": "geography", "source": "sample", "language": "vi"}
|
| 56 |
]
|
| 57 |
|
| 58 |
+
# English metadata
|
| 59 |
english_metadatas = [
|
| 60 |
{"type": "nutrition", "source": "sample", "language": "en"},
|
| 61 |
{"type": "nutrition", "source": "sample", "language": "en"},
|
|
|
|
| 70 |
self.add_documents(vietnamese_data, vietnamese_metadatas)
|
| 71 |
self.add_documents(english_data, english_metadatas)
|
| 72 |
|
| 73 |
+
def _get_embedding_model(self):
|
| 74 |
+
"""Lấy embedding model - simplified version"""
|
| 75 |
+
try:
|
| 76 |
+
# Sử dụng model nhỏ để tiết kiệm bộ nhớ
|
| 77 |
+
return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 78 |
+
except Exception as e:
|
| 79 |
+
print(f"❌ Lỗi load embedding model: {e}")
|
| 80 |
+
return None
|
| 81 |
+
|
| 82 |
def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
|
| 83 |
"""Thêm documents vào database - ĐÃ SỬA LỖI"""
|
| 84 |
print(f"🔄 RAG System: Bắt đầu thêm {len(documents)} documents...")
|
|
|
|
| 107 |
valid_metadatas = []
|
| 108 |
|
| 109 |
for i, doc in enumerate(documents):
|
| 110 |
+
if doc and isinstance(doc, str) and len(doc.strip()) > 3: # Giảm độ dài tối thiểu
|
| 111 |
valid_documents.append(doc.strip())
|
| 112 |
valid_metadatas.append(metadatas[i] if i < len(metadatas) else {})
|
| 113 |
else:
|
| 114 |
+
print(f"⚠️ Bỏ qua document {i}: không hợp lệ - '{doc}'")
|
| 115 |
|
| 116 |
print(f"📊 Documents hợp lệ: {len(valid_documents)}/{len(documents)}")
|
| 117 |
|
|
|
|
| 120 |
return
|
| 121 |
|
| 122 |
# Create embeddings
|
| 123 |
+
embedding_model = self._get_embedding_model()
|
| 124 |
+
if embedding_model is None:
|
| 125 |
+
print("❌ Không thể tạo embedding model")
|
| 126 |
+
# Vẫn thêm documents không có embedding
|
| 127 |
+
self._add_documents_without_embeddings(valid_documents, valid_metadatas)
|
| 128 |
+
return
|
| 129 |
+
|
| 130 |
new_embeddings_list = []
|
| 131 |
successful_embeddings = 0
|
| 132 |
|
| 133 |
for i, doc in enumerate(valid_documents):
|
| 134 |
try:
|
| 135 |
+
# Create embedding - sử dụng model duy nhất
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
doc_embedding = embedding_model.encode([doc])
|
| 137 |
new_embeddings_list.append(doc_embedding[0])
|
| 138 |
successful_embeddings += 1
|
| 139 |
|
| 140 |
+
if i % 10 == 0: # Log tiến độ
|
| 141 |
+
print(f"📊 Đã embedding {i+1}/{len(valid_documents)} documents")
|
| 142 |
+
|
| 143 |
except Exception as e:
|
| 144 |
print(f"❌ Lỗi embedding document {i}: {e}")
|
| 145 |
+
# Thêm document không có embedding
|
| 146 |
+
new_embeddings_list.append(np.zeros(self.current_dimension))
|
| 147 |
|
| 148 |
print(f"📊 Embeddings thành công: {successful_embeddings}/{len(valid_documents)}")
|
| 149 |
|
| 150 |
if not new_embeddings_list:
|
| 151 |
+
print("❌ Không tạo được embeddings nào, thêm documents không embedding")
|
| 152 |
+
self._add_documents_without_embeddings(valid_documents, valid_metadatas)
|
| 153 |
return
|
| 154 |
|
| 155 |
# Convert to numpy array
|
|
|
|
| 158 |
print(f"✅ Embedding matrix shape: {new_embeddings.shape}")
|
| 159 |
except Exception as e:
|
| 160 |
print(f"❌ Lỗi tạo embedding matrix: {e}")
|
| 161 |
+
self._add_documents_without_embeddings(valid_documents, valid_metadatas)
|
| 162 |
return
|
| 163 |
|
| 164 |
# Handle existing embeddings
|
|
|
|
| 176 |
# Check dimension compatibility
|
| 177 |
if self.embeddings.shape[1] != new_embeddings.shape[1]:
|
| 178 |
print(f"⚠️ Dimension mismatch: {self.embeddings.shape[1]} vs {new_embeddings.shape[1]}")
|
| 179 |
+
# Resize embeddings để phù hợp
|
| 180 |
+
if self.embeddings.shape[1] < new_embeddings.shape[1]:
|
| 181 |
+
# Pad existing embeddings
|
| 182 |
+
pad_width = new_embeddings.shape[1] - self.embeddings.shape[1]
|
| 183 |
+
self.embeddings = np.pad(self.embeddings, ((0,0), (0,pad_width)))
|
| 184 |
+
else:
|
| 185 |
+
# Truncate new embeddings
|
| 186 |
+
new_embeddings = new_embeddings[:, :self.embeddings.shape[1]]
|
| 187 |
+
|
| 188 |
+
print("🔄 Đã điều chỉnh dimension")
|
| 189 |
+
|
| 190 |
+
# Compatible dimensions, append
|
| 191 |
+
self.embeddings = np.vstack([self.embeddings, new_embeddings])
|
| 192 |
+
self.documents.extend(valid_documents)
|
| 193 |
+
self.metadatas.extend(valid_metadatas)
|
| 194 |
+
print("✅ Đã thêm vào system hiện có")
|
| 195 |
|
| 196 |
except Exception as e:
|
| 197 |
print(f"❌ Lỗi khi thêm vào system: {e}")
|
| 198 |
+
self._add_documents_without_embeddings(valid_documents, valid_metadatas)
|
| 199 |
return
|
| 200 |
|
| 201 |
# Update FAISS index
|
|
|
|
| 205 |
print(f"🎉 THÀNH CÔNG: Đã thêm {new_doc_count - old_doc_count} documents mới")
|
| 206 |
print(f"📊 Tổng documents: {new_doc_count}")
|
| 207 |
|
| 208 |
+
def _add_documents_without_embeddings(self, documents: List[str], metadatas: List[Dict]):
|
| 209 |
+
"""Thêm documents không có embeddings (fallback)"""
|
| 210 |
+
self.documents.extend(documents)
|
| 211 |
+
self.metadatas.extend(metadatas)
|
| 212 |
+
print(f"✅ Đã thêm {len(documents)} documents không có embeddings")
|
| 213 |
+
|
| 214 |
def _update_faiss_index(self):
|
| 215 |
"""Cập nhật FAISS index với embeddings hiện tại"""
|
| 216 |
if self.embeddings is None or len(self.embeddings) == 0:
|
| 217 |
+
print("⚠️ Không có embeddings để cập nhật index")
|
| 218 |
return
|
| 219 |
|
| 220 |
try:
|
|
|
|
| 229 |
except Exception as e:
|
| 230 |
print(f"❌ Lỗi cập nhật FAISS index: {e}")
|
| 231 |
|
| 232 |
+
def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
|
| 233 |
+
"""Tìm kiếm ngữ nghĩa - simplified version"""
|
| 234 |
if top_k is None:
|
| 235 |
+
top_k = 5
|
| 236 |
|
| 237 |
if not self.documents or self.index is None:
|
| 238 |
return self._fallback_keyword_search(query, top_k)
|
| 239 |
|
| 240 |
+
embedding_model = self._get_embedding_model()
|
|
|
|
|
|
|
|
|
|
| 241 |
if embedding_model is None:
|
| 242 |
return self._fallback_keyword_search(query, top_k)
|
| 243 |
|
| 244 |
try:
|
| 245 |
+
# Encode query
|
| 246 |
query_embedding = embedding_model.encode([query])
|
| 247 |
|
| 248 |
# Normalize query embedding for cosine similarity
|
|
|
|
| 257 |
results = []
|
| 258 |
for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
|
| 259 |
if idx < len(self.documents):
|
| 260 |
+
results.append({
|
| 261 |
+
"id": str(idx),
|
| 262 |
+
"text": self.documents[idx],
|
| 263 |
+
"similarity": float(similarity),
|
| 264 |
+
"metadata": self.metadatas[idx] if idx < len(self.metadatas) else {}
|
| 265 |
+
})
|
| 266 |
|
| 267 |
+
print(f"🔍 Tìm kiếm '{query[:50]}...' - Tìm thấy {len(results)} kết quả")
|
| 268 |
+
return results
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
except Exception as e:
|
| 271 |
print(f"❌ Lỗi tìm kiếm ngữ nghĩa: {e}")
|
| 272 |
return self._fallback_keyword_search(query, top_k)
|
| 273 |
|
| 274 |
+
def _fallback_keyword_search(self, query: str, top_k: int) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
"""Tìm kiếm dự phòng dựa trên từ khóa"""
|
| 276 |
query_lower = query.lower()
|
| 277 |
results = []
|
| 278 |
|
| 279 |
for i, doc in enumerate(self.documents):
|
| 280 |
score = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
# Keyword matching
|
| 283 |
for word in query_lower.split():
|
|
|
|
| 285 |
score += 1
|
| 286 |
|
| 287 |
if score > 0:
|
| 288 |
+
results.append({
|
| 289 |
+
"id": str(i),
|
| 290 |
+
"text": doc,
|
| 291 |
+
"similarity": min(score / 5, 1.0),
|
| 292 |
+
"metadata": self.metadatas[i] if i < len(self.metadatas) else {}
|
| 293 |
+
})
|
| 294 |
|
| 295 |
+
results.sort(key=lambda x: x["similarity"], reverse=True)
|
| 296 |
return results[:top_k]
|
| 297 |
|
| 298 |
def get_collection_stats(self) -> Dict:
|
| 299 |
+
"""Lấy thống kê collection"""
|
| 300 |
language_stats = {}
|
| 301 |
for metadata in self.metadatas:
|
| 302 |
lang = metadata.get('language', 'unknown')
|