hungnha commited on
Commit
6c0b009
·
1 Parent(s): f337fcd

tach retrival

Browse files
core/embeddings/retrival.py CHANGED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from core.embeddings.vector_store import ChromaVectorDB

# Reranker - sentence_transformers with the BGE cross-encoder model.
# Imported defensively so retrieval still works without the dependency.
try:
    from sentence_transformers import CrossEncoder
    HAS_RERANKER = True
except ImportError:
    HAS_RERANKER = False

logger = logging.getLogger(__name__)

# Candidates pulled from the vector store before re-ranking.
DEFAULT_INITIAL_K = 100
# Results returned to the caller.
DEFAULT_TOP_K = 5
# Max sequence length fed to the cross-encoder per (query, passage) pair.
RERANKER_MAX_LENGTH = 512


class Retriever:
    """Two-stage retriever over a ChromaVectorDB.

    Stage 1 runs a vector similarity search; stage 2 optionally re-ranks
    the candidates with a cross-encoder. When no reranker is available,
    an inverse-vector-distance fallback score is used instead.
    """

    def __init__(
        self,
        vector_db: "ChromaVectorDB",
        reranker_model: str = "BAAI/bge-reranker-v2-m3",
        use_reranker: bool = True,
    ):
        """Bind the retriever to *vector_db*.

        Args:
            vector_db: wrapper exposing a LangChain ``vectorstore`` property.
            reranker_model: HuggingFace model id of the cross-encoder.
            use_reranker: eagerly load the reranker when the dependency exists.
        """
        self._vector_db = vector_db
        self._reranker: Optional[Any] = None
        self._reranker_model = reranker_model

        if use_reranker:
            if HAS_RERANKER:
                self._load_reranker(reranker_model)
            else:
                # Previously this failed silently; warn so the degraded
                # scoring mode is visible in logs.
                logger.warning(
                    "Reranker requested but sentence_transformers is not "
                    "installed; falling back to inverse-distance scoring"
                )

    def _load_reranker(self, model_name: str) -> None:
        """Load the cross-encoder; on any failure leave ``self._reranker`` None."""
        try:
            # Lazy %-args so messages are only formatted when INFO is enabled.
            logger.info("Loading reranker: %s...", model_name)
            self._reranker = CrossEncoder(model_name, max_length=RERANKER_MAX_LENGTH)
            logger.info("Reranker loaded successfully!")
        except Exception as e:
            logger.error("Reranker failed to load: %s", e)
            self._reranker = None

    @property
    def has_reranker(self) -> bool:
        """True when a cross-encoder reranker is loaded and usable."""
        return self._reranker is not None

    def query(
        self,
        text: str,
        *,
        k: int = DEFAULT_TOP_K,
        where: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        """Plain vector search without re-ranking.

        Args:
            text: query string; blank/whitespace input returns [].
            k: number of hits to return (must be positive).
            where: optional metadata filter forwarded to the vector store.

        Returns:
            Hits as dicts with ``id``, ``content``, ``metadata``, ``distance``.

        Raises:
            ValueError: if ``k`` <= 0.
        """
        if not text.strip():
            return []

        if k <= 0:
            raise ValueError("k must be positive")

        vectorstore = self._vector_db.vectorstore
        results = vectorstore.similarity_search_with_score(text, k=k, filter=where)

        out: List[Dict[str, Any]] = []
        for doc, score in results:
            out.append({
                "id": (doc.metadata or {}).get("id"),
                "content": doc.page_content,
                "metadata": doc.metadata,
                "distance": score,
            })
        return out

    def search_with_rerank(
        self,
        text: str,
        *,
        k: int = DEFAULT_TOP_K,
        where: Optional[Dict[str, Any]] = None,
        initial_k: int = DEFAULT_INITIAL_K,
    ) -> List[Dict[str, Any]]:
        """Vector search followed by cross-encoder re-ranking.

        Args:
            text: query string; blank/whitespace input returns [].
            k: number of final results (must be positive).
            where: optional metadata filter for the vector stage.
            initial_k: candidate pool size for stage 1; clamped up to ``k``.

        Returns:
            Top-``k`` candidates with ``rerank_score`` and ``final_rank`` set.

        Raises:
            ValueError: if ``k`` <= 0.
        """
        if not text.strip():
            return []

        if k <= 0:
            raise ValueError("k must be positive")

        if initial_k < k:
            logger.warning("initial_k (%s) < k (%s), setting initial_k = k", initial_k, k)
            initial_k = k

        # Stage 1: Vector Search
        vectorstore = self._vector_db.vectorstore
        vector_results = vectorstore.similarity_search_with_score(
            text, k=initial_k, filter=where
        )

        if not vector_results:
            return []

        # Build candidates list; fall back to a content prefix when the
        # stored metadata carries no explicit id.
        candidates = []
        for rank, (doc, score) in enumerate(vector_results):
            doc_id = (doc.metadata or {}).get("id", doc.page_content[:50])
            candidates.append({
                "id": doc_id,
                "content": doc.page_content,
                "metadata": doc.metadata,
                "vector_distance": score,
                "vector_rank": rank + 1,
            })

        # Stage 2: Re-ranking
        candidates = self._rerank_candidates(text, candidates)

        # Add final rank
        for i, c in enumerate(candidates[:k]):
            c["final_rank"] = i + 1

        return candidates[:k]

    def _rerank_candidates(
        self,
        query: str,
        candidates: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Score candidates with the cross-encoder (or fallback) and sort.

        Mutates the candidate dicts in place (adds ``rerank_score``) and
        returns the list sorted best-first when the reranker ran.
        """
        if self._reranker and len(candidates) > 1:
            try:
                pairs = [[query, c["content"]] for c in candidates]
                scores = self._reranker.predict(pairs)

                for i, score in enumerate(scores):
                    candidates[i]["rerank_score"] = float(score)

                candidates.sort(key=lambda x: x.get("rerank_score", 0), reverse=True)

            except Exception as e:
                # Best-effort: keep vector ordering, score by inverse distance.
                logger.error("Rerank error: %s", e)
                self._fallback_scoring(candidates)
        else:
            # No reranker (or a single candidate): inverse vector distance.
            self._fallback_scoring(candidates)

        return candidates

    def _fallback_scoring(self, candidates: List[Dict[str, Any]]) -> None:
        """Apply fallback scoring using inverse vector distance (in place)."""
        for c in candidates:
            c["rerank_score"] = 1.0 / (1.0 + c["vector_distance"])
core/embeddings/vector_store.py CHANGED
@@ -1,307 +1,239 @@
1
  from __future__ import annotations
 
2
  import json
 
3
  from dataclasses import dataclass
4
  from pathlib import Path
5
  from typing import Any, Dict, List, Optional, Sequence
6
- from core.hash_file.hash_file import HashProcessor
7
  from langchain_core.documents import Document
8
  from langchain_chroma import Chroma
 
 
9
  from utils.helpers import read_yaml
10
 
11
- # Reranker - sentence_transformers với model BGE-M3
12
- try:
13
- from sentence_transformers import CrossEncoder
14
- HAS_RERANKER = True
15
- except ImportError:
16
- HAS_RERANKER = False
17
 
18
 
19
  @dataclass
20
  class ChromaConfig:
21
- persist_dir: str
22
- collection_name: str
23
- space: str
24
-
25
- @staticmethod
26
- def default_yaml_path() -> Path:
27
- return Path(__file__).resolve().parents[2] / "config" / "vector_db.yaml"
28
-
29
- @classmethod
30
- def from_yaml(cls, path: str | Path | None = None) -> "ChromaConfig":
31
- cfg_path = Path(path) if path is not None else cls.default_yaml_path()
32
- try:
33
- if not cfg_path.exists():
34
- raise FileNotFoundError(f"Vector DB config not found: {cfg_path}")
35
- data = read_yaml(cfg_path) or {}
36
- if not isinstance(data, dict):
37
- raise ValueError(f"Invalid config format: {cfg_path}")
38
-
39
- required = {"persist_dir", "collection_name", "space"}
40
- missing = sorted([k for k in required if k not in data])
41
- if missing:
42
- raise KeyError(f"Missing keys in {cfg_path}: {', '.join(missing)}")
43
-
44
- cfg = cls(
45
- persist_dir=str(data["persist_dir"]),
46
- collection_name=str(data["collection_name"]),
47
- space=str(data["space"]),
48
- )
49
- p = Path(cfg.persist_dir)
50
- if not p.is_absolute():
51
- cfg.persist_dir = str((cfg_path.parent.parent / p).resolve())
52
- return cfg
53
- except Exception:
54
- raise
 
55
 
56
 
57
  class ChromaVectorDB:
58
- def __init__(
59
- self,
60
- embedder: Any,
61
- config: ChromaConfig | None = None,
62
- reranker_model: str = "BAAI/bge-reranker-v2-m3",
63
- ):
64
- self.embedder = embedder
65
- self.config = config or ChromaConfig.from_yaml()
66
- self._hasher = HashProcessor(verbose=False)
67
-
68
- self._vs = Chroma(
69
- collection_name=self.config.collection_name,
70
- embedding_function=self.embedder,
71
- persist_directory=self.config.persist_dir,
72
- )
73
-
74
- # Reranker (Cross-Encoder)
75
- self._reranker: Optional[Any] = None
76
- self._reranker_model = reranker_model
77
-
78
- if HAS_RERANKER:
79
- try:
80
- print(f"Loading reranker: {reranker_model}...")
81
- self._reranker = CrossEncoder(reranker_model, max_length=512)
82
- print(f"Reranker loaded successfully!")
83
- except Exception as e:
84
- print(f" Reranker failed to load: {e}")
85
- self._reranker = None
86
-
87
- @property
88
- def collection(self):
89
- return getattr(self._vs, "_collection", None)
90
-
91
- @property
92
- def vectorstore(self):
93
- return self._vs
94
-
95
- def _flatten_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
96
- out: Dict[str, Any] = {}
97
- for k, v in (metadata or {}).items():
98
- key = str(k)
99
- if v is None:
100
- continue
101
- if isinstance(v, (str, int, float, bool)):
102
- out[key] = v
103
- continue
104
- if isinstance(v, (list, tuple, set, dict)):
105
- out[key] = json.dumps(v, ensure_ascii=False)
106
- continue
107
- out[key] = str(v)
108
- return out
109
-
110
- def _to_documents(self, docs: Sequence[Dict[str, Any]], ids: Sequence[str]) -> List[Document]:
111
- out: List[Document] = []
112
- for d, doc_id in zip(docs, ids):
113
- md = self._flatten_metadata(d.get("metadata", {}) or {})
114
- md.setdefault("id", doc_id)
115
- out.append(Document(page_content=d.get("content", ""), metadata=md))
116
- return out
117
-
118
- def _doc_id(self, doc: Dict[str, Any]) -> str:
119
- md = doc.get("metadata") or {}
120
- key = {
121
- "source_path": md.get("source_path"),
122
- "source_file": md.get("source_file"),
123
- "source_basename": md.get("source_basename"),
124
- "section": md.get("section"),
125
- "section_path": md.get("section_path"),
126
- "type": md.get("type"),
127
- "course_code": md.get("course_code"),
128
- "stt": md.get("stt"),
129
- "chunk_index": md.get("chunk_index"),
130
- "chunk_in_section": md.get("chunk_in_section"),
131
- "content": doc.get("content"),
132
- }
133
- return self._hasher.get_string_hash(str(key))
134
-
135
- def _ensure_unique_ids(self, ids: Sequence[str]) -> List[str]:
136
- seen: Dict[str, int] = {}
137
- out: List[str] = []
138
- for i in ids:
139
- base = str(i)
140
- n = seen.get(base, 0)
141
- seen[base] = n + 1
142
- out.append(base if n == 0 else f"{base}__dup{n}")
143
- return out
144
-
145
- def add_documents(
146
- self,
147
- docs: Sequence[Dict[str, Any]],
148
- *,
149
- ids: Optional[Sequence[str]] = None,
150
- batch_size: int = 128,
151
- ) -> int:
152
- if not docs:
153
- return 0
154
-
155
- if ids is not None and len(ids) != len(docs):
156
- raise ValueError("ids length must match docs length")
157
-
158
- all_ids = list(ids) if ids is not None else [self._doc_id(d) for d in docs]
159
- all_ids = self._ensure_unique_ids(all_ids)
160
- bs = max(1, batch_size)
161
- total = 0
162
- for start in range(0, len(docs), bs):
163
- batch = docs[start : start + bs]
164
- batch_ids = all_ids[start : start + bs]
165
- lc_docs = self._to_documents(batch, batch_ids)
166
-
167
- try:
168
- self._vs.add_documents(lc_docs, ids=batch_ids)
169
- except TypeError:
170
- texts = [d.page_content for d in lc_docs]
171
- metas = [d.metadata for d in lc_docs]
172
- self._vs.add_texts(texts=texts, metadatas=metas, ids=batch_ids)
173
- total += len(batch)
174
-
175
- return total
176
-
177
- def upsert_documents(
178
- self,
179
- docs: Sequence[Dict[str, Any]],
180
- *,
181
- ids: Optional[Sequence[str]] = None,
182
- batch_size: int = 128,
183
- ) -> int:
184
- if not docs:
185
- return 0
186
-
187
- if ids is not None and len(ids) != len(docs):
188
- raise ValueError("ids length must match docs length")
189
-
190
- all_ids = list(ids) if ids is not None else [self._doc_id(d) for d in docs]
191
- all_ids = self._ensure_unique_ids(all_ids)
192
- bs = max(1, batch_size)
193
- col = getattr(self._vs, "_collection", None)
194
- if col is None:
195
- return self.add_documents(docs, ids=all_ids, batch_size=bs)
196
-
197
- total = 0
198
- for start in range(0, len(docs), bs):
199
- batch = docs[start : start + bs]
200
- batch_ids = all_ids[start : start + bs]
201
- lc_docs = self._to_documents(batch, batch_ids)
202
- texts = [d.page_content for d in lc_docs]
203
- metas = [d.metadata for d in lc_docs]
204
- embs = self.embedder.embed_documents(texts)
205
- col.upsert(ids=batch_ids, documents=texts, metadatas=metas, embeddings=embs)
206
- total += len(batch)
207
-
208
- return total
209
-
210
- def query(
211
- self,
212
- text: str,
213
- *,
214
- k: int = 5,
215
- where: Optional[Dict[str, Any]] = None,
216
- ) -> List[Dict[str, Any]]:
217
- if not text.strip():
218
- return []
219
-
220
- results = self._vs.similarity_search_with_score(text, k=k, filter=where)
221
- out: List[Dict[str, Any]] = []
222
- for doc, score in results:
223
- out.append({
224
- "id": (doc.metadata or {}).get("id"),
225
- "content": doc.page_content,
226
- "metadata": doc.metadata,
227
- "distance": score,
228
- })
229
- return out
230
-
231
- def count(self) -> int:
232
- col = getattr(self._vs, "_collection", None)
233
- if col is None:
234
- return 0
235
- return int(col.count())
236
-
237
- def get_all_documents(self, limit: int = 5000) -> List[Dict[str, Any]]:
238
- col = self.collection
239
- if col is None:
240
- return []
241
-
242
- result = col.get(limit=limit, include=['documents', 'metadatas'])
243
- docs = []
244
- for i, doc_content in enumerate(result.get('documents', [])):
245
- if doc_content:
246
- meta = result['metadatas'][i] if result.get('metadatas') else {}
247
- docs.append({
248
- 'id': result['ids'][i] if result.get('ids') else str(i),
249
- 'content': doc_content,
250
- 'metadata': meta or {},
251
- })
252
- return docs
253
-
254
- def search_with_rerank(
255
- self,
256
- text: str,
257
- *,
258
- k: int = 5,
259
- where: Optional[Dict[str, Any]] = None,
260
- initial_k: int = 100,
261
- ) -> List[Dict[str, Any]]:
262
-
263
- if not text.strip():
264
- return []
265
-
266
- # Stage 1: Vector Search
267
- vector_results = self._vs.similarity_search_with_score(text, k=initial_k, filter=where)
268
-
269
- if not vector_results:
270
- return []
271
-
272
- candidates = []
273
- for rank, (doc, score) in enumerate(vector_results):
274
- doc_id = (doc.metadata or {}).get("id", doc.page_content[:50])
275
- candidates.append({
276
- "id": doc_id,
277
- "content": doc.page_content,
278
- "metadata": doc.metadata,
279
- "vector_distance": score,
280
- "vector_rank": rank + 1,
281
- })
282
-
283
- # Stage 2: Re-ranking
284
- if self._reranker and len(candidates) > 1:
285
- try:
286
- pairs = [[text, c["content"]] for c in candidates]
287
- scores = self._reranker.predict(pairs)
288
-
289
- for i, score in enumerate(scores):
290
- candidates[i]["rerank_score"] = float(score)
291
-
292
- candidates.sort(key=lambda x: x.get("rerank_score", 0), reverse=True)
293
-
294
- except Exception as e:
295
- print(f" Rerank error: {e}")
296
- for c in candidates:
297
- c["rerank_score"] = 0.0
298
- else:
299
- # No reranker: use inverse vector distance
300
- for c in candidates:
301
- c["rerank_score"] = 1.0 / (1.0 + c["vector_distance"])
302
-
303
- # Add final rank
304
- for i, c in enumerate(candidates[:k]):
305
- c["final_rank"] = i + 1
306
-
307
- return candidates[:k]
 
1
  from __future__ import annotations
2
+
3
  import json
4
+ import logging
5
  from dataclasses import dataclass
6
  from pathlib import Path
7
  from typing import Any, Dict, List, Optional, Sequence
8
+
9
  from langchain_core.documents import Document
10
  from langchain_chroma import Chroma
11
+
12
+ from core.hash_file.hash_file import HashProcessor
13
  from utils.helpers import read_yaml
14
 
15
+ logger = logging.getLogger(__name__)
 
 
 
 
 
16
 
17
 
18
  @dataclass
19
  class ChromaConfig:
20
+
21
+ persist_dir: str
22
+ collection_name: str
23
+ space: str
24
+
25
+ @staticmethod
26
+ def default_yaml_path() -> Path:
27
+ return Path(__file__).resolve().parents[2] / "config" / "vector_db.yaml"
28
+
29
+ @classmethod
30
+ def from_yaml(cls, path: str | Path | None = None) -> "ChromaConfig":
31
+ cfg_path = Path(path) if path is not None else cls.default_yaml_path()
32
+ try:
33
+ if not cfg_path.exists():
34
+ raise FileNotFoundError(f"Vector DB config not found: {cfg_path}")
35
+ data = read_yaml(cfg_path) or {}
36
+ if not isinstance(data, dict):
37
+ raise ValueError(f"Invalid config format: {cfg_path}")
38
+
39
+ required = {"persist_dir", "collection_name", "space"}
40
+ missing = sorted([k for k in required if k not in data])
41
+ if missing:
42
+ raise KeyError(f"Missing keys in {cfg_path}: {', '.join(missing)}")
43
+
44
+ cfg = cls(
45
+ persist_dir=str(data["persist_dir"]),
46
+ collection_name=str(data["collection_name"]),
47
+ space=str(data["space"]),
48
+ )
49
+ p = Path(cfg.persist_dir)
50
+ if not p.is_absolute():
51
+ cfg.persist_dir = str((cfg_path.parent.parent / p).resolve())
52
+ return cfg
53
+ except Exception:
54
+ raise
55
 
56
 
57
class ChromaVectorDB:
    """Persistent Chroma vector store wrapper with content-hash document ids."""

    def __init__(
        self,
        embedder: Any,
        config: Optional["ChromaConfig"] = None,
    ):
        """Open (or create) the collection described by *config*.

        Args:
            embedder: LangChain-compatible embedding function.
            config: store configuration; loaded from the default YAML when omitted.
        """
        self.embedder = embedder
        self.config = config or ChromaConfig.from_yaml()
        self._hasher = HashProcessor(verbose=False)

        self._vs = Chroma(
            collection_name=self.config.collection_name,
            embedding_function=self.embedder,
            persist_directory=self.config.persist_dir,
        )
        # Lazy %-args: the message is only formatted when INFO is enabled.
        logger.info("ChromaVectorDB initialized: %s", self.config.collection_name)

    @property
    def collection(self):
        """Raw chromadb collection, or None if the backend does not expose one."""
        return getattr(self._vs, "_collection", None)

    @property
    def vectorstore(self):
        """The wrapped LangChain Chroma instance."""
        return self._vs

    def _flatten_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Coerce metadata values to Chroma-compatible scalars.

        None values are dropped; list/tuple/dict values are JSON-encoded;
        sets are converted to lists first (json.dumps cannot serialize a
        set — the previous code raised TypeError here); anything else is
        stringified.
        """
        out: Dict[str, Any] = {}
        for k, v in (metadata or {}).items():
            key = str(k)
            if v is None:
                continue
            if isinstance(v, (str, int, float, bool)):
                out[key] = v
            elif isinstance(v, (set, frozenset)):
                # BUGFIX: json.dumps raises TypeError on sets.
                out[key] = json.dumps(list(v), ensure_ascii=False)
            elif isinstance(v, (list, tuple, dict)):
                out[key] = json.dumps(v, ensure_ascii=False)
            else:
                out[key] = str(v)
        return out

    def _to_documents(self, docs: Sequence[Dict[str, Any]], ids: Sequence[str]) -> "List[Document]":
        """Convert raw {content, metadata} dicts into LangChain Documents with ids."""
        out = []
        for d, doc_id in zip(docs, ids):
            md = self._flatten_metadata(d.get("metadata", {}) or {})
            md.setdefault("id", doc_id)
            out.append(Document(page_content=d.get("content", ""), metadata=md))
        return out

    def _doc_id(self, doc: Dict[str, Any]) -> str:
        """Derive a stable id by hashing the identifying metadata + content."""
        md = doc.get("metadata") or {}
        key = {
            "source_path": md.get("source_path"),
            "source_file": md.get("source_file"),
            "source_basename": md.get("source_basename"),
            "section": md.get("section"),
            "section_path": md.get("section_path"),
            "type": md.get("type"),
            "course_code": md.get("course_code"),
            "stt": md.get("stt"),
            "chunk_index": md.get("chunk_index"),
            "chunk_in_section": md.get("chunk_in_section"),
            "content": doc.get("content"),
        }
        return self._hasher.get_string_hash(str(key))

    def _ensure_unique_ids(self, ids: Sequence[str]) -> List[str]:
        """Deduplicate ids by suffixing repeats with ``__dupN``."""
        seen: Dict[str, int] = {}
        out: List[str] = []
        for i in ids:
            base = str(i)
            n = seen.get(base, 0)
            seen[base] = n + 1
            out.append(base if n == 0 else f"{base}__dup{n}")
        return out

    def _prepare_ids(
        self,
        docs: Sequence[Dict[str, Any]],
        ids: Optional[Sequence[str]],
    ) -> List[str]:
        """Validate caller-supplied ids or derive hash ids, then uniquify.

        Shared by add_documents/upsert_documents (previously duplicated).

        Raises:
            ValueError: if *ids* is given with a different length than *docs*.
        """
        if ids is not None and len(ids) != len(docs):
            raise ValueError("ids length must match docs length")
        all_ids = list(ids) if ids is not None else [self._doc_id(d) for d in docs]
        return self._ensure_unique_ids(all_ids)

    def add_documents(
        self,
        docs: Sequence[Dict[str, Any]],
        *,
        ids: Optional[Sequence[str]] = None,
        batch_size: int = 128,
    ) -> int:
        """Add documents in batches; returns the number of documents added.

        Raises:
            ValueError: if *ids* length does not match *docs* length.
        """
        if not docs:
            return 0

        all_ids = self._prepare_ids(docs, ids)
        bs = max(1, batch_size)
        total = 0

        for start in range(0, len(docs), bs):
            batch = docs[start : start + bs]
            batch_ids = all_ids[start : start + bs]
            lc_docs = self._to_documents(batch, batch_ids)

            try:
                self._vs.add_documents(lc_docs, ids=batch_ids)
            except TypeError:
                # Older langchain APIs reject Documents+ids; use add_texts.
                texts = [d.page_content for d in lc_docs]
                metas = [d.metadata for d in lc_docs]
                self._vs.add_texts(texts=texts, metadatas=metas, ids=batch_ids)
            total += len(batch)

        logger.info("Added %d documents to vector store", total)
        return total

    def upsert_documents(
        self,
        docs: Sequence[Dict[str, Any]],
        *,
        ids: Optional[Sequence[str]] = None,
        batch_size: int = 128,
    ) -> int:
        """Insert-or-update documents by id; returns the number processed.

        Falls back to plain add_documents when the raw collection is not
        exposed (no upsert API available in that case).

        Raises:
            ValueError: if *ids* length does not match *docs* length.
        """
        if not docs:
            return 0

        all_ids = self._prepare_ids(docs, ids)
        bs = max(1, batch_size)
        col = getattr(self._vs, "_collection", None)

        if col is None:
            return self.add_documents(docs, ids=all_ids, batch_size=bs)

        total = 0
        for start in range(0, len(docs), bs):
            batch = docs[start : start + bs]
            batch_ids = all_ids[start : start + bs]
            lc_docs = self._to_documents(batch, batch_ids)
            texts = [d.page_content for d in lc_docs]
            metas = [d.metadata for d in lc_docs]
            # Embed explicitly because collection.upsert bypasses the
            # vectorstore's embedding_function.
            embs = self.embedder.embed_documents(texts)
            col.upsert(ids=batch_ids, documents=texts, metadatas=metas, embeddings=embs)
            total += len(batch)

        logger.info("Upserted %d documents to vector store", total)
        return total

    def count(self) -> int:
        """Number of stored documents (0 when the collection is unavailable)."""
        col = getattr(self._vs, "_collection", None)
        if col is None:
            return 0
        return int(col.count())

    def get_all_documents(self, limit: int = 5000) -> List[Dict[str, Any]]:
        """Fetch up to *limit* documents as {id, content, metadata} dicts."""
        col = self.collection
        if col is None:
            return []

        result = col.get(limit=limit, include=['documents', 'metadatas'])
        # Guard with `or []`: col.get may map an include key to None.
        documents = result.get('documents') or []
        metadatas = result.get('metadatas') or []
        all_ids = result.get('ids') or []

        docs = []
        for i, doc_content in enumerate(documents):
            if doc_content:
                docs.append({
                    'id': all_ids[i] if i < len(all_ids) else str(i),
                    'content': doc_content,
                    'metadata': (metadatas[i] if i < len(metadatas) else {}) or {},
                })
        return docs

    def delete_documents(self, ids: Sequence[str]) -> int:
        """Delete documents by id; returns how many ids were submitted.

        Returns 0 (no-op) for an empty id list or a missing collection.
        """
        if not ids:
            return 0

        col = self.collection
        if col is None:
            return 0

        col.delete(ids=list(ids))
        logger.info("Deleted %d documents from vector store", len(ids))
        return len(ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/gradio/gradio_rag_qwen.py CHANGED
@@ -21,6 +21,7 @@ def _load_env() -> None:
21
 
22
  from core.embeddings.embedding_model import VietnameseBiEncoderConfig, VietnameseBiEncoderEmbeddings
23
  from core.embeddings.vector_store import ChromaConfig, ChromaVectorDB
 
24
 
25
  _load_env()
26
 
@@ -32,6 +33,7 @@ GROQ_MODEL = os.getenv("GROQ_MODEL", "qwen/qwen3-32b")
32
  class AppState:
33
  def __init__(self) -> None:
34
  self.db: Optional[ChromaVectorDB] = None
 
35
  self.groq: Optional[Groq] = None
36
 
37
 
@@ -56,6 +58,7 @@ def _init_resources() -> None:
56
  embedder=emb,
57
  config=db_cfg,
58
  )
 
59
 
60
  api_key = (os.getenv("GROQ_API_KEY") or "").strip()
61
  if not api_key:
@@ -72,7 +75,8 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
72
  assert STATE.groq is not None
73
 
74
  # Vector Search + Re-ranking
75
- results = STATE.db.search_with_rerank(message, k=TOP_K, initial_k=50)
 
76
 
77
  if not results:
78
  yield "Xin lỗi, tôi không tìm thấy thông tin phù hợp trong dữ liệu."
 
21
 
22
  from core.embeddings.embedding_model import VietnameseBiEncoderConfig, VietnameseBiEncoderEmbeddings
23
  from core.embeddings.vector_store import ChromaConfig, ChromaVectorDB
24
+ from core.embeddings.retrival import Retriever
25
 
26
  _load_env()
27
 
 
33
  class AppState:
34
  def __init__(self) -> None:
35
  self.db: Optional[ChromaVectorDB] = None
36
+ self.retriever: Optional[Retriever] = None
37
  self.groq: Optional[Groq] = None
38
 
39
 
 
58
  embedder=emb,
59
  config=db_cfg,
60
  )
61
+ STATE.retriever = Retriever(vector_db=STATE.db)
62
 
63
  api_key = (os.getenv("GROQ_API_KEY") or "").strip()
64
  if not api_key:
 
75
  assert STATE.groq is not None
76
 
77
  # Vector Search + Re-ranking
78
+ assert STATE.retriever is not None
79
+ results = STATE.retriever.search_with_rerank(message, k=TOP_K, initial_k=50)
80
 
81
  if not results:
82
  yield "Xin lỗi, tôi không tìm thấy thông tin phù hợp trong dữ liệu."
evaluation/simple_eval.py CHANGED
@@ -22,6 +22,7 @@ load_dotenv(find_dotenv(usecwd=True))
22
  from langchain_groq import ChatGroq
23
  from core.embeddings.embedding_model import VietnameseBiEncoderConfig, VietnameseBiEncoderEmbeddings
24
  from core.embeddings.vector_store import ChromaConfig, ChromaVectorDB
 
25
 
26
  TOP_K = int(os.getenv("TOP_K", "5"))
27
  INITIAL_K = int(os.getenv("INITIAL_K", "50"))
@@ -51,6 +52,7 @@ def extract_keywords(text: str) -> set:
51
  class SimpleRAGEvaluator:
52
  def __init__(self):
53
  self.db: Optional[ChromaVectorDB] = None
 
54
  self.embedder: Optional[VietnameseBiEncoderEmbeddings] = None
55
  self.llm = None
56
  self.llm_fast = None
@@ -71,6 +73,7 @@ class SimpleRAGEvaluator:
71
  print(f"Vector DB: {db_cfg.collection_name}")
72
 
73
  self.db = ChromaVectorDB(embedder=self.embedder, config=db_cfg)
 
74
 
75
  api_key = os.getenv("GROQ_API_KEY")
76
  if not api_key:
@@ -115,20 +118,48 @@ TRẢ LỜI:"""
115
 
116
  def retrieve_contexts(self, question: str) -> List[str]:
117
  try:
118
- results = self.db.search_with_rerank(question, k=TOP_K, initial_k=INITIAL_K)
119
  return [r.get("content", "")[:1000] for r in results if r.get("content")]
120
  except Exception as e:
121
  print(f"Retrieval error: {e}")
122
  return []
123
 
124
- def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  try:
126
- emb1 = np.array(self.embedder.embed_query(text1))
127
- emb2 = np.array(self.embedder.embed_query(text2))
128
- return cosine_similarity(emb1, emb2)
 
 
129
  except Exception as e:
130
- print(f"Embedding error: {e}")
131
- return 0.0
132
 
133
  def calculate_keyword_overlap(self, answer: str, ground_truth: str) -> float:
134
  gt_keywords = extract_keywords(ground_truth)
 
22
  from langchain_groq import ChatGroq
23
  from core.embeddings.embedding_model import VietnameseBiEncoderConfig, VietnameseBiEncoderEmbeddings
24
  from core.embeddings.vector_store import ChromaConfig, ChromaVectorDB
25
+ from core.embeddings.retrival import Retriever
26
 
27
  TOP_K = int(os.getenv("TOP_K", "5"))
28
  INITIAL_K = int(os.getenv("INITIAL_K", "50"))
 
52
  class SimpleRAGEvaluator:
53
  def __init__(self):
54
  self.db: Optional[ChromaVectorDB] = None
55
+ self.retriever: Optional[Retriever] = None
56
  self.embedder: Optional[VietnameseBiEncoderEmbeddings] = None
57
  self.llm = None
58
  self.llm_fast = None
 
73
  print(f"Vector DB: {db_cfg.collection_name}")
74
 
75
  self.db = ChromaVectorDB(embedder=self.embedder, config=db_cfg)
76
+ self.retriever = Retriever(vector_db=self.db)
77
 
78
  api_key = os.getenv("GROQ_API_KEY")
79
  if not api_key:
 
118
 
119
  def retrieve_contexts(self, question: str) -> List[str]:
120
  try:
121
+ results = self.retriever.search_with_rerank(question, k=TOP_K, initial_k=INITIAL_K)
122
  return [r.get("content", "")[:1000] for r in results if r.get("content")]
123
  except Exception as e:
124
  print(f"Retrieval error: {e}")
125
  return []
126
 
127
+ def calculate_semantic_similarity(self, answer: str, ground_truth: str) -> float:
128
+ """
129
+ Đánh giá semantic similarity giữa answer và ground_truth bằng LLM.
130
+ Thay thế cosine similarity bằng LLM-based scoring.
131
+ """
132
+ if not answer.strip() or not ground_truth.strip():
133
+ return 0.0
134
+
135
+ prompt = f"""Bạn là giám khảo chấm thi.
136
+ Nhiệm vụ: So sánh độ tương đồng ngữ nghĩa giữa CÂU TRẢ LỜI và ĐÁP ÁN CHUẨN.
137
+
138
+ ĐÁP ÁN CHUẨN:
139
+ {ground_truth[:800]}
140
+
141
+ CÂU TRẢ LỜI:
142
+ {answer[:800]}
143
+
144
+ Yêu cầu đánh giá độ tương đồng ngữ nghĩa:
145
+ - 1.0: Câu trả lời chứa đầy đủ và chính xác thông tin như đáp án chuẩn
146
+ - 0.8: Câu trả lời đúng ý chính, có thể thiếu một số chi tiết nhỏ
147
+ - 0.6: Câu trả lời đúng một phần, thiếu một số thông tin quan trọng
148
+ - 0.4: Câu trả lời có liên quan nhưng thiếu nhiều thông tin hoặc không chính xác
149
+ - 0.2: Câu trả lời chỉ đúng một phần rất nhỏ
150
+ - 0.0: Câu trả lời hoàn toàn sai hoặc không liên quan
151
+
152
+ CHỈ TRẢ VỀ MỘT CON SỐ (0.0, 0.2, 0.4, 0.6, 0.8 hoặc 1.0), KHÔNG GIẢI THÍCH:"""
153
+
154
  try:
155
+ response = self.llm_fast.invoke(prompt).content.strip()
156
+ match = re.search(r"(1\.0|0\.\d|0|1)", response)
157
+ if match:
158
+ return float(match.group())
159
+ return 0.5
160
  except Exception as e:
161
+ print(f"Semantic similarity LLM error: {e}")
162
+ return 0.5
163
 
164
  def calculate_keyword_overlap(self, answer: str, ground_truth: str) -> float:
165
  gt_keywords = extract_keywords(ground_truth)