Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

EphAsad committed on Dec 17, 2025

Commit

d776bf4

verified ·

1 Parent(s): 186a43a

Update rag/rag_retriever.py

Browse files

Files changed (1) hide show

rag/rag_retriever.py +110 -16

rag/rag_retriever.py CHANGED Viewed

@@ -1,8 +1,12 @@
 # rag/rag_retriever.py
 # ============================================================
-# RAG retriever:
-# - Loads kb_index.json
-# - Retrieves best-matching chunks for a given phenotype + genus
 # ============================================================
 from __future__ import annotations
@@ -13,6 +17,26 @@ import numpy as np
 from rag.rag_embedder import embed_text, load_kb_index
 def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
     """
     Cosine similarity for normalized embeddings.
@@ -20,12 +44,26 @@ def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
     return float(np.dot(a, b))
 def retrieve_rag_context(
     phenotype_text: str,
     target_genus: str,
     top_k: int = 5,
     kb_path: str = "data/rag/index/kb_index.json",
 ) -> Dict[str, Any]:
     kb = load_kb_index(kb_path)
     records = kb.get("records", [])
@@ -37,64 +75,120 @@ def retrieve_rag_context(
             "combined_context": "",
         }
-    q_emb = embed_text(phenotype_text, normalize=True)
     target_genus_lc = (target_genus or "").strip().lower()
     scored_records: List[Dict[str, Any]] = []
     for rec in records:
-        g = (rec.get("genus") or "").strip().lower()
-        if target_genus_lc and g != target_genus_lc:
             continue
         emb = rec.get("embedding")
         if emb is None:
             continue
-        score = _cosine_similarity(q_emb, emb)
         scored_records.append({
             "id": rec.get("id"),
             "genus": rec.get("genus"),
             "species": rec.get("species"),
-            "source_type": rec.get("level"),
             "path": rec.get("source_file"),
             "text": rec.get("text"),
             "score": score,
         })
-    # Fallback: use all records
     if not scored_records:
         for rec in records:
             emb = rec.get("embedding")
             if emb is None:
                 continue
-            score = _cosine_similarity(q_emb, emb)
             scored_records.append({
                 "id": rec.get("id"),
                 "genus": rec.get("genus"),
                 "species": rec.get("species"),
-                "source_type": rec.get("level"),
                 "path": rec.get("source_file"),
                 "text": rec.get("text"),
                 "score": score,
             })
     scored_records.sort(key=lambda r: r["score"], reverse=True)
-    top = scored_records[:top_k]
     combined_ctx_parts: List[str] = []
-    for rec in top:
-        label = rec["genus"]
         if rec.get("species"):
             label = f"{label} {rec['species']}"
         combined_ctx_parts.append(
-            f"[{label} — {rec['source_type']}] {rec['text']}"
         )
     combined_context = "\n\n".join(combined_ctx_parts)
     return {
         "genus": target_genus,
-        "chunks": top,
         "combined_context": combined_context,
     }

 # rag/rag_retriever.py
 # ============================================================
+# RAG retriever (Stage 2 – microbiology-aware)
+#
+# Improvements:
+# - Source-type weighting (species > table > genus > note)
+# - Genus-aware query expansion
+# - Diversity enforcement (avoid duplicate sources)
+# - Explicit ranking & score annotations for generator
 # ============================================================
 from __future__ import annotations
 from rag.rag_embedder import embed_text, load_kb_index
+# ------------------------------------------------------------
+# Configuration
+# ------------------------------------------------------------
+# Multiplicative score weights keyed by knowledge-chunk type (the record's "level");
+# types not listed here fall back to a weight of 1.0
+SOURCE_TYPE_WEIGHTS = {
+    "species": 1.15,
+    "genus": 1.00,
+    "table": 1.10,
+    "note": 0.85,
+}
+# Maximum number of chunks kept per source file (diversity control);
+# records without a source file are all counted under one shared bucket
+MAX_CHUNKS_PER_SOURCE = 1
+# ------------------------------------------------------------
+# Similarity helper
+# ------------------------------------------------------------
 def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
     """
     Cosine similarity for normalized embeddings.
     return float(np.dot(a, b))
+# ------------------------------------------------------------
+# Public API
+# ------------------------------------------------------------
 def retrieve_rag_context(
     phenotype_text: str,
     target_genus: str,
     top_k: int = 5,
     kb_path: str = "data/rag/index/kb_index.json",
 ) -> Dict[str, Any]:
+    """
+    Retrieve the most relevant RAG chunks for a phenotype + genus.
+    Returns:
+      {
+        "genus": target_genus,
+        "chunks": [...],              # ranked chunk metadata
+        "combined_context": "..."     # formatted context for generator
+      }
+    """
     kb = load_kb_index(kb_path)
     records = kb.get("records", [])
             "combined_context": "",
         }
+    # --------------------------------------------------------
+    # Build genus-aware query
+    # --------------------------------------------------------
+    query_text = phenotype_text.strip()
+    if target_genus:
+        query_text = f"{query_text}\nTarget genus: {target_genus}"
+    q_emb = embed_text(query_text, normalize=True)
     target_genus_lc = (target_genus or "").strip().lower()
     scored_records: List[Dict[str, Any]] = []
+    # --------------------------------------------------------
+    # Primary pass: genus-filtered retrieval
+    # --------------------------------------------------------
     for rec in records:
+        rec_genus = (rec.get("genus") or "").strip().lower()
+        if target_genus_lc and rec_genus != target_genus_lc:
             continue
         emb = rec.get("embedding")
         if emb is None:
             continue
+        base_score = _cosine_similarity(q_emb, emb)
+        source_type = rec.get("level")
+        weight = SOURCE_TYPE_WEIGHTS.get(source_type, 1.0)
+        score = base_score * weight
         scored_records.append({
             "id": rec.get("id"),
             "genus": rec.get("genus"),
             "species": rec.get("species"),
+            "source_type": source_type,
             "path": rec.get("source_file"),
             "text": rec.get("text"),
             "score": score,
         })
+    # --------------------------------------------------------
+    # Fallback: no genus-matched records, so score every record regardless of genus
+    # --------------------------------------------------------
     if not scored_records:
         for rec in records:
             emb = rec.get("embedding")
             if emb is None:
                 continue
+            base_score = _cosine_similarity(q_emb, emb)
+            source_type = rec.get("level")
+            weight = SOURCE_TYPE_WEIGHTS.get(source_type, 1.0)
+            score = base_score * weight
             scored_records.append({
                 "id": rec.get("id"),
                 "genus": rec.get("genus"),
                 "species": rec.get("species"),
+                "source_type": source_type,
                 "path": rec.get("source_file"),
                 "text": rec.get("text"),
                 "score": score,
             })
+    # --------------------------------------------------------
+    # Sort by weighted score, highest first
+    # --------------------------------------------------------
     scored_records.sort(key=lambda r: r["score"], reverse=True)
+    # --------------------------------------------------------
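+    # Diversity enforcement: keep at most MAX_CHUNKS_PER_SOURCE chunks per source
+    # file and stop once top_k chunks are selected
+    # NOTE(review): the top_k check runs after the append, so top_k <= 0 still
+    # yields one chunk (the previous scored_records[:top_k] slice yielded none)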
+    # --------------------------------------------------------
+    selected: List[Dict[str, Any]] = []
+    source_counts: Dict[str, int] = {}
+    for rec in scored_records:
+        src = rec.get("path") or ""
+        count = source_counts.get(src, 0)
+        if count >= MAX_CHUNKS_PER_SOURCE:
+            continue
+        selected.append(rec)
+        source_counts[src] = count + 1
+        if len(selected) >= top_k:
+            break
+    # --------------------------------------------------------
+    # Build combined context with explicit ranking
+    # --------------------------------------------------------
     combined_ctx_parts: List[str] = []
+    for idx, rec in enumerate(selected, start=1):
+        label = rec.get("genus") or "Unknown genus"
         if rec.get("species"):
             label = f"{label} {rec['species']}"
         combined_ctx_parts.append(
+            f"[RANK {idx} | SCORE {rec['score']:.3f} | {label} — {rec['source_type']}]\n"
+            f"{rec['text']}"
         )
     combined_context = "\n\n".join(combined_ctx_parts)
     return {
         "genus": target_genus,
+        "chunks": selected,
         "combined_context": combined_context,
     }