j-js committed on
Commit
496d977
·
verified ·
1 Parent(s): ba0bf0a

Update retrieval_engine.py

Browse files
Files changed (1) hide show
  1. retrieval_engine.py +67 -44
retrieval_engine.py CHANGED
@@ -114,52 +114,75 @@ class RetrievalEngine:
114
  return bonus
115
 
116
  def search(
117
- self,
118
- query: str,
119
- topic: str = "",
120
- intent: str = "answer",
121
- k: int = 3,
122
- ) -> List[RetrievedChunk]:
123
- if not self.rows:
124
- return []
125
-
126
- combined_query = clean_math_text(query)
127
- scores = []
128
-
129
- if self.encoder is not None and self.embeddings is not None and np is not None:
130
- try:
131
- q = self.encoder.encode(
132
- [combined_query],
133
- convert_to_numpy=True,
134
- normalize_embeddings=True,
135
- )[0]
136
- semantic_scores = self.embeddings @ q
137
-
138
- for row, sem in zip(self.rows, semantic_scores.tolist()):
139
- lexical = score_token_overlap(combined_query, row["text"])
140
- bonus = self._topic_bonus(topic, row["topic"], intent)
141
- total = 0.7 * sem + 0.3 * lexical + bonus
142
- scores.append((total, row))
143
- except Exception:
144
- scores = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- if not scores:
147
- for row in self.rows:
148
  lexical = score_token_overlap(combined_query, row["text"])
149
  bonus = self._topic_bonus(topic, row["topic"], intent)
150
- scores.append((lexical + bonus, row))
151
-
152
- scores.sort(key=lambda x: x[0], reverse=True)
153
-
154
- results: List[RetrievedChunk] = []
155
- for score, row in scores[:k]:
156
- results.append(
157
- RetrievedChunk(
158
- text=row["text"],
159
- topic=row["topic"],
160
- source=row["source"],
161
- score=float(score),
162
- )
 
 
 
 
 
 
 
 
163
  )
 
164
 
165
- return results
 
114
  return bonus
115
 
116
  def search(
117
+ self,
118
+ query: str,
119
+ topic: str = "",
120
+ intent: str = "answer",
121
+ k: int = 3,
122
+ ) -> List[RetrievedChunk]:
123
+ if not self.rows:
124
+ return []
125
+
126
+ combined_query = clean_math_text(query)
127
+ normalized_topic = (topic or "").strip().lower()
128
+
129
+ # First narrow the pool when we have a specific topic.
130
+ candidate_rows = self.rows
131
+ if normalized_topic:
132
+ exact_topic_rows = [
133
+ row for row in self.rows
134
+ if (row.get("topic") or "").strip().lower() == normalized_topic
135
+ ]
136
+ if exact_topic_rows:
137
+ candidate_rows = exact_topic_rows
138
+
139
+ scores = []
140
+
141
+ if self.encoder is not None and self.embeddings is not None and np is not None:
142
+ try:
143
+ q = self.encoder.encode(
144
+ [combined_query],
145
+ convert_to_numpy=True,
146
+ normalize_embeddings=True,
147
+ )[0]
148
+
149
+ # If we filtered rows, we must also filter embeddings to the same indices.
150
+ if candidate_rows is self.rows:
151
+ candidate_embeddings = self.embeddings
152
+ else:
153
+ candidate_indices = [
154
+ i for i, row in enumerate(self.rows)
155
+ if (row.get("topic") or "").strip().lower() == normalized_topic
156
+ ]
157
+ candidate_embeddings = self.embeddings[candidate_indices]
158
+
159
+ semantic_scores = candidate_embeddings @ q
160
 
161
+ for row, sem in zip(candidate_rows, semantic_scores.tolist()):
 
162
  lexical = score_token_overlap(combined_query, row["text"])
163
  bonus = self._topic_bonus(topic, row["topic"], intent)
164
+ total = 0.7 * sem + 0.3 * lexical + bonus
165
+ scores.append((total, row))
166
+ except Exception:
167
+ scores = []
168
+
169
+ if not scores:
170
+ for row in candidate_rows:
171
+ lexical = score_token_overlap(combined_query, row["text"])
172
+ bonus = self._topic_bonus(topic, row["topic"], intent)
173
+ scores.append((lexical + bonus, row))
174
+
175
+ scores.sort(key=lambda x: x[0], reverse=True)
176
+
177
+ results: List[RetrievedChunk] = []
178
+ for score, row in scores[:k]:
179
+ results.append(
180
+ RetrievedChunk(
181
+ text=row["text"],
182
+ topic=row["topic"],
183
+ source=row["source"],
184
+ score=float(score),
185
  )
186
+ )
187
 
188
+ return results