Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

j-js committed 24 days ago

Commit

55b1e0c

verified ·

1 Parent(s): ba68b33

Update retrieval_engine.py

Browse files

Files changed (1) hide show

retrieval_engine.py +56 -58

retrieval_engine.py CHANGED Viewed

@@ -120,70 +120,68 @@ class RetrievalEngine:
         intent: str = "answer",
         k: int = 3,
     ) -> List[RetrievedChunk]:
         if not self.rows:
             return []
-    combined_query = clean_math_text(query)
-    normalized_topic = (topic or "").strip().lower()
-    # First narrow the pool when we have a specific topic.
-    candidate_rows = self.rows
-    if normalized_topic:
-        exact_topic_rows = [
-            row for row in self.rows
-            if (row.get("topic") or "").strip().lower() == normalized_topic
-        ]
-        if exact_topic_rows:
-            candidate_rows = exact_topic_rows
-    scores = []
-    if self.encoder is not None and self.embeddings is not None and np is not None:
-        try:
-            q = self.encoder.encode(
-                [combined_query],
-                convert_to_numpy=True,
-                normalize_embeddings=True,
-            )[0]
-            # If we filtered rows, we must also filter embeddings to the same indices.
-            if candidate_rows is self.rows:
-                candidate_embeddings = self.embeddings
-            else:
-                candidate_indices = [
-                    i for i, row in enumerate(self.rows)
-                    if (row.get("topic") or "").strip().lower() == normalized_topic
-                ]
-                candidate_embeddings = self.embeddings[candidate_indices]
-            semantic_scores = candidate_embeddings @ q
-            for row, sem in zip(candidate_rows, semantic_scores.tolist()):
                 lexical = score_token_overlap(combined_query, row["text"])
                 bonus = self._topic_bonus(topic, row["topic"], intent)
-                total = 0.7 * sem + 0.3 * lexical + bonus
-                scores.append((total, row))
-        except Exception:
-            scores = []
-    if not scores:
-        for row in candidate_rows:
-            lexical = score_token_overlap(combined_query, row["text"])
-            bonus = self._topic_bonus(topic, row["topic"], intent)
-            scores.append((lexical + bonus, row))
-    scores.sort(key=lambda x: x[0], reverse=True)
-    results: List[RetrievedChunk] = []
-    for score, row in scores[:k]:
-        results.append(
-            RetrievedChunk(
-                text=row["text"],
-                topic=row["topic"],
-                source=row["source"],
-                score=float(score),
             )
-        )
-return results

         intent: str = "answer",
         k: int = 3,
     ) -> List[RetrievedChunk]:
         if not self.rows:
             return []
+        combined_query = clean_math_text(query)
+        normalized_topic = (topic or "").strip().lower()
+        # Narrow search pool by topic if possible
+        candidate_rows = self.rows
+        candidate_indices = None
+        if normalized_topic:
+            exact_topic_rows = [
+                (i, row) for i, row in enumerate(self.rows)
+                if (row.get("topic") or "").strip().lower() == normalized_topic
+            ]
+            if exact_topic_rows:
+                candidate_indices = [i for i, _ in exact_topic_rows]
+                candidate_rows = [row for _, row in exact_topic_rows]
+        scores = []
+        if self.encoder is not None and self.embeddings is not None and np is not None:
+            try:
+                q = self.encoder.encode(
+                    [combined_query],
+                    convert_to_numpy=True,
+                    normalize_embeddings=True,
+                )[0]
+                if candidate_indices is None:
+                    candidate_embeddings = self.embeddings
+                else:
+                    candidate_embeddings = self.embeddings[candidate_indices]
+                semantic_scores = candidate_embeddings @ q
+                for row, sem in zip(candidate_rows, semantic_scores.tolist()):
+                    lexical = score_token_overlap(combined_query, row["text"])
+                    bonus = self._topic_bonus(topic, row["topic"], intent)
+                    total = 0.7 * sem + 0.3 * lexical + bonus
+                    scores.append((total, row))
+            except Exception:
+                scores = []
+        if not scores:
+            for row in candidate_rows:
                 lexical = score_token_overlap(combined_query, row["text"])
                 bonus = self._topic_bonus(topic, row["topic"], intent)
+                scores.append((lexical + bonus, row))
+        scores.sort(key=lambda x: x[0], reverse=True)
+        results: List[RetrievedChunk] = []
+        for score, row in scores[:k]:
+            results.append(
+                RetrievedChunk(
+                    text=row["text"],
+                    topic=row["topic"],
+                    source=row["source"],
+                    score=float(score),
+                )
             )
+        return results