Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

j-js committed 25 days ago

Commit

a214825

verified ·

1 Parent(s): 75970da

Update retrieval_engine.py

Browse files

Files changed (1) hide show

retrieval_engine.py +85 -18

retrieval_engine.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import json
 import os
-from typing import List, Optional
 from models import RetrievedChunk
 from utils import clean_math_text, score_token_overlap
@@ -24,18 +24,24 @@ class RetrievalEngine:
         self.rows = self._load_rows(data_path)
         self.encoder = None
         self.embeddings = None
         if SentenceTransformer is not None and self.rows:
             try:
                 self.encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-                self.embeddings = self.encoder.encode([r["text"] for r in self.rows], convert_to_numpy=True, normalize_embeddings=True)
             except Exception:
                 self.encoder = None
                 self.embeddings = None
-    def _load_rows(self, data_path: str):
-        rows = []
         if not os.path.exists(data_path):
             return rows
         with open(data_path, "r", encoding="utf-8") as f:
             for line in f:
                 line = line.strip()
@@ -45,43 +51,95 @@ class RetrievalEngine:
                     item = json.loads(line)
                 except Exception:
                     continue
-                rows.append({
-                    "text": item.get("text", ""),
-                    "topic": item.get("topic", item.get("section", "general")) or "general",
-                    "source": item.get("source", "local_corpus"),
-                })
         return rows
     def _topic_bonus(self, desired_topic: str, row_topic: str, intent: str) -> float:
         desired_topic = (desired_topic or "").lower()
         row_topic = (row_topic or "").lower()
         intent = (intent or "").lower()
         bonus = 0.0
         if desired_topic and desired_topic in row_topic:
             bonus += 1.25
         if desired_topic == "algebra" and row_topic in {"algebra", "linear equations", "equations"}:
             bonus += 1.0
         if desired_topic == "percent" and "percent" in row_topic:
             bonus += 1.0
-        if intent in {"method", "step_by_step", "full_working", "hint"}:
-            if any(k in row_topic for k in ["algebra", "percent", "fractions", "word_problems", "general"]):
                 bonus += 0.25
         return bonus
-    def search(self, query: str, topic: str = "", intent: str = "answer", k: int = 3) -> List[RetrievedChunk]:
         if not self.rows:
             return []
-        combined_query = clean_math_text(query)
         scores = []
         if self.encoder is not None and self.embeddings is not None and np is not None:
             try:
-                q = self.encoder.encode([combined_query], convert_to_numpy=True, normalize_embeddings=True)[0]
                 semantic_scores = self.embeddings @ q
                 for row, sem in zip(self.rows, semantic_scores.tolist()):
                     lexical = score_token_overlap(combined_query, row["text"])
                     bonus = self._topic_bonus(topic, row["topic"], intent)
-                    scores.append((0.7 * sem + 0.3 * lexical + bonus, row))
             except Exception:
                 scores = []
@@ -92,7 +150,16 @@ class RetrievalEngine:
                 scores.append((lexical + bonus, row))
         scores.sort(key=lambda x: x[0], reverse=True)
-        results = []
         for score, row in scores[:k]:
-            results.append(RetrievedChunk(text=row["text"], topic=row["topic"], source=row["source"], score=float(score)))
-        return results

 import json
 import os
+from typing import List
 from models import RetrievedChunk
 from utils import clean_math_text, score_token_overlap
         self.rows = self._load_rows(data_path)
         self.encoder = None
         self.embeddings = None
         if SentenceTransformer is not None and self.rows:
             try:
                 self.encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+                self.embeddings = self.encoder.encode(
+                    [r["text"] for r in self.rows],
+                    convert_to_numpy=True,
+                    normalize_embeddings=True,
+                )
             except Exception:
                 self.encoder = None
                 self.embeddings = None
+    def _load_rows(self, data_path: str) -> List[dict]:
+        rows: List[dict] = []
         if not os.path.exists(data_path):
             return rows
         with open(data_path, "r", encoding="utf-8") as f:
             for line in f:
                 line = line.strip()
                     item = json.loads(line)
                 except Exception:
                     continue
+                rows.append(
+                    {
+                        "text": item.get("text", ""),
+                        "topic": item.get("topic", item.get("section", "general")) or "general",
+                        "source": item.get("source", "local_corpus"),
+                    }
+                )
         return rows
     def _topic_bonus(self, desired_topic: str, row_topic: str, intent: str) -> float:
         desired_topic = (desired_topic or "").lower()
         row_topic = (row_topic or "").lower()
         intent = (intent or "").lower()
         bonus = 0.0
         if desired_topic and desired_topic in row_topic:
             bonus += 1.25
         if desired_topic == "algebra" and row_topic in {"algebra", "linear equations", "equations"}:
             bonus += 1.0
         if desired_topic == "percent" and "percent" in row_topic:
             bonus += 1.0
+        if desired_topic in {"number_theory", "number_properties"} and any(
+            k in row_topic for k in ["number", "divisible", "remainder", "prime", "factor"]
+        ):
+            bonus += 1.0
+        if desired_topic == "geometry" and any(
+            k in row_topic for k in ["geometry", "circle", "triangle", "area", "perimeter"]
+        ):
+            bonus += 1.0
+        if desired_topic == "probability" and "probability" in row_topic:
+            bonus += 1.0
+        if desired_topic == "statistics" and any(
+            k in row_topic for k in ["statistics", "mean", "median", "average", "distribution"]
+        ):
+            bonus += 1.0
+        if intent in {"method", "step_by_step", "full_working", "hint", "walkthrough", "instruction"}:
+            if any(
+                k in row_topic
+                for k in [
+                    "algebra",
+                    "percent",
+                    "fractions",
+                    "word_problems",
+                    "general",
+                    "ratio",
+                    "probability",
+                    "statistics",
+                ]
+            ):
                 bonus += 0.25
         return bonus
+    def search(
+        self,
+        query: str,
+        topic: str = "",
+        intent: str = "answer",
+        k: int = 3,
+    ) -> List[RetrievedChunk]:
         if not self.rows:
             return []
+        combined_query = clean_math_text(query)
         scores = []
         if self.encoder is not None and self.embeddings is not None and np is not None:
             try:
+                q = self.encoder.encode(
+                    [combined_query],
+                    convert_to_numpy=True,
+                    normalize_embeddings=True,
+                )[0]
                 semantic_scores = self.embeddings @ q
                 for row, sem in zip(self.rows, semantic_scores.tolist()):
                     lexical = score_token_overlap(combined_query, row["text"])
                     bonus = self._topic_bonus(topic, row["topic"], intent)
+                    total = 0.7 * sem + 0.3 * lexical + bonus
+                    scores.append((total, row))
             except Exception:
                 scores = []
                 scores.append((lexical + bonus, row))
         scores.sort(key=lambda x: x[0], reverse=True)
+        results: List[RetrievedChunk] = []
         for score, row in scores[:k]:
+            results.append(
+                RetrievedChunk(
+                    text=row["text"],
+                    topic=row["topic"],
+                    source=row["source"],
+                    score=float(score),
+                )
+            )
+        return results