Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

j-js committed 14 days ago

Commit

efac600

verified ·

1 Parent(s): 40b145a

Update question_support_loader.py

Browse files

Files changed (1) hide show

question_support_loader.py +112 -9

question_support_loader.py CHANGED Viewed

@@ -1,8 +1,9 @@
 from __future__ import annotations
 import json
 from pathlib import Path
-from typing import Any, Dict, List, Optional
 class QuestionSupportBank:
@@ -12,13 +13,35 @@ class QuestionSupportBank:
         self._loaded = False
         self._by_id: Dict[str, Dict[str, Any]] = {}
         self._by_text: Dict[str, Dict[str, Any]] = {}
     def _normalize(self, text: Optional[str]) -> str:
-        return " ".join((text or "").strip().lower().split())
     def load(self) -> None:
         self._by_id = {}
         self._by_text = {}
         if self.data_path.exists():
             with self.data_path.open("r", encoding="utf-8") as handle:
@@ -41,14 +64,58 @@ class QuestionSupportBank:
     def _store_item(self, item: Dict[str, Any]) -> None:
         if not isinstance(item, dict):
             return
-        qid = str(item.get("question_id") or "").strip()
-        qtext = self._normalize(item.get("question_text") or item.get("stem") or "")
         if qid:
-            self._by_id[qid] = item
         if qtext:
-            self._by_text[qtext] = item
-    def get(self, question_id: Optional[str] = None, question_text: Optional[str] = None) -> Optional[Dict[str, Any]]:
         self._ensure_loaded()
         qid = str(question_id or "").strip()
         if qid and qid in self._by_id:
@@ -57,6 +124,42 @@ class QuestionSupportBank:
         qtext = self._normalize(question_text)
         if qtext and qtext in self._by_text:
             return dict(self._by_text[qtext])
         return None
     def upsert(self, item: Dict[str, Any]) -> None:
@@ -65,7 +168,7 @@ class QuestionSupportBank:
     def all_items(self) -> List[Dict[str, Any]]:
         self._ensure_loaded()
-        return [dict(v) for v in self._by_id.values()]
-question_support_bank = QuestionSupportBank()

 from __future__ import annotations
 import json
+import re
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
 class QuestionSupportBank:
         self._loaded = False
         self._by_id: Dict[str, Dict[str, Any]] = {}
         self._by_text: Dict[str, Dict[str, Any]] = {}
+        self._by_signature: Dict[str, Dict[str, Any]] = {}
+        self._items: List[Dict[str, Any]] = []
     def _normalize(self, text: Optional[str]) -> str:
         # Canonical comparison form of a piece of text. None (or any falsy
         # value) is treated as "", then the text is stripped and lower-cased.
+        cleaned = (text or "").strip().lower()
         # Fold the typographic apostrophe (U+2019) to the ASCII one so both
         # spellings compare equal. Only this one character is folded; other
         # curly quotes are left unchanged.
+        cleaned = cleaned.replace("’", "'")
         # Collapse every run of whitespace (spaces, tabs, newlines) to a
         # single space.
+        cleaned = re.sub(r"\s+", " ", cleaned)
+        return cleaned
+    def _tokenize(self, text: Optional[str]) -> List[str]:
         # Split the normalized text into tokens. A token is a maximal run of
         # a-z, 0-9, "%" or "/"; every other character (punctuation, spaces,
         # non-ASCII letters) acts as a separator and is dropped.
+        return re.findall(r"[a-z0-9%/]+", self._normalize(text))
+    def _normalize_choice(self, value: Any) -> str:
         # Normalize one answer choice. None maps to ""; any other value is
         # converted with str() first, so numeric choices are accepted.
+        return self._normalize(str(value) if value is not None else "")
+    def _choice_signature(self, choices: Optional[List[Any]]) -> str:
         # Build a single string identifying a list of answer choices:
         # normalize each choice, drop the ones that normalize to "", and join
         # the rest with " || " in their given order. Because order is kept,
         # the same choices in a different order give a different signature.
         # A None or empty list yields "".
         # NOTE(review): each choice is normalized twice here (once for the
         # filter, once for the value).
+        cleaned = [self._normalize_choice(choice) for choice in (choices or []) if self._normalize_choice(choice)]
+        return " || ".join(cleaned)
+    def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
         # Combined lookup key for a question: "<normalized text> ## <choice
         # signature>". When there are no usable choices the key is just the
         # normalized text, i.e. the same string _normalize() returns.
+        q = self._normalize(question_text)
+        c = self._choice_signature(choices)
+        return f"{q} ## {c}" if c else q
     def load(self) -> None:
         self._by_id = {}
         self._by_text = {}
+        self._by_signature = {}
+        self._items = []
         if self.data_path.exists():
             with self.data_path.open("r", encoding="utf-8") as handle:
     def _store_item(self, item: Dict[str, Any]) -> None:
         # Index one record under its question id, its normalized text, and
         # its text+choices signature, and append it to the ordered item list.
         # Anything that is not a dict is ignored silently.
         if not isinstance(item, dict):
             return
         # Shallow copy: later changes to the caller's top-level keys do not
         # affect the stored record (nested values are still shared).
+        stored = dict(item)
+        qid = str(stored.get("question_id") or "").strip()
         # "question_text" / "options_text" take precedence; "stem" / "choices"
         # are used only when the preferred key is missing or falsy.
+        stem = stored.get("question_text") or stored.get("stem") or ""
+        choices = stored.get("options_text") or stored.get("choices") or []
+        qtext = self._normalize(stem)
+        signature = self._question_signature(stem, choices)
         # Each index is last-write-wins: a later record with the same key
         # replaces the earlier one in that index.
         if qid:
+            self._by_id[qid] = stored
         if qtext:
+            self._by_text[qtext] = stored
+        if signature:
+            self._by_signature[signature] = stored
         # NOTE(review): this append is unconditional, so storing a record whose
         # id/text already exists replaces the index entries above but leaves
         # the earlier record in _items as well. TODO confirm whether callers
         # that re-store records expect duplicates in _items.
+        self._items.append(stored)
+    def _score_candidate(
+        self,
+        *,
+        query_text: str,
+        query_choices: Optional[List[Any]],
+        candidate: Dict[str, Any],
+    ) -> Tuple[float, float, float]:
         # Score how well a stored record matches a query. Returns the tuple
         # (score, token_overlap, choice_match), each in the range 0.0-1.0.
         # The candidate's text and choices are read with the same key
         # fallbacks used when storing ("question_text" then "stem",
         # "options_text" then "choices").
+        cand_text = candidate.get("question_text") or candidate.get("stem") or ""
+        cand_choices = candidate.get("options_text") or candidate.get("choices") or []
+        q_tokens = set(self._tokenize(query_text))
+        c_tokens = set(self._tokenize(cand_text))
         # token_overlap = |shared tokens| / |all distinct tokens| over the two
         # token sets; it is 0.0 when either side has no tokens.
+        if not q_tokens or not c_tokens:
+            token_overlap = 0.0
+        else:
             # Both sets are non-empty here, so the union is never empty and
             # the max(..., 1) guard never changes the result.
+            token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1)
+        q_choice_sig = self._choice_signature(query_choices)
+        c_choice_sig = self._choice_signature(cand_choices)
         # choice_match is all-or-nothing: 1.0 only when both sides have
         # choices and their signatures are identical (same normalized choices
         # in the same order); there is no partial credit.
+        if q_choice_sig and c_choice_sig:
+            choice_match = 1.0 if q_choice_sig == c_choice_sig else 0.0
+        else:
+            choice_match = 0.0
+        exact_text = 1.0 if self._normalize(query_text) == self._normalize(cand_text) else 0.0
         # Weighted sum: 55% token overlap, 35% choice match, 10% exact
         # normalized-text equality. exact_text is folded into the score only
         # and is not returned separately.
+        score = (0.55 * token_overlap) + (0.35 * choice_match) + (0.10 * exact_text)
+        return score, token_overlap, choice_match
+    def get(
+        self,
+        question_id: Optional[str] = None,
+        question_text: Optional[str] = None,
+        options_text: Optional[List[Any]] = None,
+    ) -> Optional[Dict[str, Any]]:
         # Look up a stored record, trying in order: exact id, exact
         # normalized text, exact text+choices signature, then a fuzzy scan
         # over every stored record. Returns a shallow copy of the record, or
         # None when nothing matches.
         self._ensure_loaded()
         qid = str(question_id or "").strip()
         if qid and qid in self._by_id:
         # (The body of the id-hit branch above is not shown in this view.)
         qtext = self._normalize(question_text)
         if qtext and qtext in self._by_text:
             return dict(self._by_text[qtext])
+        signature = self._question_signature(question_text, options_text)
+        if signature and signature in self._by_signature:
+            return dict(self._by_signature[signature])
         # Fuzzy matching needs some question text to compare against.
+        if not qtext:
+            return None
+        best: Optional[Dict[str, Any]] = None
+        best_score = 0.0
+        best_overlap = 0.0
+        best_choice = 0.0
         # Linear scan keeping the highest-scoring record. The comparison is
         # strict, so ties keep the earliest record and a record scoring 0.0 is
         # never selected.
+        for item in self._items:
+            score, token_overlap, choice_match = self._score_candidate(
+                query_text=question_text or "",
+                query_choices=options_text,
+                candidate=item,
+            )
+            if score > best_score:
+                best = item
+                best_score = score
+                best_overlap = token_overlap
+                best_choice = choice_match
         # Accept the best record if its score reaches the threshold, or if
         # the choices match exactly and at least 55% of tokens overlap.
         # NOTE(review): with the weights in _score_candidate (0.55 / 0.35 /
         # 0.10) a record without a choice match scores at most 0.65, so
         # neither threshold looks reachable without choice_match == 1.0, and
         # in that case the second clause already accepts at a lower overlap.
         # If so, a query without options_text can never fuzzy-match. TODO
         # confirm the intended thresholds.
+        threshold = 0.84 if options_text else 0.92
+        if best is not None and (best_score >= threshold or (best_choice >= 1.0 and best_overlap >= 0.55)):
+            out = dict(best)
             # NOTE(review): this setdefault has no lasting effect; the key is
             # overwritten unconditionally by the assignment that follows.
+            out.setdefault("support_match", {})
             # Attach match diagnostics to the returned copy only; the stored
             # record is not modified.
+            out["support_match"] = {
+                "mode": "fuzzy",
+                "score": round(best_score, 4),
+                "token_overlap": round(best_overlap, 4),
+                "choice_match": round(best_choice, 4),
+            }
+            return out
         return None
     def upsert(self, item: Dict[str, Any]) -> None:
     def all_items(self) -> List[Dict[str, Any]]:
         # Return a shallow copy of every record in _items, in list order.
         # _ensure_loaded() is defined outside this view; presumably it loads
         # the data file on first use (TODO confirm).
         self._ensure_loaded()
+        return [dict(v) for v in self._items]
+question_support_bank = QuestionSupportBank()