Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

j-js committed 14 days ago

Commit

bcf6874

verified ·

1 Parent(s): 010a947

Update question_support_loader.py

Browse files

Files changed (1) hide show

question_support_loader.py +53 -188

question_support_loader.py CHANGED Viewed

@@ -2,7 +2,6 @@ from __future__ import annotations
 import json
 import re
-from difflib import SequenceMatcher
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
@@ -14,78 +13,34 @@ class QuestionSupportBank:
         self._loaded = False
         self._by_id: Dict[str, Dict[str, Any]] = {}
         self._by_text: Dict[str, Dict[str, Any]] = {}
-        self._by_canonical_text: Dict[str, Dict[str, Any]] = {}
         self._by_signature: Dict[str, Dict[str, Any]] = {}
-        self._by_signature_nolabels: Dict[str, Dict[str, Any]] = {}
         self._items: List[Dict[str, Any]] = []
     def _normalize(self, text: Optional[str]) -> str:
         cleaned = (text or "").strip().lower()
         cleaned = cleaned.replace("’", "'")
-        cleaned = cleaned.replace("‘", "'")
-        cleaned = cleaned.replace("“", '"').replace("”", '"')
-        cleaned = cleaned.replace("–", "-").replace("—", "-")
-        cleaned = cleaned.replace("×", "x")
         cleaned = re.sub(r"\s+", " ", cleaned)
         return cleaned
-    def _canonicalize_text(self, text: Optional[str]) -> str:
-        cleaned = self._normalize(text)
-        if not cleaned:
-            return ""
-        cleaned = re.sub(r"\b([a-e])\s*[\)\.]\s*", " ", cleaned)
-        cleaned = re.sub(r"\boption\s+[a-e]\b", " ", cleaned)
-        cleaned = re.sub(r"\bchoices?\s*:\s*", " ", cleaned)
-        cleaned = re.sub(r"\s*\?\s*$", "", cleaned)
-        cleaned = re.sub(r"\s*[:;,]\s*", " ", cleaned)
-        cleaned = re.sub(r"\s*([=+\-/*()])\s*", r" \1 ", cleaned)
-        cleaned = re.sub(r"[^a-z0-9%/=+\-/*(). ]+", " ", cleaned)
-        cleaned = re.sub(r"\s+", " ", cleaned).strip()
-        return cleaned
     def _tokenize(self, text: Optional[str]) -> List[str]:
-        canon = self._canonicalize_text(text)
-        return re.findall(r"[a-z0-9%/\.]+", canon)
     def _normalize_choice(self, value: Any) -> str:
-        text = self._canonicalize_text(str(value) if value is not None else "")
-        text = re.sub(r"^([a-e])\s*[\)\.]\s*", "", text).strip()
-        return text
     def _choice_signature(self, choices: Optional[List[Any]]) -> str:
         cleaned = [self._normalize_choice(choice) for choice in (choices or []) if self._normalize_choice(choice)]
         return " || ".join(cleaned)
-    def _choice_signature_nolabels(self, choices: Optional[List[Any]]) -> str:
-        cleaned = sorted([self._normalize_choice(choice) for choice in (choices or []) if self._normalize_choice(choice)])
-        return " || ".join(cleaned)
     def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
-        q = self._canonicalize_text(question_text)
         c = self._choice_signature(choices)
         return f"{q} ## {c}" if c else q
-    def _question_signature_nolabels(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
-        q = self._canonicalize_text(question_text)
-        c = self._choice_signature_nolabels(choices)
-        return f"{q} ## {c}" if c else q
-    def _shingles(self, text: Optional[str], size: int = 3) -> set[str]:
-        tokens = self._tokenize(text)
-        if len(tokens) < size:
-            return {" ".join(tokens)} if tokens else set()
-        return {" ".join(tokens[i : i + size]) for i in range(len(tokens) - size + 1)}
-    def _ensure_loaded(self) -> None:
-        if not self._loaded:
-            self.load()
     def load(self) -> None:
         self._by_id = {}
         self._by_text = {}
-        self._by_canonical_text = {}
         self._by_signature = {}
-        self._by_signature_nolabels = {}
         self._items = []
         if self.data_path.exists():
@@ -102,6 +57,10 @@ class QuestionSupportBank:
         self._loaded = True
     def _store_item(self, item: Dict[str, Any]) -> None:
         if not isinstance(item, dict):
             return
@@ -111,102 +70,45 @@ class QuestionSupportBank:
         stem = stored.get("question_text") or stored.get("stem") or ""
         choices = stored.get("options_text") or stored.get("choices") or []
-        raw_text = self._normalize(stem)
-        canonical_text = self._canonicalize_text(stem)
         signature = self._question_signature(stem, choices)
-        signature_nolabels = self._question_signature_nolabels(stem, choices)
         if qid:
             self._by_id[qid] = stored
-        if raw_text:
-            self._by_text[raw_text] = stored
-        if canonical_text:
-            self._by_canonical_text[canonical_text] = stored
         if signature:
             self._by_signature[signature] = stored
-        if signature_nolabels:
-            self._by_signature_nolabels[signature_nolabels] = stored
         self._items.append(stored)
-    def _clone_with_match(self, item: Dict[str, Any], match: Dict[str, Any]) -> Dict[str, Any]:
-        out = dict(item)
-        out["support_match"] = match
-        return out
     def _score_candidate(
         self,
         *,
         query_text: str,
         query_choices: Optional[List[Any]],
         candidate: Dict[str, Any],
-    ) -> Dict[str, float]:
         cand_text = candidate.get("question_text") or candidate.get("stem") or ""
         cand_choices = candidate.get("options_text") or candidate.get("choices") or []
-        query_norm = self._canonicalize_text(query_text)
-        cand_norm = self._canonicalize_text(cand_text)
-        q_tokens = set(self._tokenize(query_norm))
-        c_tokens = set(self._tokenize(cand_norm))
-        token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1) if q_tokens and c_tokens else 0.0
-        q_shingles = self._shingles(query_norm)
-        c_shingles = self._shingles(cand_norm)
-        shingle_overlap = len(q_shingles & c_shingles) / max(len(q_shingles | c_shingles), 1) if q_shingles and c_shingles else 0.0
-        seq = SequenceMatcher(None, query_norm, cand_norm).ratio() if query_norm and cand_norm else 0.0
-        q_nums = set(re.findall(r"\d+(?:\.\d+)?%?", query_norm))
-        c_nums = set(re.findall(r"\d+(?:\.\d+)?%?", cand_norm))
-        number_overlap = len(q_nums & c_nums) / max(len(q_nums | c_nums), 1) if q_nums and c_nums else (1.0 if not q_nums and not c_nums else 0.0)
         q_choice_sig = self._choice_signature(query_choices)
         c_choice_sig = self._choice_signature(cand_choices)
-        q_choice_sig_nl = self._choice_signature_nolabels(query_choices)
-        c_choice_sig_nl = self._choice_signature_nolabels(cand_choices)
-        choice_match = 1.0 if q_choice_sig and c_choice_sig and q_choice_sig == c_choice_sig else 0.0
-        choice_set_match = 1.0 if q_choice_sig_nl and c_choice_sig_nl and q_choice_sig_nl == c_choice_sig_nl else 0.0
-        exact_text = 1.0 if query_norm and query_norm == cand_norm else 0.0
-        exact_signature = 1.0 if self._question_signature(query_text, query_choices) == self._question_signature(cand_text, cand_choices) else 0.0
-        exact_signature_nolabels = 1.0 if self._question_signature_nolabels(query_text, query_choices) == self._question_signature_nolabels(cand_text, cand_choices) else 0.0
-        score = (
-            0.28 * exact_text
-            + 0.18 * exact_signature
-            + 0.08 * exact_signature_nolabels
-            + 0.16 * choice_match
-            + 0.08 * choice_set_match
-            + 0.12 * token_overlap
-            + 0.06 * shingle_overlap
-            + 0.02 * number_overlap
-            + 0.02 * seq
-        )
-        return {
-            "score": round(score, 6),
-            "token_overlap": round(token_overlap, 6),
-            "shingle_overlap": round(shingle_overlap, 6),
-            "sequence_ratio": round(seq, 6),
-            "number_overlap": round(number_overlap, 6),
-            "choice_match": round(choice_match, 6),
-            "choice_set_match": round(choice_set_match, 6),
-            "exact_text": round(exact_text, 6),
-            "exact_signature": round(exact_signature, 6),
-            "exact_signature_nolabels": round(exact_signature_nolabels, 6),
-        }
-    def _confidence_label(self, metrics: Dict[str, float]) -> str:
-        score = metrics.get("score", 0.0)
-        if metrics.get("exact_signature", 0.0) >= 1.0 or metrics.get("exact_text", 0.0) >= 1.0:
-            return "exact"
-        if score >= 0.82:
-            return "high"
-        if score >= 0.70:
-            return "medium"
-        return "low"
     def get(
         self,
@@ -216,86 +118,49 @@ class QuestionSupportBank:
     ) -> Optional[Dict[str, Any]]:
         self._ensure_loaded()
         qid = str(question_id or "").strip()
-        raw_text = self._normalize(question_text)
-        canonical_text = self._canonicalize_text(question_text)
-        signature = self._question_signature(question_text, options_text)
-        signature_nolabels = self._question_signature_nolabels(question_text, options_text)
         if qid and qid in self._by_id:
-            return self._clone_with_match(
-                self._by_id[qid],
-                {"mode": "question_id", "confidence": "exact", "score": 1.0},
-            )
-        if signature and signature in self._by_signature:
-            return self._clone_with_match(
-                self._by_signature[signature],
-                {"mode": "signature", "confidence": "exact", "score": 0.995},
-            )
-        if signature_nolabels and signature_nolabels in self._by_signature_nolabels:
-            return self._clone_with_match(
-                self._by_signature_nolabels[signature_nolabels],
-                {"mode": "signature_nolabels", "confidence": "exact", "score": 0.99},
-            )
-        if raw_text and raw_text in self._by_text:
-            return self._clone_with_match(
-                self._by_text[raw_text],
-                {"mode": "question_text", "confidence": "exact", "score": 0.985},
-            )
-        if canonical_text and canonical_text in self._by_canonical_text:
-            return self._clone_with_match(
-                self._by_canonical_text[canonical_text],
-                {"mode": "canonical_text", "confidence": "exact", "score": 0.98},
-            )
-        if not canonical_text:
             return None
         best: Optional[Dict[str, Any]] = None
-        best_metrics: Optional[Dict[str, float]] = None
         for item in self._items:
-            metrics = self._score_candidate(
                 query_text=question_text or "",
                 query_choices=options_text,
                 candidate=item,
             )
-            if best_metrics is None or metrics["score"] > best_metrics["score"]:
                 best = item
-                best_metrics = metrics
-        if best is None or best_metrics is None:
-            return None
-        confidence = self._confidence_label(best_metrics)
-        score = best_metrics["score"]
-        accept = False
-        if confidence == "exact":
-            accept = True
-        elif score >= 0.82:
-            accept = True
-        elif best_metrics.get("choice_set_match", 0.0) >= 1.0 and best_metrics.get("token_overlap", 0.0) >= 0.55:
-            accept = True
-        elif best_metrics.get("shingle_overlap", 0.0) >= 0.72 and best_metrics.get("sequence_ratio", 0.0) >= 0.84:
-            accept = True
-        elif best_metrics.get("number_overlap", 0.0) >= 1.0 and best_metrics.get("token_overlap", 0.0) >= 0.68:
-            accept = True
-        if not accept:
-            return None
-        return self._clone_with_match(
-            best,
-            {
                 "mode": "fuzzy",
-                "confidence": confidence,
-                **best_metrics,
-            },
-        )
     def upsert(self, item: Dict[str, Any]) -> None:
         self._ensure_loaded()

 import json
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
         self._loaded = False
         self._by_id: Dict[str, Dict[str, Any]] = {}
         self._by_text: Dict[str, Dict[str, Any]] = {}
         self._by_signature: Dict[str, Dict[str, Any]] = {}
         self._items: List[Dict[str, Any]] = []
     def _normalize(self, text: Optional[str]) -> str:
         cleaned = (text or "").strip().lower()
         cleaned = cleaned.replace("’", "'")
         cleaned = re.sub(r"\s+", " ", cleaned)
         return cleaned
     def _tokenize(self, text: Optional[str]) -> List[str]:
+        return re.findall(r"[a-z0-9%/]+", self._normalize(text))
     def _normalize_choice(self, value: Any) -> str:
+        return self._normalize(str(value) if value is not None else "")
     def _choice_signature(self, choices: Optional[List[Any]]) -> str:
         cleaned = [self._normalize_choice(choice) for choice in (choices or []) if self._normalize_choice(choice)]
         return " || ".join(cleaned)
     def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
+        q = self._normalize(question_text)
         c = self._choice_signature(choices)
         return f"{q} ## {c}" if c else q
     def load(self) -> None:
         self._by_id = {}
         self._by_text = {}
         self._by_signature = {}
         self._items = []
         if self.data_path.exists():
         self._loaded = True
+    def _ensure_loaded(self) -> None:
+        if not self._loaded:
+            self.load()
     def _store_item(self, item: Dict[str, Any]) -> None:
         if not isinstance(item, dict):
             return
         stem = stored.get("question_text") or stored.get("stem") or ""
         choices = stored.get("options_text") or stored.get("choices") or []
+        qtext = self._normalize(stem)
         signature = self._question_signature(stem, choices)
         if qid:
             self._by_id[qid] = stored
+        if qtext:
+            self._by_text[qtext] = stored
         if signature:
             self._by_signature[signature] = stored
         self._items.append(stored)
     def _score_candidate(
         self,
         *,
         query_text: str,
         query_choices: Optional[List[Any]],
         candidate: Dict[str, Any],
+    ) -> Tuple[float, float, float]:
         cand_text = candidate.get("question_text") or candidate.get("stem") or ""
         cand_choices = candidate.get("options_text") or candidate.get("choices") or []
+        q_tokens = set(self._tokenize(query_text))
+        c_tokens = set(self._tokenize(cand_text))
+        if not q_tokens or not c_tokens:
+            token_overlap = 0.0
+        else:
+            token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1)
         q_choice_sig = self._choice_signature(query_choices)
         c_choice_sig = self._choice_signature(cand_choices)
+        if q_choice_sig and c_choice_sig:
+            choice_match = 1.0 if q_choice_sig == c_choice_sig else 0.0
+        else:
+            choice_match = 0.0
+        exact_text = 1.0 if self._normalize(query_text) == self._normalize(cand_text) else 0.0
+        score = (0.55 * token_overlap) + (0.35 * choice_match) + (0.10 * exact_text)
+        return score, token_overlap, choice_match
     def get(
         self,
     ) -> Optional[Dict[str, Any]]:
         self._ensure_loaded()
         qid = str(question_id or "").strip()
         if qid and qid in self._by_id:
+            return dict(self._by_id[qid])
+        qtext = self._normalize(question_text)
+        if qtext and qtext in self._by_text:
+            return dict(self._by_text[qtext])
+        signature = self._question_signature(question_text, options_text)
+        if signature and signature in self._by_signature:
+            return dict(self._by_signature[signature])
+        if not qtext:
             return None
         best: Optional[Dict[str, Any]] = None
+        best_score = 0.0
+        best_overlap = 0.0
+        best_choice = 0.0
         for item in self._items:
+            score, token_overlap, choice_match = self._score_candidate(
                 query_text=question_text or "",
                 query_choices=options_text,
                 candidate=item,
             )
+            if score > best_score:
                 best = item
+                best_score = score
+                best_overlap = token_overlap
+                best_choice = choice_match
+        threshold = 0.84 if options_text else 0.92
+        if best is not None and (best_score >= threshold or (best_choice >= 1.0 and best_overlap >= 0.55)):
+            out = dict(best)
+            out.setdefault("support_match", {})
+            out["support_match"] = {
                 "mode": "fuzzy",
+                "score": round(best_score, 4),
+                "token_overlap": round(best_overlap, 4),
+                "choice_match": round(best_choice, 4),
+            }
+            return out
+        return None
     def upsert(self, item: Dict[str, Any]) -> None:
         self._ensure_loaded()