Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

j-js committed 14 days ago

Commit

076a07d

verified ·

1 Parent(s): efac600

Update question_support_loader.py

Browse files

Files changed (1) hide show

question_support_loader.py +188 -53

question_support_loader.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import json
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
@@ -13,34 +14,78 @@ class QuestionSupportBank:
         self._loaded = False
         self._by_id: Dict[str, Dict[str, Any]] = {}
         self._by_text: Dict[str, Dict[str, Any]] = {}
         self._by_signature: Dict[str, Dict[str, Any]] = {}
         self._items: List[Dict[str, Any]] = []
     def _normalize(self, text: Optional[str]) -> str:
         cleaned = (text or "").strip().lower()
         cleaned = cleaned.replace("’", "'")
         cleaned = re.sub(r"\s+", " ", cleaned)
         return cleaned
     def _tokenize(self, text: Optional[str]) -> List[str]:
-        return re.findall(r"[a-z0-9%/]+", self._normalize(text))
     def _normalize_choice(self, value: Any) -> str:
-        return self._normalize(str(value) if value is not None else "")
     def _choice_signature(self, choices: Optional[List[Any]]) -> str:
         cleaned = [self._normalize_choice(choice) for choice in (choices or []) if self._normalize_choice(choice)]
         return " || ".join(cleaned)
     def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
-        q = self._normalize(question_text)
         c = self._choice_signature(choices)
         return f"{q} ## {c}" if c else q
     def load(self) -> None:
         self._by_id = {}
         self._by_text = {}
         self._by_signature = {}
         self._items = []
         if self.data_path.exists():
@@ -57,10 +102,6 @@ class QuestionSupportBank:
         self._loaded = True
-    def _ensure_loaded(self) -> None:
-        if not self._loaded:
-            self.load()
     def _store_item(self, item: Dict[str, Any]) -> None:
         if not isinstance(item, dict):
             return
@@ -70,45 +111,102 @@ class QuestionSupportBank:
         stem = stored.get("question_text") or stored.get("stem") or ""
         choices = stored.get("options_text") or stored.get("choices") or []
-        qtext = self._normalize(stem)
         signature = self._question_signature(stem, choices)
         if qid:
             self._by_id[qid] = stored
-        if qtext:
-            self._by_text[qtext] = stored
         if signature:
             self._by_signature[signature] = stored
         self._items.append(stored)
     def _score_candidate(
         self,
         *,
         query_text: str,
         query_choices: Optional[List[Any]],
         candidate: Dict[str, Any],
-    ) -> Tuple[float, float, float]:
         cand_text = candidate.get("question_text") or candidate.get("stem") or ""
         cand_choices = candidate.get("options_text") or candidate.get("choices") or []
-        q_tokens = set(self._tokenize(query_text))
-        c_tokens = set(self._tokenize(cand_text))
-        if not q_tokens or not c_tokens:
-            token_overlap = 0.0
-        else:
-            token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1)
         q_choice_sig = self._choice_signature(query_choices)
         c_choice_sig = self._choice_signature(cand_choices)
-        if q_choice_sig and c_choice_sig:
-            choice_match = 1.0 if q_choice_sig == c_choice_sig else 0.0
-        else:
-            choice_match = 0.0
-        exact_text = 1.0 if self._normalize(query_text) == self._normalize(cand_text) else 0.0
-        score = (0.55 * token_overlap) + (0.35 * choice_match) + (0.10 * exact_text)
-        return score, token_overlap, choice_match
     def get(
         self,
@@ -118,49 +216,86 @@ class QuestionSupportBank:
     ) -> Optional[Dict[str, Any]]:
         self._ensure_loaded()
         qid = str(question_id or "").strip()
-        if qid and qid in self._by_id:
-            return dict(self._by_id[qid])
-        qtext = self._normalize(question_text)
-        if qtext and qtext in self._by_text:
-            return dict(self._by_text[qtext])
-        signature = self._question_signature(question_text, options_text)
         if signature and signature in self._by_signature:
-            return dict(self._by_signature[signature])
-        if not qtext:
             return None
         best: Optional[Dict[str, Any]] = None
-        best_score = 0.0
-        best_overlap = 0.0
-        best_choice = 0.0
         for item in self._items:
-            score, token_overlap, choice_match = self._score_candidate(
                 query_text=question_text or "",
                 query_choices=options_text,
                 candidate=item,
             )
-            if score > best_score:
                 best = item
-                best_score = score
-                best_overlap = token_overlap
-                best_choice = choice_match
-        threshold = 0.84 if options_text else 0.92
-        if best is not None and (best_score >= threshold or (best_choice >= 1.0 and best_overlap >= 0.55)):
-            out = dict(best)
-            out.setdefault("support_match", {})
-            out["support_match"] = {
                 "mode": "fuzzy",
-                "score": round(best_score, 4),
-                "token_overlap": round(best_overlap, 4),
-                "choice_match": round(best_choice, 4),
-            }
-            return out
-        return None
     def upsert(self, item: Dict[str, Any]) -> None:
         self._ensure_loaded()

 import json
 import re
+from difflib import SequenceMatcher
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
         self._loaded = False
         self._by_id: Dict[str, Dict[str, Any]] = {}
         self._by_text: Dict[str, Dict[str, Any]] = {}
+        self._by_canonical_text: Dict[str, Dict[str, Any]] = {}
         self._by_signature: Dict[str, Dict[str, Any]] = {}
+        self._by_signature_nolabels: Dict[str, Dict[str, Any]] = {}
         self._items: List[Dict[str, Any]] = []
     def _normalize(self, text: Optional[str]) -> str:
         cleaned = (text or "").strip().lower()
         cleaned = cleaned.replace("’", "'")
+        cleaned = cleaned.replace("‘", "'")
+        cleaned = cleaned.replace("“", '"').replace("”", '"')
+        cleaned = cleaned.replace("–", "-").replace("—", "-")
+        cleaned = cleaned.replace("×", "x")
         cleaned = re.sub(r"\s+", " ", cleaned)
         return cleaned
def _canonicalize_text(self, text: Optional[str]) -> str:
    """Reduce ``text`` to a punctuation-light canonical form used for matching.

    The text is first passed through ``_normalize``; an empty result yields
    ``""``. A fixed sequence of regex rewrites is then applied and the
    result is stripped.
    """
    canon = self._normalize(text)
    if not canon:
        return ""
    # Order matters: every rule operates on the output of the previous one.
    rewrite_rules = (
        # A lone a-e letter followed by ")" or "." (option labels).
        # NOTE(review): this also fires inside ordinary text, e.g. the "a)"
        # in "f(a)" - confirm that is acceptable for the stored questions.
        (r"\b([a-e])\s*[\)\.]\s*", " "),
        # "option a" ... "option e".
        (r"\boption\s+[a-e]\b", " "),
        # "choice:" / "choices:" prefixes.
        (r"\bchoices?\s*:\s*", " "),
        # A trailing question mark.
        (r"\s*\?\s*$", ""),
        # Colons, semicolons and commas become a single space.
        (r"\s*[:;,]\s*", " "),
        # Operators and parentheses are padded with one space on each side.
        (r"\s*([=+\-/*()])\s*", r" \1 "),
        # Any run of characters outside the allowed set becomes a space.
        (r"[^a-z0-9%/=+\-/*(). ]+", " "),
        # Collapse whitespace produced by the rules above.
        (r"\s+", " "),
    )
    for pattern, replacement in rewrite_rules:
        canon = re.sub(pattern, replacement, canon)
    return canon.strip()
     def _tokenize(self, text: Optional[str]) -> List[str]:
         """Return the tokens found in the canonical form of ``text``.

         A token is a maximal run of lowercase letters, digits, ``%``, ``/``
         or ``.`` in the output of ``_canonicalize_text``.
         """
         token_pattern = r"[a-z0-9%/\.]+"
         return re.findall(token_pattern, self._canonicalize_text(text))
     def _normalize_choice(self, value: Any) -> str:
         """Canonicalise one answer option and drop a leading a-e label.

         ``None`` is treated as the empty string; any other value is
         converted with ``str`` before canonicalisation.
         """
         raw = "" if value is None else str(value)
         canon = self._canonicalize_text(raw)
         # Remove a leading "a)" / "b." style label if one is still present.
         return re.sub(r"^([a-e])\s*[\)\.]\s*", "", canon).strip()
     def _choice_signature(self, choices: Optional[List[Any]]) -> str:
         """Join the normalised, non-empty choices with ``" || "`` in input order.

         ``None`` or an empty list produces ``""``. Choices whose normalised
         form is empty are skipped.
         """
         # Normalise each choice exactly once; the result serves both as the
         # emptiness filter and as the joined value.
         normalized = (self._normalize_choice(choice) for choice in choices or [])
         return " || ".join(entry for entry in normalized if entry)
def _choice_signature_nolabels(self, choices: Optional[List[Any]]) -> str:
    """Join the normalised, non-empty choices with ``" || "`` in sorted order.

    Sorting makes the signature independent of the order in which the
    choices were supplied. ``None`` or an empty list produces ``""``.
    """
    # Normalise each choice exactly once; the result serves both as the
    # emptiness filter and as the sorted value.
    normalized = (self._normalize_choice(choice) for choice in choices or [])
    return " || ".join(sorted(entry for entry in normalized if entry))
     def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
         """Build the lookup key ``"<stem> ## <choices>"`` for a question.

         The stem is the canonical question text and the choice part comes
         from ``_choice_signature``. When the choice part is empty the stem
         alone is returned.
         """
         stem_key = self._canonicalize_text(question_text)
         choice_key = self._choice_signature(choices)
         if not choice_key:
             return stem_key
         return f"{stem_key} ## {choice_key}"
def _question_signature_nolabels(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
    """Build the lookup key ``"<stem> ## <choices>"`` using sorted choices.

    The stem is the canonical question text and the choice part comes from
    ``_choice_signature_nolabels``. When the choice part is empty the stem
    alone is returned.
    """
    stem_key = self._canonicalize_text(question_text)
    choice_key = self._choice_signature_nolabels(choices)
    if not choice_key:
        return stem_key
    return f"{stem_key} ## {choice_key}"
def _shingles(self, text: Optional[str], size: int = 3) -> set[str]:
    """Return the set of ``size``-token windows of ``text``, space-joined.

    When the text has fewer than ``size`` tokens the whole token sequence
    forms the only shingle; text with no tokens gives an empty set.
    """
    words = self._tokenize(text)
    if len(words) >= size:
        last_start = len(words) - size
        return {" ".join(words[start : start + size]) for start in range(last_start + 1)}
    # Too short for a full window: fall back to one shingle, or none.
    return {" ".join(words)} if words else set()
+    def _ensure_loaded(self) -> None:
+        if not self._loaded:
+            self.load()
     def load(self) -> None:
         self._by_id = {}
         self._by_text = {}
+        self._by_canonical_text = {}
         self._by_signature = {}
+        self._by_signature_nolabels = {}
         self._items = []
         if self.data_path.exists():
         self._loaded = True
     def _store_item(self, item: Dict[str, Any]) -> None:
         if not isinstance(item, dict):
             return
         stem = stored.get("question_text") or stored.get("stem") or ""
         choices = stored.get("options_text") or stored.get("choices") or []
+        raw_text = self._normalize(stem)
+        canonical_text = self._canonicalize_text(stem)
         signature = self._question_signature(stem, choices)
+        signature_nolabels = self._question_signature_nolabels(stem, choices)
         if qid:
             self._by_id[qid] = stored
+        if raw_text:
+            self._by_text[raw_text] = stored
+        if canonical_text:
+            self._by_canonical_text[canonical_text] = stored
         if signature:
             self._by_signature[signature] = stored
+        if signature_nolabels:
+            self._by_signature_nolabels[signature_nolabels] = stored
         self._items.append(stored)
+    def _clone_with_match(self, item: Dict[str, Any], match: Dict[str, Any]) -> Dict[str, Any]:
+        out = dict(item)
+        out["support_match"] = match
+        return out
     def _score_candidate(
         self,
         *,
         query_text: str,
         query_choices: Optional[List[Any]],
         candidate: Dict[str, Any],
     ) -> Dict[str, float]:
         """Score how closely ``candidate`` matches the queried question.

         The candidate's stem is read from ``"question_text"`` (falling back
         to ``"stem"``) and its choices from ``"options_text"`` (falling back
         to ``"choices"``).

         Returns:
             A dict of metrics, each rounded to six decimals: the weighted
             ``score`` plus ``token_overlap``, ``shingle_overlap``,
             ``sequence_ratio``, ``number_overlap``, ``choice_match``,
             ``choice_set_match``, ``exact_text``, ``exact_signature`` and
             ``exact_signature_nolabels``.
         """

         def jaccard(left: set, right: set) -> float:
             # Intersection over union; 0.0 when either side is empty.
             if not left or not right:
                 return 0.0
             return len(left & right) / max(len(left | right), 1)

         def signature(stem: str, choice_sig: str) -> str:
             # Same key layout as _question_signature / _question_signature_nolabels.
             return f"{stem} ## {choice_sig}" if choice_sig else stem

         cand_text = candidate.get("question_text") or candidate.get("stem") or ""
         cand_choices = candidate.get("options_text") or candidate.get("choices") or []

         query_norm = self._canonicalize_text(query_text)
         cand_norm = self._canonicalize_text(cand_text)

         token_overlap = jaccard(set(self._tokenize(query_norm)), set(self._tokenize(cand_norm)))
         shingle_overlap = jaccard(self._shingles(query_norm), self._shingles(cand_norm))
         seq = SequenceMatcher(None, query_norm, cand_norm).ratio() if query_norm and cand_norm else 0.0

         number_pattern = r"\d+(?:\.\d+)?%?"
         q_nums = set(re.findall(number_pattern, query_norm))
         c_nums = set(re.findall(number_pattern, cand_norm))
         if q_nums or c_nums:
             # jaccard() is 0.0 when only one side contains numbers.
             number_overlap = jaccard(q_nums, c_nums)
         else:
             # Neither text contains a number: treated as full agreement.
             number_overlap = 1.0

         q_choice_sig = self._choice_signature(query_choices)
         c_choice_sig = self._choice_signature(cand_choices)
         q_choice_sig_nl = self._choice_signature_nolabels(query_choices)
         c_choice_sig_nl = self._choice_signature_nolabels(cand_choices)

         choice_match = 1.0 if q_choice_sig and q_choice_sig == c_choice_sig else 0.0
         choice_set_match = 1.0 if q_choice_sig_nl and q_choice_sig_nl == c_choice_sig_nl else 0.0
         exact_text = 1.0 if query_norm and query_norm == cand_norm else 0.0

         # Build the signatures from the values computed above instead of
         # canonicalising the stems and choices a second time.
         exact_signature = (
             1.0 if signature(query_norm, q_choice_sig) == signature(cand_norm, c_choice_sig) else 0.0
         )
         exact_signature_nolabels = (
             1.0 if signature(query_norm, q_choice_sig_nl) == signature(cand_norm, c_choice_sig_nl) else 0.0
         )

         # The weights sum to 1.0; the addition order is kept fixed so the
         # floating-point result is reproducible.
         score = (
             0.28 * exact_text
             + 0.18 * exact_signature
             + 0.08 * exact_signature_nolabels
             + 0.16 * choice_match
             + 0.08 * choice_set_match
             + 0.12 * token_overlap
             + 0.06 * shingle_overlap
             + 0.02 * number_overlap
             + 0.02 * seq
         )
         metrics = {
             "score": score,
             "token_overlap": token_overlap,
             "shingle_overlap": shingle_overlap,
             "sequence_ratio": seq,
             "number_overlap": number_overlap,
             "choice_match": choice_match,
             "choice_set_match": choice_set_match,
             "exact_text": exact_text,
             "exact_signature": exact_signature,
             "exact_signature_nolabels": exact_signature_nolabels,
         }
         return {name: round(value, 6) for name, value in metrics.items()}
+    def _confidence_label(self, metrics: Dict[str, float]) -> str:
+        score = metrics.get("score", 0.0)
+        if metrics.get("exact_signature", 0.0) >= 1.0 or metrics.get("exact_text", 0.0) >= 1.0:
+            return "exact"
+        if score >= 0.82:
+            return "high"
+        if score >= 0.70:
+            return "medium"
+        return "low"
     def get(
         self,
     ) -> Optional[Dict[str, Any]]:
         self._ensure_loaded()
         qid = str(question_id or "").strip()
+        raw_text = self._normalize(question_text)
+        canonical_text = self._canonicalize_text(question_text)
+        signature = self._question_signature(question_text, options_text)
+        signature_nolabels = self._question_signature_nolabels(question_text, options_text)
+        if qid and qid in self._by_id:
+            return self._clone_with_match(
+                self._by_id[qid],
+                {"mode": "question_id", "confidence": "exact", "score": 1.0},
+            )
         if signature and signature in self._by_signature:
+            return self._clone_with_match(
+                self._by_signature[signature],
+                {"mode": "signature", "confidence": "exact", "score": 0.995},
+            )
+        if signature_nolabels and signature_nolabels in self._by_signature_nolabels:
+            return self._clone_with_match(
+                self._by_signature_nolabels[signature_nolabels],
+                {"mode": "signature_nolabels", "confidence": "exact", "score": 0.99},
+            )
+        if raw_text and raw_text in self._by_text:
+            return self._clone_with_match(
+                self._by_text[raw_text],
+                {"mode": "question_text", "confidence": "exact", "score": 0.985},
+            )
+        if canonical_text and canonical_text in self._by_canonical_text:
+            return self._clone_with_match(
+                self._by_canonical_text[canonical_text],
+                {"mode": "canonical_text", "confidence": "exact", "score": 0.98},
+            )
+        if not canonical_text:
             return None
         best: Optional[Dict[str, Any]] = None
+        best_metrics: Optional[Dict[str, float]] = None
         for item in self._items:
+            metrics = self._score_candidate(
                 query_text=question_text or "",
                 query_choices=options_text,
                 candidate=item,
             )
+            if best_metrics is None or metrics["score"] > best_metrics["score"]:
                 best = item
+                best_metrics = metrics
+        if best is None or best_metrics is None:
+            return None
+        confidence = self._confidence_label(best_metrics)
+        score = best_metrics["score"]
+        accept = False
+        if confidence == "exact":
+            accept = True
+        elif score >= 0.82:
+            accept = True
+        elif best_metrics.get("choice_set_match", 0.0) >= 1.0 and best_metrics.get("token_overlap", 0.0) >= 0.55:
+            accept = True
+        elif best_metrics.get("shingle_overlap", 0.0) >= 0.72 and best_metrics.get("sequence_ratio", 0.0) >= 0.84:
+            accept = True
+        elif best_metrics.get("number_overlap", 0.0) >= 1.0 and best_metrics.get("token_overlap", 0.0) >= 0.68:
+            accept = True
+        if not accept:
+            return None
+        return self._clone_with_match(
+            best,
+            {
                 "mode": "fuzzy",
+                "confidence": confidence,
+                **best_metrics,
+            },
+        )
     def upsert(self, item: Dict[str, Any]) -> None:
         self._ensure_loaded()