Spaces:

j-js
/

GameAI

Sleeping

File size: 8,535 Bytes

ae81756
 
d68661a
efac600
ae81756
d68661a
ae81756
d68661a
 
 
ae81756
 
280a1a8
 
 
d68661a
ae81756
280a1a8
 
efac600
ae81756
280a1a8
 
efac600
ae81756
 
efac600
19bee64
ae81756
efac600
 
ae81756
efac600
 
bcf6874
efac600
ae81756
 
 
 
 
 
 
 
 
 
 
 
 
 
efac600
 
ae81756
bcf6874
ae81756
efac600
d68661a
 
ae81756
d68661a
280a1a8
efac600
ae81756
280a1a8
 
 
 
 
 
 
 
 
 
 
 
d68661a
 
 
bcf6874
 
 
 
280a1a8
 
 
efac600
 
 
ae81756
 
 
 
efac600
280a1a8
efac600
ae81756
 
efac600
 
ae81756
 
efac600
 
ae81756
efac600
 
280a1a8
ae81756
 
 
 
 
bcf6874
 
ae81756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280a1a8
 
076a07d
ae81756
 
 
 
 
efac600
bcf6874
 
ae81756
076a07d
ae81756
 
 
076a07d
bcf6874
efac600
 
ae81756
 
bcf6874
efac600
ae81756
 
bcf6874
ae81756
 
bcf6874
ae81756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcf6874
280a1a8
 
 
 
 
 
 
efac600
280a1a8
 
ae81756

from __future__ import annotations

import json
import re
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Optional


class QuestionSupportBank:
    """Load and retrieve authored question support entries with strong matching.

    Entries are plain dicts read from a JSON Lines file (one JSON object per
    line).  The question stem is taken from the ``question_text`` key (falling
    back to ``stem``), the answer choices from ``options_text`` (falling back
    to ``choices``) and the identifier from ``question_id``.

    Lookups go through a series of exact indexes (id, stem + ordered choices,
    stem + unordered choices, stem only) before falling back to a weighted
    fuzzy comparison against every stored entry.
    """

    # Typographic characters folded to their ASCII equivalents before comparing.
    _CHAR_FOLD = str.maketrans(
        {
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark
            "\u201c": '"',  # left double quotation mark
            "\u201d": '"',  # right double quotation mark
            "\u2212": "-",  # minus sign
            "\u2013": "-",  # en dash
        }
    )
    _WHITESPACE_RE = re.compile(r"\s+")
    _PUNCT_PADDING_RE = re.compile(r"\s*([=+\-*/:,;()])\s*")
    _TOKEN_RE = re.compile(r"[a-z0-9%/.]+")

    def __init__(self, data_path: Optional[str] = None) -> None:
        """Create an empty, not-yet-loaded bank.

        Args:
            data_path: Path of the JSON Lines file.  When omitted, the file
                ``data/question_support_bank.jsonl`` next to this module is
                used.  The file is not read here; see :meth:`load`.
        """
        base_dir = Path(__file__).resolve().parent
        self.data_path = Path(data_path) if data_path else base_dir / "data" / "question_support_bank.jsonl"
        self._loaded = False
        self._items: List[Dict[str, Any]] = []
        self._by_id: Dict[str, Dict[str, Any]] = {}
        self._by_text: Dict[str, Dict[str, Any]] = {}
        self._by_signature: Dict[str, Dict[str, Any]] = {}
        self._by_unordered_signature: Dict[str, Dict[str, Any]] = {}

    def _normalize(self, text: Optional[str]) -> str:
        """Return a canonical, comparison-friendly form of ``text``.

        The text is stripped, lower-cased, has typographic quotes/dashes folded
        to ASCII, has whitespace runs collapsed to a single space and has the
        whitespace around ``= + - * / : , ; ( )`` removed.  Falsy input yields
        ``""``; a truthy non-string value is converted with ``str()`` so one
        odd record cannot raise here.
        """
        raw: Any = text or ""
        if not isinstance(raw, str):
            raw = str(raw)
        cleaned = raw.strip().lower().translate(self._CHAR_FOLD)
        cleaned = self._WHITESPACE_RE.sub(" ", cleaned)
        cleaned = self._PUNCT_PADDING_RE.sub(r"\1", cleaned)
        return cleaned.strip()

    def _tokenize(self, text: Optional[str]) -> List[str]:
        """Split the normalized text into tokens made of ``a-z 0-9 % / .``."""
        return self._TOKEN_RE.findall(self._normalize(text))

    def _normalize_choice(self, value: Any) -> str:
        """Normalize one answer choice; ``None`` becomes the empty string."""
        return self._normalize(str(value) if value is not None else "")

    def _coerce_choices(self, choices: Optional[List[Any]]) -> List[str]:
        """Return the normalized choices, dropping those that normalize to ``""``."""
        if not choices:
            return []
        normalized = (self._normalize_choice(choice) for choice in choices)
        return [choice for choice in normalized if choice]

    def _choice_signature(self, choices: Optional[List[Any]], *, ordered: bool = True) -> str:
        """Join the normalized choices with ``" || "``.

        Args:
            choices: Raw answer choices.
            ordered: Keep the given order when true; sort the normalized
                choices first when false, so the result ignores ordering.
        """
        cleaned = self._coerce_choices(choices)
        if not ordered:
            cleaned = sorted(cleaned)
        return " || ".join(cleaned)

    def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None, *, ordered: bool = True) -> str:
        """Return ``"<stem> ## <choices>"``, or just the stem when there are no choices."""
        q = self._normalize(question_text)
        c = self._choice_signature(choices, ordered=ordered)
        return f"{q} ## {c}" if c else q

    def _reset_indexes(self) -> None:
        """Empty every lookup index (the item list itself is left alone)."""
        self._by_id = {}
        self._by_text = {}
        self._by_signature = {}
        self._by_unordered_signature = {}

    def load(self) -> None:
        """(Re)load all entries from ``self.data_path``.

        Any previously held entries are discarded.  A missing file yields an
        empty bank.  Blank lines and lines that are not valid JSON are
        skipped so that one bad row does not prevent the rest from loading.
        """
        self._items = []
        self._reset_indexes()

        if self.data_path.exists():
            with self.data_path.open("r", encoding="utf-8") as handle:
                for raw_line in handle:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                    except json.JSONDecodeError:
                        # Best effort: ignore the malformed row and keep going.
                        continue
                    self._store_item(item)

        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the data file on first use."""
        if not self._loaded:
            self.load()

    def _index_item(self, stored: Dict[str, Any]) -> None:
        """Register ``stored`` in every index it has a non-empty key for.

        A later entry with the same key overwrites an earlier one.
        """
        stem = stored.get("question_text") or stored.get("stem") or ""
        choices = stored.get("options_text") or stored.get("choices") or []
        qid = str(stored.get("question_id") or "").strip()
        normalized_text = self._normalize(stem)
        signature = self._question_signature(stem, choices, ordered=True)
        unordered_signature = self._question_signature(stem, choices, ordered=False)

        if qid:
            self._by_id[qid] = stored
        if normalized_text:
            self._by_text[normalized_text] = stored
        if signature:
            self._by_signature[signature] = stored
        if unordered_signature:
            self._by_unordered_signature[unordered_signature] = stored

    def _store_item(self, item: Dict[str, Any]) -> None:
        """Store a copy of ``item``, replacing any entry with the same id.

        Non-dict values are ignored.  When ``question_id`` is already known,
        the old entry is removed from the item list and the indexes are
        rebuilt, so no stale text/signature keys keep pointing at it.
        Entries without an id are always appended.
        """
        if not isinstance(item, dict):
            return
        stored = dict(item)
        qid = str(stored.get("question_id") or "").strip()
        previous = self._by_id.get(qid) if qid else None

        if previous is None:
            self._items.append(stored)
            self._index_item(stored)
            return

        # Replace: drop the old entry (by identity) and append the new one so
        # it wins index ties exactly like any other most-recently-stored item.
        self._items = [existing for existing in self._items if existing is not previous]
        self._items.append(stored)
        self._reset_indexes()
        for existing in self._items:
            self._index_item(existing)

    def _candidate_stats(self, *, query_text: str, query_choices: Optional[List[Any]], candidate: Dict[str, Any]) -> Dict[str, float]:
        """Score how well ``candidate`` matches the query.

        Returns:
            A dict with the individual signals (each in ``[0, 1]``) and their
            weighted sum under ``"score"``:

            * ``text_exact`` - normalized stems are identical (weight 0.30)
            * ``text_ratio`` - ``SequenceMatcher`` ratio of the stems (0.28)
            * ``token_overlap`` - Jaccard overlap of the token sets (0.22)
            * ``ordered_choice_match`` - identical ordered choices (0.12)
            * ``unordered_choice_match`` - identical choice sets (0.08)
        """
        cand_text = candidate.get("question_text") or candidate.get("stem") or ""
        cand_choices = candidate.get("options_text") or candidate.get("choices") or []

        norm_query = self._normalize(query_text)
        norm_cand = self._normalize(cand_text)
        text_exact = 1.0 if norm_query and norm_query == norm_cand else 0.0
        text_ratio = SequenceMatcher(None, norm_query, norm_cand).ratio() if norm_query and norm_cand else 0.0

        q_tokens = set(self._tokenize(query_text))
        c_tokens = set(self._tokenize(cand_text))
        token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1) if q_tokens and c_tokens else 0.0

        q_sig = self._choice_signature(query_choices, ordered=True)
        c_sig = self._choice_signature(cand_choices, ordered=True)
        q_unsig = self._choice_signature(query_choices, ordered=False)
        c_unsig = self._choice_signature(cand_choices, ordered=False)
        # Empty signatures never count as a match.
        ordered_choice_match = 1.0 if q_sig and c_sig and q_sig == c_sig else 0.0
        unordered_choice_match = 1.0 if q_unsig and c_unsig and q_unsig == c_unsig else 0.0

        score = (
            0.30 * text_exact
            + 0.28 * text_ratio
            + 0.22 * token_overlap
            + 0.12 * ordered_choice_match
            + 0.08 * unordered_choice_match
        )
        return {
            "score": score,
            "text_exact": text_exact,
            "text_ratio": text_ratio,
            "token_overlap": token_overlap,
            "ordered_choice_match": ordered_choice_match,
            "unordered_choice_match": unordered_choice_match,
        }

    def _annotate(self, item: Dict[str, Any], *, mode: str, stats: Optional[Dict[str, float]] = None) -> Dict[str, Any]:
        """Return a shallow copy of ``item`` with a ``support_match`` entry.

        ``support_match`` holds the match ``mode`` and, when ``stats`` is
        given, every statistic rounded to four decimals.
        """
        out = dict(item)
        out["support_match"] = {"mode": mode}
        if stats:
            out["support_match"].update({k: round(v, 4) for k, v in stats.items()})
        return out

    def get(self, question_id: Optional[str] = None, question_text: Optional[str] = None, options_text: Optional[List[Any]] = None) -> Optional[Dict[str, Any]]:
        """Find the stored entry that best matches the given question.

        Strategies are tried from most to least specific; the first hit wins
        and its name is reported in ``result["support_match"]["mode"]``:

        1. ``question_id`` - exact id.
        2. ``signature_exact`` - same stem and same choices in the same order.
        3. ``signature_unordered`` - same stem and same set of choices.
        4. ``text_exact`` - same stem, choices ignored.
        5. ``fuzzy`` - highest :meth:`_candidate_stats` score, accepted only
           above a threshold that depends on how strong the signals are.

        The choice-aware lookups run before the stem-only lookup so that
        entries sharing a stem are told apart by their choices.

        Returns:
            An annotated copy of the matching entry, or ``None`` if nothing
            matches well enough.
        """
        self._ensure_loaded()
        qid = str(question_id or "").strip()
        if qid and qid in self._by_id:
            return self._annotate(self._by_id[qid], mode="question_id")

        signature = self._question_signature(question_text, options_text, ordered=True)
        if signature and signature in self._by_signature:
            return self._annotate(self._by_signature[signature], mode="signature_exact")

        unordered_signature = self._question_signature(question_text, options_text, ordered=False)
        if unordered_signature and unordered_signature in self._by_unordered_signature:
            return self._annotate(self._by_unordered_signature[unordered_signature], mode="signature_unordered")

        qtext = self._normalize(question_text)
        if qtext and qtext in self._by_text:
            return self._annotate(self._by_text[qtext], mode="text_exact")

        if not qtext:
            return None

        best_item: Optional[Dict[str, Any]] = None
        best_stats: Optional[Dict[str, float]] = None
        best_score = 0.0
        for item in self._items:
            stats = self._candidate_stats(query_text=question_text or "", query_choices=options_text, candidate=item)
            score = stats["score"]
            # Strict comparison: on ties the earliest stored entry is kept.
            if score > best_score:
                best_item = item
                best_stats = stats
                best_score = score

        if not best_item or not best_stats:
            return None

        strong_choice = best_stats["ordered_choice_match"] >= 1.0 or best_stats["unordered_choice_match"] >= 1.0
        threshold = 0.70 if strong_choice else 0.82
        if best_stats["text_exact"] >= 1.0:
            # Identical stems are normally caught by the stem index above;
            # this relaxation is kept as a safeguard.
            threshold = min(threshold, 0.55)
        elif best_stats["text_ratio"] >= 0.94:
            threshold = min(threshold, 0.68)
        elif best_stats["token_overlap"] >= 0.75:
            threshold = min(threshold, 0.74)

        if best_score >= threshold:
            return self._annotate(best_item, mode="fuzzy", stats=best_stats)
        return None

    def upsert(self, item: Dict[str, Any]) -> None:
        """Insert ``item`` in memory, replacing any entry with the same ``question_id``.

        Nothing is written back to the data file.
        """
        self._ensure_loaded()
        self._store_item(item)

    def all_items(self) -> List[Dict[str, Any]]:
        """Return shallow copies of all stored entries in storage order."""
        self._ensure_loaded()
        return [dict(v) for v in self._items]


# Shared module-level instance using the default data path. Constructing it
# does not read the data file; entries are loaded on the first call to
# get(), upsert() or all_items().
question_support_bank = QuestionSupportBank()