GameAI / question_support_loader.py
j-js's picture
Update question_support_loader.py
19bee64 verified
from __future__ import annotations
import json
import re
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Optional
class QuestionSupportBank:
    """Load and retrieve authored question support entries with strong matching.

    Entries are dicts read from a JSON Lines file (one JSON object per line).
    The keys this class reads from an entry are ``question_id``,
    ``question_text`` (falling back to ``stem``) and ``options_text``
    (falling back to ``choices``); every other key is carried through
    untouched.

    Lookups try progressively looser strategies: question id, exact ordered
    signature (stem + choices), unordered signature, exact stem, and finally
    a scored fuzzy scan over all entries.
    """

    # Typographic characters folded to their ASCII equivalents so that curly
    # quotes and the various dash/minus glyphs compare equal to plain ones.
    _CHAR_MAP = str.maketrans(
        {
            "\u2019": "'",
            "\u2018": "'",
            "\u201c": '"',
            "\u201d": '"',
            "\u2212": "-",
            "\u2013": "-",
            "\u2014": "-",
        }
    )
    _WHITESPACE_RE = re.compile(r"\s+")
    _OPERATOR_RE = re.compile(r"\s*([=+\-*/:,;()])\s*")
    _TOKEN_RE = re.compile(r"[a-z0-9%/.]+")

    def __init__(self, data_path: Optional[str] = None) -> None:
        """Configure the bank; the data file is not read until first use.

        Args:
            data_path: Path to the JSONL file. Defaults to
                ``data/question_support_bank.jsonl`` next to this module.
        """
        base_dir = Path(__file__).resolve().parent
        self.data_path = Path(data_path) if data_path else base_dir / "data" / "question_support_bank.jsonl"
        self._loaded = False
        self._items: List[Dict[str, Any]] = []
        self._by_id: Dict[str, Dict[str, Any]] = {}
        self._by_text: Dict[str, Dict[str, Any]] = {}
        self._by_signature: Dict[str, Dict[str, Any]] = {}
        self._by_unordered_signature: Dict[str, Dict[str, Any]] = {}

    # ------------------------------------------------------------------
    # Normalisation helpers
    # ------------------------------------------------------------------
    def _normalize(self, text: Optional[str]) -> str:
        """Return *text* lower-cased with quotes, dashes and spacing canonicalised.

        Whitespace runs collapse to one space and whitespace around the
        characters ``= + - * / : , ; ( )`` is removed, so ``"2 + 2"`` and
        ``"2+2"`` normalise identically. ``None`` yields ``""``; non-string
        values are converted with ``str()`` instead of raising.
        """
        if not isinstance(text, str):
            text = "" if text is None else str(text)
        cleaned = text.strip().lower().translate(self._CHAR_MAP)
        cleaned = self._WHITESPACE_RE.sub(" ", cleaned)
        cleaned = self._OPERATOR_RE.sub(r"\1", cleaned)
        return cleaned.strip()

    def _tokenize(self, text: Optional[str]) -> List[str]:
        """Split normalised *text* into runs of ``[a-z0-9%/.]`` characters."""
        return self._TOKEN_RE.findall(self._normalize(text))

    def _normalize_choice(self, value: Any) -> str:
        """Normalise a single answer choice of any type (``None`` -> ``""``)."""
        return self._normalize(str(value) if value is not None else "")

    def _coerce_choices(self, choices: Optional[List[Any]]) -> List[str]:
        """Return the normalised, non-empty choices in their original order.

        A bare string (or any non-iterable scalar) is treated as one choice
        rather than being iterated character by character.
        """
        if not choices:
            return []
        if isinstance(choices, str) or not hasattr(choices, "__iter__"):
            choices = [choices]
        normalized = (self._normalize_choice(choice) for choice in choices)
        return [choice for choice in normalized if choice]

    def _choice_signature(self, choices: Optional[List[Any]], *, ordered: bool = True) -> str:
        """Join the normalised choices with ``" || "``; sort first if not *ordered*."""
        cleaned = self._coerce_choices(choices)
        if not ordered:
            cleaned = sorted(cleaned)
        return " || ".join(cleaned)

    def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None, *, ordered: bool = True) -> str:
        """Return ``"<stem> ## <choices>"``, or just the stem when there are no choices."""
        q = self._normalize(question_text)
        c = self._choice_signature(choices, ordered=ordered)
        return f"{q} ## {c}" if c else q

    # ------------------------------------------------------------------
    # Loading and indexing
    # ------------------------------------------------------------------
    def load(self) -> None:
        """(Re)load every entry from ``data_path``, discarding current state.

        Blank lines and lines that are not valid JSON are skipped on purpose
        so one bad line does not prevent the rest of the bank from loading.
        A missing file results in an empty, but loaded, bank.
        """
        self._items = []
        self._by_id = {}
        self._by_text = {}
        self._by_signature = {}
        self._by_unordered_signature = {}
        if self.data_path.exists():
            with self.data_path.open("r", encoding="utf-8") as handle:
                for raw_line in handle:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    self._store_item(item)
        self._loaded = True

    def _ensure_loaded(self) -> None:
        """Load the data file on first use."""
        if not self._loaded:
            self.load()

    @staticmethod
    def _stem_and_choices(item: Dict[str, Any]) -> tuple:
        """Return ``(stem, choices)`` for *item*, honouring the fallback keys."""
        stem = item.get("question_text") or item.get("stem") or ""
        choices = item.get("options_text") or item.get("choices") or []
        return stem, choices

    def _index_item(self, stored: Dict[str, Any]) -> None:
        """Register *stored* in every lookup index; later entries win on key clashes."""
        stem, choices = self._stem_and_choices(stored)
        qid = str(stored.get("question_id") or "").strip()
        normalized_text = self._normalize(stem)
        signature = self._question_signature(stem, choices, ordered=True)
        unordered_signature = self._question_signature(stem, choices, ordered=False)
        if qid:
            self._by_id[qid] = stored
        if normalized_text:
            self._by_text[normalized_text] = stored
        if signature:
            self._by_signature[signature] = stored
        if unordered_signature:
            self._by_unordered_signature[unordered_signature] = stored

    def _rebuild_indexes(self) -> None:
        """Recreate all lookup indexes from ``_items`` in storage order."""
        self._by_id = {}
        self._by_text = {}
        self._by_signature = {}
        self._by_unordered_signature = {}
        for entry in self._items:
            self._index_item(entry)

    def _store_item(self, item: Dict[str, Any]) -> None:
        """Store a shallow copy of *item*; non-dict values are ignored.

        If an entry with the same non-empty ``question_id`` already exists it
        is replaced rather than duplicated, and the indexes are rebuilt so no
        text or signature key keeps pointing at the superseded entry.
        """
        if not isinstance(item, dict):
            return
        stored = dict(item)
        qid = str(stored.get("question_id") or "").strip()
        previous = self._by_id.get(qid) if qid else None
        if previous is not None:
            # The replacement goes last so it takes precedence over any other
            # entry sharing a text/signature key, as a fresh insert would.
            self._items = [entry for entry in self._items if entry is not previous]
            self._items.append(stored)
            self._rebuild_indexes()
            return
        self._items.append(stored)
        self._index_item(stored)

    # ------------------------------------------------------------------
    # Matching
    # ------------------------------------------------------------------
    def _candidate_stats(self, *, query_text: str, query_choices: Optional[List[Any]], candidate: Dict[str, Any]) -> Dict[str, float]:
        """Score how well *candidate* matches the query.

        Returns a dict with the individual signals and their weighted sum
        under ``"score"``:

        * ``text_exact`` - 1.0 when the normalised stems are equal.
        * ``text_ratio`` - ``SequenceMatcher`` ratio of the normalised stems.
        * ``token_overlap`` - Jaccard overlap of the stem token sets.
        * ``ordered_choice_match`` / ``unordered_choice_match`` - 1.0 when
          the (ordered / sorted) choice signatures are non-empty and equal.
        """
        cand_text, cand_choices = self._stem_and_choices(candidate)
        norm_query = self._normalize(query_text)
        norm_cand = self._normalize(cand_text)
        text_exact = 1.0 if norm_query and norm_query == norm_cand else 0.0
        text_ratio = SequenceMatcher(None, norm_query, norm_cand).ratio() if norm_query and norm_cand else 0.0
        q_tokens = set(self._tokenize(query_text))
        c_tokens = set(self._tokenize(cand_text))
        token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1) if q_tokens and c_tokens else 0.0
        q_sig = self._choice_signature(query_choices, ordered=True)
        c_sig = self._choice_signature(cand_choices, ordered=True)
        q_unsig = self._choice_signature(query_choices, ordered=False)
        c_unsig = self._choice_signature(cand_choices, ordered=False)
        ordered_choice_match = 1.0 if q_sig and c_sig and q_sig == c_sig else 0.0
        unordered_choice_match = 1.0 if q_unsig and c_unsig and q_unsig == c_unsig else 0.0
        score = (
            0.30 * text_exact
            + 0.28 * text_ratio
            + 0.22 * token_overlap
            + 0.12 * ordered_choice_match
            + 0.08 * unordered_choice_match
        )
        return {
            "score": score,
            "text_exact": text_exact,
            "text_ratio": text_ratio,
            "token_overlap": token_overlap,
            "ordered_choice_match": ordered_choice_match,
            "unordered_choice_match": unordered_choice_match,
        }

    def _annotate(self, item: Dict[str, Any], *, mode: str, stats: Optional[Dict[str, float]] = None) -> Dict[str, Any]:
        """Return a copy of *item* with a ``support_match`` dict describing the hit.

        ``support_match`` always holds ``mode``; when *stats* is given its
        values are added, rounded to four decimal places.
        """
        out = dict(item)
        out["support_match"] = {"mode": mode}
        if stats:
            out["support_match"].update({k: round(v, 4) for k, v in stats.items()})
        return out

    def get(self, question_id: Optional[str] = None, question_text: Optional[str] = None, options_text: Optional[List[Any]] = None) -> Optional[Dict[str, Any]]:
        """Find the support entry for a question, or ``None`` if nothing matches.

        Strategies are tried from most to least specific; the one that hit is
        reported as ``support_match["mode"]`` on the returned copy:

        1. ``question_id`` - exact id.
        2. ``signature_exact`` - same stem and same choices in the same order.
        3. ``signature_unordered`` - same stem and same choices in any order.
        4. ``text_exact`` - same stem, choices ignored.
        5. ``fuzzy`` - best-scoring entry, if it clears the threshold.

        Args:
            question_id: Identifier to look up first, if known.
            question_text: The question stem.
            options_text: The answer choices shown with the question.

        Returns:
            A shallow copy of the matched entry with ``support_match`` added,
            or ``None``.
        """
        self._ensure_loaded()
        qid = str(question_id or "").strip()
        if qid and qid in self._by_id:
            return self._annotate(self._by_id[qid], mode="question_id")
        signature = self._question_signature(question_text, options_text, ordered=True)
        if signature and signature in self._by_signature:
            return self._annotate(self._by_signature[signature], mode="signature_exact")
        # The unordered signature must be consulted before the text-only
        # index: it also pins the choices, so it tells apart questions that
        # share a stem. The text index only knows the most recently stored
        # entry for a stem.
        unordered_signature = self._question_signature(question_text, options_text, ordered=False)
        if unordered_signature and unordered_signature in self._by_unordered_signature:
            return self._annotate(self._by_unordered_signature[unordered_signature], mode="signature_unordered")
        qtext = self._normalize(question_text)
        if qtext and qtext in self._by_text:
            return self._annotate(self._by_text[qtext], mode="text_exact")
        if not qtext:
            return None

        # Fuzzy fallback: keep the first entry with the strictly highest score.
        best_item: Optional[Dict[str, Any]] = None
        best_stats: Optional[Dict[str, float]] = None
        best_score = 0.0
        for item in self._items:
            stats = self._candidate_stats(query_text=question_text or "", query_choices=options_text, candidate=item)
            score = stats["score"]
            if score > best_score:
                best_item = item
                best_stats = stats
                best_score = score
        if not best_item or not best_stats:
            return None

        # A choice match lowers the bar; very close text lowers it further.
        # NOTE: stems that match exactly are normally returned by the indexed
        # lookups above, so the text_exact branch is a safety net here.
        strong_choice = best_stats["ordered_choice_match"] >= 1.0 or best_stats["unordered_choice_match"] >= 1.0
        threshold = 0.70 if strong_choice else 0.82
        if best_stats["text_exact"] >= 1.0:
            threshold = min(threshold, 0.55)
        elif best_stats["text_ratio"] >= 0.94:
            threshold = min(threshold, 0.68)
        elif best_stats["token_overlap"] >= 0.75:
            threshold = min(threshold, 0.74)
        if best_score >= threshold:
            return self._annotate(best_item, mode="fuzzy", stats=best_stats)
        return None

    def upsert(self, item: Dict[str, Any]) -> None:
        """Insert *item*, replacing any stored entry with the same ``question_id``.

        The change is in-memory only; nothing is written to ``data_path``.
        """
        self._ensure_loaded()
        self._store_item(item)

    def all_items(self) -> List[Dict[str, Any]]:
        """Return shallow copies of all stored entries in storage order."""
        self._ensure_loaded()
        return [dict(v) for v in self._items]
# Shared module-level bank using the default data path. Constructing it does
# not read the data file; loading happens on the first get/upsert/all_items call.
question_support_bank = QuestionSupportBank()