Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

GameAI / question_support_loader.py

j-js

Update question_support_loader.py

19bee64 verified 12 days ago

raw

history blame contribute delete

8.54 kB

	from __future__ import annotations

	import json
	import re
	from difflib import SequenceMatcher
	from pathlib import Path
	from typing import Any, Dict, List, Optional


	class QuestionSupportBank:
	    """Load and retrieve authored question support entries with strong matching.

	    Entries are plain dicts read from a JSON Lines file (one JSON object per
	    line).  Each entry is indexed by its ``question_id``, by its normalized
	    stem (``question_text`` or ``stem``) and by stem-plus-choices signatures
	    (``options_text`` or ``choices``) in ordered and order-insensitive form.
	    ``get`` tries those exact indexes first and then falls back to a weighted
	    fuzzy comparison against every stored entry.

	    The file is read lazily: nothing touches disk until the first call to
	    ``get``, ``upsert`` or ``all_items`` (or an explicit ``load``).
	    """

	    # Typographic characters folded to their ASCII equivalents so that
	    # copy-pasted text compares equal to hand-typed text.
	    _CHAR_FOLD = str.maketrans({
	        "\u2018": "'",  # left single quotation mark
	        "\u2019": "'",  # right single quotation mark
	        "\u201c": '"',  # left double quotation mark
	        "\u201d": '"',  # right double quotation mark
	        "\u2212": "-",  # minus sign
	        "\u2013": "-",  # en dash
	        "\u2014": "-",  # em dash
	    })

	    def __init__(self, data_path: Optional[str] = None) -> None:
	        """Create an empty bank; the data file is not read yet.

	        Args:
	            data_path: Path of the JSONL file to read.  When omitted or
	                empty, ``data/question_support_bank.jsonl`` next to this
	                module is used.
	        """
	        base_dir = Path(__file__).resolve().parent
	        default_path = base_dir / "data" / "question_support_bank.jsonl"
	        self.data_path = Path(data_path) if data_path else default_path
	        self._loaded = False
	        # Number of non-blank lines the last ``load`` could not parse as JSON.
	        self._skipped_lines = 0
	        self._items: List[Dict[str, Any]] = []
	        self._by_id: Dict[str, Dict[str, Any]] = {}
	        self._by_text: Dict[str, Dict[str, Any]] = {}
	        self._by_signature: Dict[str, Dict[str, Any]] = {}
	        self._by_unordered_signature: Dict[str, Dict[str, Any]] = {}

	    def _normalize(self, text: Optional[str]) -> str:
	        """Return ``text`` in the canonical form used for every comparison.

	        Lower-cases, folds typographic quotes/dashes to ASCII, collapses
	        whitespace runs to one space and removes whitespace on BOTH sides of
	        ``= + - / : , ; ( )`` so that ``"x = 3"``, ``"x= 3"`` and ``"x=3"``
	        all normalize to the same string.  ``None`` yields ``""``; non-string
	        values are converted with ``str`` instead of raising.
	        """
	        if text is None:
	            raw = ""
	        elif isinstance(text, str):
	            raw = text
	        else:
	            raw = str(text)
	        cleaned = raw.strip().lower().translate(self._CHAR_FOLD)
	        cleaned = re.sub(r"\s+", " ", cleaned)
	        # ``\s*`` on both sides: the spacing around an operator must not
	        # matter regardless of which side it appears on.
	        cleaned = re.sub(r"\s*([=+\-/:,;()])\s*", r"\1", cleaned)
	        return cleaned.strip()

	    def _tokenize(self, text: Optional[str]) -> List[str]:
	        """Split normalized ``text`` into runs of ``a-z``, digits, ``%``, ``/`` and ``.``."""
	        return re.findall(r"[a-z0-9%/.]+", self._normalize(text))

	    def _normalize_choice(self, value: Any) -> str:
	        """Normalize one answer choice; ``None`` becomes the empty string."""
	        return self._normalize(str(value) if value is not None else "")

	    def _coerce_choices(self, choices: Optional[List[Any]]) -> List[str]:
	        """Return the normalized, non-empty choices in their original order.

	        A bare string (or any other non-iterable value) is treated as a
	        single choice rather than being iterated character by character.
	        """
	        if not choices:
	            return []
	        if isinstance(choices, str) or not hasattr(choices, "__iter__"):
	            choices = [choices]
	        normalized = (self._normalize_choice(choice) for choice in choices)
	        return [value for value in normalized if value]

	    def _choice_signature(self, choices: Optional[List[Any]], *, ordered: bool = True) -> str:
	        """Join the normalized choices with ``" || "``; sort them first when ``ordered`` is false."""
	        cleaned = self._coerce_choices(choices)
	        if not ordered:
	            cleaned = sorted(cleaned)
	        return " || ".join(cleaned)

	    def _question_signature(
	        self,
	        question_text: Optional[str],
	        choices: Optional[List[Any]] = None,
	        *,
	        ordered: bool = True,
	    ) -> str:
	        """Return ``"<stem> ## <choices>"``, or just the stem when there are no choices."""
	        q = self._normalize(question_text)
	        c = self._choice_signature(choices, ordered=ordered)
	        return f"{q} ## {c}" if c else q

	    def load(self) -> None:
	        """(Re)read the JSONL file, replacing everything currently stored.

	        A missing file yields an empty bank.  Blank lines are ignored and
	        lines that are not valid JSON are skipped on purpose (best effort);
	        the number skipped is kept in ``self._skipped_lines``.
	        """
	        self._items = []
	        self._by_id = {}
	        self._by_text = {}
	        self._by_signature = {}
	        self._by_unordered_signature = {}
	        self._skipped_lines = 0

	        if self.data_path.is_file():
	            # "utf-8-sig" reads plain UTF-8 too, but also strips a leading
	            # BOM that would otherwise make the first record unparsable.
	            with self.data_path.open("r", encoding="utf-8-sig") as handle:
	                for raw_line in handle:
	                    line = raw_line.strip()
	                    if not line:
	                        continue
	                    try:
	                        item = json.loads(line)
	                    except json.JSONDecodeError:
	                        self._skipped_lines += 1
	                        continue
	                    self._store_item(item)

	        self._loaded = True

	    def _ensure_loaded(self) -> None:
	        """Load the data file on first use."""
	        if not self._loaded:
	            self.load()

	    def _index_item(self, stored: Dict[str, Any]) -> None:
	        """Register ``stored`` in every lookup index; later entries overwrite equal keys."""
	        stem = stored.get("question_text") or stored.get("stem") or ""
	        choices = stored.get("options_text") or stored.get("choices") or []
	        qid = str(stored.get("question_id") or "").strip()
	        normalized_text = self._normalize(stem)
	        signature = self._question_signature(stem, choices, ordered=True)
	        unordered_signature = self._question_signature(stem, choices, ordered=False)

	        if qid:
	            self._by_id[qid] = stored
	        if normalized_text:
	            self._by_text[normalized_text] = stored
	        if signature:
	            self._by_signature[signature] = stored
	        if unordered_signature:
	            self._by_unordered_signature[unordered_signature] = stored

	    def _rebuild_indexes(self) -> None:
	        """Recreate all lookup indexes from ``self._items`` in list order."""
	        self._by_id = {}
	        self._by_text = {}
	        self._by_signature = {}
	        self._by_unordered_signature = {}
	        for existing in self._items:
	            self._index_item(existing)

	    def _store_item(self, item: Dict[str, Any]) -> None:
	        """Insert a shallow copy of ``item``, replacing any entry with the same id.

	        Non-dict values are ignored.  When an entry with the same non-empty
	        ``question_id`` already exists it is removed first, so the bank never
	        holds two entries for one id and no index keeps pointing at the
	        superseded entry.
	        """
	        if not isinstance(item, dict):
	            return
	        stored = dict(item)
	        qid = str(stored.get("question_id") or "").strip()
	        previous = self._by_id.get(qid) if qid else None

	        if previous is None:
	            self._items.append(stored)
	            self._index_item(stored)
	            return

	        # Drop the superseded entry (by identity) and append the new one at
	        # the end so that "most recently stored wins" still holds for
	        # entries that share a stem or signature.
	        self._items = [existing for existing in self._items if existing is not previous]
	        self._items.append(stored)
	        self._rebuild_indexes()

	    def _candidate_stats(
	        self,
	        *,
	        query_text: str,
	        query_choices: Optional[List[Any]],
	        candidate: Dict[str, Any],
	    ) -> Dict[str, float]:
	        """Score how well ``candidate`` matches the query.

	        Returns a dict with the individual signals and their weighted sum:

	        * ``text_exact`` -- 1.0 when the normalized stems are equal.
	        * ``text_ratio`` -- ``SequenceMatcher`` ratio of the normalized stems.
	        * ``token_overlap`` -- Jaccard overlap of the stem token sets.
	        * ``ordered_choice_match`` / ``unordered_choice_match`` -- 1.0 when
	          the choice signatures are non-empty and equal.
	        * ``score`` -- 0.30, 0.28, 0.22, 0.12 and 0.08 times the signals
	          above, in that order (the weights sum to 1.0).
	        """
	        cand_text = candidate.get("question_text") or candidate.get("stem") or ""
	        cand_choices = candidate.get("options_text") or candidate.get("choices") or []

	        norm_query = self._normalize(query_text)
	        norm_cand = self._normalize(cand_text)
	        text_exact = 1.0 if norm_query and norm_query == norm_cand else 0.0
	        text_ratio = (
	            SequenceMatcher(None, norm_query, norm_cand).ratio()
	            if norm_query and norm_cand
	            else 0.0
	        )

	        q_tokens = set(self._tokenize(query_text))
	        c_tokens = set(self._tokenize(cand_text))
	        if q_tokens and c_tokens:
	            token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1)
	        else:
	            token_overlap = 0.0

	        q_sig = self._choice_signature(query_choices, ordered=True)
	        c_sig = self._choice_signature(cand_choices, ordered=True)
	        q_unsig = self._choice_signature(query_choices, ordered=False)
	        c_unsig = self._choice_signature(cand_choices, ordered=False)
	        ordered_choice_match = 1.0 if q_sig and c_sig and q_sig == c_sig else 0.0
	        unordered_choice_match = 1.0 if q_unsig and c_unsig and q_unsig == c_unsig else 0.0

	        score = (
	            0.30 * text_exact
	            + 0.28 * text_ratio
	            + 0.22 * token_overlap
	            + 0.12 * ordered_choice_match
	            + 0.08 * unordered_choice_match
	        )
	        return {
	            "score": score,
	            "text_exact": text_exact,
	            "text_ratio": text_ratio,
	            "token_overlap": token_overlap,
	            "ordered_choice_match": ordered_choice_match,
	            "unordered_choice_match": unordered_choice_match,
	        }

	    def _annotate(
	        self,
	        item: Dict[str, Any],
	        *,
	        mode: str,
	        stats: Optional[Dict[str, float]] = None,
	    ) -> Dict[str, Any]:
	        """Return a shallow copy of ``item`` with a ``support_match`` entry.

	        ``support_match`` always carries ``mode``; when ``stats`` is given its
	        values are added, rounded to four decimals.
	        """
	        out = dict(item)
	        out["support_match"] = {"mode": mode}
	        if stats:
	            out["support_match"].update({k: round(v, 4) for k, v in stats.items()})
	        return out

	    def get(
	        self,
	        question_id: Optional[str] = None,
	        question_text: Optional[str] = None,
	        options_text: Optional[List[Any]] = None,
	    ) -> Optional[Dict[str, Any]]:
	        """Find the stored entry that best matches the given question.

	        Lookups are tried from most to least specific; the first hit wins and
	        is reported in ``result["support_match"]["mode"]``:

	        1. ``question_id``         -- exact id.
	        2. ``signature_exact``     -- same stem and same choices, same order.
	        3. ``signature_unordered`` -- same stem and same choices, any order.
	        4. ``text_exact``          -- same stem, choices ignored.
	        5. ``fuzzy``               -- best ``_candidate_stats`` score, if it
	           clears the threshold.

	        The choice-aware lookups run before the stem-only one so that, when
	        several entries share a stem, the supplied choices pick the right one.

	        Returns:
	            An annotated copy of the entry, or ``None`` when nothing matches.
	        """
	        self._ensure_loaded()
	        qid = str(question_id or "").strip()
	        if qid and qid in self._by_id:
	            return self._annotate(self._by_id[qid], mode="question_id")

	        signature = self._question_signature(question_text, options_text, ordered=True)
	        if signature and signature in self._by_signature:
	            return self._annotate(self._by_signature[signature], mode="signature_exact")

	        unordered_signature = self._question_signature(question_text, options_text, ordered=False)
	        if unordered_signature and unordered_signature in self._by_unordered_signature:
	            return self._annotate(
	                self._by_unordered_signature[unordered_signature],
	                mode="signature_unordered",
	            )

	        qtext = self._normalize(question_text)
	        if qtext and qtext in self._by_text:
	            return self._annotate(self._by_text[qtext], mode="text_exact")

	        if not qtext:
	            return None

	        # Fuzzy fallback: keep the first entry with the strictly highest score.
	        best_item: Optional[Dict[str, Any]] = None
	        best_stats: Optional[Dict[str, float]] = None
	        best_score = 0.0
	        for item in self._items:
	            stats = self._candidate_stats(
	                query_text=question_text or "",
	                query_choices=options_text,
	                candidate=item,
	            )
	            score = stats["score"]
	            if score > best_score:
	                best_item = item
	                best_stats = stats
	                best_score = score

	        if not best_item or not best_stats:
	            return None

	        # NOTE(review): with the weights in _candidate_stats an inexact stem
	        # can contribute at most 0.28 + 0.22 = 0.50, so the 0.82 / 0.74 / 0.68
	        # thresholds are unreachable unless the choices match as well.
	        # Confirm that "fuzzy only with matching choices" is intended.
	        strong_choice = (
	            best_stats["ordered_choice_match"] >= 1.0
	            or best_stats["unordered_choice_match"] >= 1.0
	        )
	        threshold = 0.70 if strong_choice else 0.82
	        if best_stats["text_exact"] >= 1.0:
	            threshold = min(threshold, 0.55)
	        elif best_stats["text_ratio"] >= 0.94:
	            threshold = min(threshold, 0.68)
	        elif best_stats["token_overlap"] >= 0.75:
	            threshold = min(threshold, 0.74)

	        if best_score >= threshold:
	            return self._annotate(best_item, mode="fuzzy", stats=best_stats)
	        return None

	    def upsert(self, item: Dict[str, Any]) -> None:
	        """Add ``item`` in memory, replacing any entry with the same ``question_id``.

	        The data file is not written.
	        """
	        self._ensure_loaded()
	        self._store_item(item)

	    def all_items(self) -> List[Dict[str, Any]]:
	        """Return shallow copies of all stored entries, in storage order."""
	        self._ensure_loaded()
	        return [dict(v) for v in self._items]


	# Shared module-level instance.  It is built without a data_path, so it uses
	# the default JSONL file under this module's "data" directory; the constructor
	# does not read the file, which is loaded on the first get/upsert/all_items call.
	question_support_bank = QuestionSupportBank()