j-js committed on
Commit
efac600
·
verified ·
1 Parent(s): 40b145a

Update question_support_loader.py

Browse files
Files changed (1) hide show
  1. question_support_loader.py +112 -9
question_support_loader.py CHANGED
@@ -1,8 +1,9 @@
1
  from __future__ import annotations
2
 
3
  import json
 
4
  from pathlib import Path
5
- from typing import Any, Dict, List, Optional
6
 
7
 
8
  class QuestionSupportBank:
@@ -12,13 +13,35 @@ class QuestionSupportBank:
12
  self._loaded = False
13
  self._by_id: Dict[str, Dict[str, Any]] = {}
14
  self._by_text: Dict[str, Dict[str, Any]] = {}
 
 
15
 
16
  def _normalize(self, text: Optional[str]) -> str:
17
- return " ".join((text or "").strip().lower().split())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def load(self) -> None:
20
  self._by_id = {}
21
  self._by_text = {}
 
 
22
 
23
  if self.data_path.exists():
24
  with self.data_path.open("r", encoding="utf-8") as handle:
@@ -41,14 +64,58 @@ class QuestionSupportBank:
41
  def _store_item(self, item: Dict[str, Any]) -> None:
42
  if not isinstance(item, dict):
43
  return
44
- qid = str(item.get("question_id") or "").strip()
45
- qtext = self._normalize(item.get("question_text") or item.get("stem") or "")
 
 
 
 
 
 
 
46
  if qid:
47
- self._by_id[qid] = item
48
  if qtext:
49
- self._by_text[qtext] = item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- def get(self, question_id: Optional[str] = None, question_text: Optional[str] = None) -> Optional[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  self._ensure_loaded()
53
  qid = str(question_id or "").strip()
54
  if qid and qid in self._by_id:
@@ -57,6 +124,42 @@ class QuestionSupportBank:
57
  qtext = self._normalize(question_text)
58
  if qtext and qtext in self._by_text:
59
  return dict(self._by_text[qtext])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  return None
61
 
62
  def upsert(self, item: Dict[str, Any]) -> None:
@@ -65,7 +168,7 @@ class QuestionSupportBank:
65
 
66
  def all_items(self) -> List[Dict[str, Any]]:
67
  self._ensure_loaded()
68
- return [dict(v) for v in self._by_id.values()]
69
 
70
 
71
- question_support_bank = QuestionSupportBank()
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ import re
5
  from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Tuple
7
 
8
 
9
  class QuestionSupportBank:
 
13
  self._loaded = False
14
  self._by_id: Dict[str, Dict[str, Any]] = {}
15
  self._by_text: Dict[str, Dict[str, Any]] = {}
16
+ self._by_signature: Dict[str, Dict[str, Any]] = {}
17
+ self._items: List[Dict[str, Any]] = []
18
 
19
  def _normalize(self, text: Optional[str]) -> str:
20
+ cleaned = (text or "").strip().lower()
21
+ cleaned = cleaned.replace("’", "'")
22
+ cleaned = re.sub(r"\s+", " ", cleaned)
23
+ return cleaned
24
+
25
+ def _tokenize(self, text: Optional[str]) -> List[str]:
26
+ return re.findall(r"[a-z0-9%/]+", self._normalize(text))
27
+
28
+ def _normalize_choice(self, value: Any) -> str:
29
+ return self._normalize(str(value) if value is not None else "")
30
+
31
+ def _choice_signature(self, choices: Optional[List[Any]]) -> str:
32
+ cleaned = [self._normalize_choice(choice) for choice in (choices or []) if self._normalize_choice(choice)]
33
+ return " || ".join(cleaned)
34
+
35
+ def _question_signature(self, question_text: Optional[str], choices: Optional[List[Any]] = None) -> str:
36
+ q = self._normalize(question_text)
37
+ c = self._choice_signature(choices)
38
+ return f"{q} ## {c}" if c else q
39
 
40
  def load(self) -> None:
41
  self._by_id = {}
42
  self._by_text = {}
43
+ self._by_signature = {}
44
+ self._items = []
45
 
46
  if self.data_path.exists():
47
  with self.data_path.open("r", encoding="utf-8") as handle:
 
64
  def _store_item(self, item: Dict[str, Any]) -> None:
65
  if not isinstance(item, dict):
66
  return
67
+
68
+ stored = dict(item)
69
+ qid = str(stored.get("question_id") or "").strip()
70
+ stem = stored.get("question_text") or stored.get("stem") or ""
71
+ choices = stored.get("options_text") or stored.get("choices") or []
72
+
73
+ qtext = self._normalize(stem)
74
+ signature = self._question_signature(stem, choices)
75
+
76
  if qid:
77
+ self._by_id[qid] = stored
78
  if qtext:
79
+ self._by_text[qtext] = stored
80
+ if signature:
81
+ self._by_signature[signature] = stored
82
+
83
+ self._items.append(stored)
84
+
85
+ def _score_candidate(
86
+ self,
87
+ *,
88
+ query_text: str,
89
+ query_choices: Optional[List[Any]],
90
+ candidate: Dict[str, Any],
91
+ ) -> Tuple[float, float, float]:
92
+ cand_text = candidate.get("question_text") or candidate.get("stem") or ""
93
+ cand_choices = candidate.get("options_text") or candidate.get("choices") or []
94
 
95
+ q_tokens = set(self._tokenize(query_text))
96
+ c_tokens = set(self._tokenize(cand_text))
97
+ if not q_tokens or not c_tokens:
98
+ token_overlap = 0.0
99
+ else:
100
+ token_overlap = len(q_tokens & c_tokens) / max(len(q_tokens | c_tokens), 1)
101
+
102
+ q_choice_sig = self._choice_signature(query_choices)
103
+ c_choice_sig = self._choice_signature(cand_choices)
104
+ if q_choice_sig and c_choice_sig:
105
+ choice_match = 1.0 if q_choice_sig == c_choice_sig else 0.0
106
+ else:
107
+ choice_match = 0.0
108
+
109
+ exact_text = 1.0 if self._normalize(query_text) == self._normalize(cand_text) else 0.0
110
+ score = (0.55 * token_overlap) + (0.35 * choice_match) + (0.10 * exact_text)
111
+ return score, token_overlap, choice_match
112
+
113
+ def get(
114
+ self,
115
+ question_id: Optional[str] = None,
116
+ question_text: Optional[str] = None,
117
+ options_text: Optional[List[Any]] = None,
118
+ ) -> Optional[Dict[str, Any]]:
119
  self._ensure_loaded()
120
  qid = str(question_id or "").strip()
121
  if qid and qid in self._by_id:
 
124
  qtext = self._normalize(question_text)
125
  if qtext and qtext in self._by_text:
126
  return dict(self._by_text[qtext])
127
+
128
+ signature = self._question_signature(question_text, options_text)
129
+ if signature and signature in self._by_signature:
130
+ return dict(self._by_signature[signature])
131
+
132
+ if not qtext:
133
+ return None
134
+
135
+ best: Optional[Dict[str, Any]] = None
136
+ best_score = 0.0
137
+ best_overlap = 0.0
138
+ best_choice = 0.0
139
+
140
+ for item in self._items:
141
+ score, token_overlap, choice_match = self._score_candidate(
142
+ query_text=question_text or "",
143
+ query_choices=options_text,
144
+ candidate=item,
145
+ )
146
+ if score > best_score:
147
+ best = item
148
+ best_score = score
149
+ best_overlap = token_overlap
150
+ best_choice = choice_match
151
+
152
+ threshold = 0.84 if options_text else 0.92
153
+ if best is not None and (best_score >= threshold or (best_choice >= 1.0 and best_overlap >= 0.55)):
154
+ out = dict(best)
155
+ out.setdefault("support_match", {})
156
+ out["support_match"] = {
157
+ "mode": "fuzzy",
158
+ "score": round(best_score, 4),
159
+ "token_overlap": round(best_overlap, 4),
160
+ "choice_match": round(best_choice, 4),
161
+ }
162
+ return out
163
  return None
164
 
165
  def upsert(self, item: Dict[str, Any]) -> None:
 
168
 
169
  def all_items(self) -> List[Dict[str, Any]]:
170
  self._ensure_loaded()
171
+ return [dict(v) for v in self._items]
172
 
173
 
174
+ question_support_bank = QuestionSupportBank()