linhnguyen02 committed on
Commit
e7e6099
·
1 Parent(s): 8e72e5b

eval question

Browse files
env.py CHANGED
@@ -28,5 +28,60 @@ config = {
28
  "elastic": {
29
  "url": os.getenv("ELASTIC_URL"),
30
  "api_key": os.getenv("ELASTIC_API_KEY")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
 
32
  }
 
28
  "elastic": {
29
  "url": os.getenv("ELASTIC_URL"),
30
  "api_key": os.getenv("ELASTIC_API_KEY")
31
+ },
32
+ "evalution" : {
33
+ "weights": {
34
+ "structure": os.getenv("WEIGHT_STRUCTURE") | 0.2,
35
+ "popularity": os.getenv("WEIGHT_POPULARITY") | 0.2,
36
+ "distractor": os.getenv("WEIGHT_DISTRACTOR") | 0.4,
37
+ "ai_adjust_factor": os.getenv("WEIGHT_AI_ADJUST_FACTOR") | 0.8
38
+ },
39
+ "penalty_for_error" : {
40
+ "structure" : {
41
+ "missing_question_text": os.getenv("PENALTY_MISSING_QUESTION_TEXT") | 0.4,
42
+ "missing_choice": os.getenv("PENALTY_MISSING_CHOICE") | 0.2,
43
+ "no_correct_answer": os.getenv("PENALTY_NO_CORRECT_ANSWER") | 0.4,
44
+ "empty_choice": os.getenv("PENALTY_EMPTY_CHOICE") | 0.1,
45
+ "duplicated_choices": os.getenv("PENALTY_DUPLICATED_CHOICES") | 0.1,
46
+ "grammar_error": os.getenv("PENALTY_GRAMMAR_ERROR") | 0.05
47
+ }
48
+ },
49
+ "distractor": {
50
+ "empty_choice_deduction": os.getenv("DISTRACTOR_EMPTY_CHOICE_DEDUCTION") | 0.05, # trong _check_pos_and_meaning_of_choice
51
+ "embedding_similarity_thresholds": {
52
+ "too_different": os.getenv("DISTRACTOR_EMBEDDING_SIMILARITY_TOO_DIFFERENT") |0.35,
53
+ "moderate": os.getenv("DISTRACTOR_EMBEDDING_SIMILARITY_MODERATE") |0.45,
54
+ "good": os.getenv("DISTRACTOR_EMBEDDING_SIMILARITY_GOOD") |0.6,
55
+ "strong": os.getenv("DISTRACTOR_EMBEDDING_SIMILARITY_STRONG") |0.7
56
+ },
57
+ "paragraph": {
58
+ "length_weight": os.getenv("DISTRACTOR_PARAGRAPH_LENGTH_WEIGHT") |0.1,
59
+ "difficulty_weight": os.getenv("DISTRACTOR_PARAGRAPH_DIFFICULTY_WEIGHT") |0.9,
60
+ "vocab_length_thresholds": os.getenv("DISTRACTOR_PARAGRAPH_VOCAB_LENGTH_THRESHOLDS") |[50, 100, 200, 300], # tương ứng score 0.2 → 0.5
61
+ "other_length_thresholds": os.getenv("DISTRACTOR_PARAGRAPH_OTHER_LENGTH_THRESHOLDS") |[50, 100, 200, 300], # tương ứng 0.3 → 1.0
62
+ "direct_match_sim": os.getenv("DISTRACTOR_PARAGRAPH_DIRECT_MATCH_SIM") |0.85,
63
+ "paraphrase_sim": os.getenv("DISTRACTOR_PARAGRAPH_PARAPHRASE_SIM") |0.5,
64
+ "difficulty_levels": os.getenv("DISTRACTOR_PARAGRAPH_DIFFICULTY_LEVELS") |[1, 3, 5]
65
+ },
66
+ "lexical_family": {
67
+ "thresholds": {
68
+ "high_lemma": os.getenv("DISTRACTOR_LEXICAL_FAMILY_HIGH_LEMMA") |0.9,
69
+ "high_pos": os.getenv("DISTRACTOR_LEXICAL_FAMILY_HIGH_POS") |0.9,
70
+ "medium_high_pos": os.getenv("DISTRACTOR_LEXICAL_FAMILY_MEDIUM_HIGH_POS") |0.6,
71
+ "medium_lemma": os.getenv("DISTRACTOR_LEXICAL_FAMILY_MEDIUM_LEMMA") |0.7,
72
+ "medium_both": os.getenv("DISTRACTOR_LEXICAL_FAMILY_MEDIUM_BOTH") |0.4,
73
+ "low": os.getenv("DISTRACTOR_LEXICAL_FAMILY_LOW") |0.3
74
+ },
75
+ "scores": {
76
+ "high_lemma": os.getenv("DISTRACTOR_LEXICAL_FAMILY_HIGH_LEMMA_SCORE") |0.75,
77
+ "high_pos": os.getenv("DISTRACTOR_LEXICAL_FAMILY_HIGH_POS_SCORE") |0.9,
78
+ "medium_high_pos": os.getenv("DISTRACTOR_LEXICAL_FAMILY_MEDIUM_HIGH_POS_SCORE") |0.7,
79
+ "medium_lemma": os.getenv("DISTRACTOR_LEXICAL_FAMILY_MEDIUM_LEMMA_SCORE") |0.6,
80
+ "medium_both": os.getenv("DISTRACTOR_LEXICAL_FAMILY_MEDIUM_BOTH_SCORE") |0.45,
81
+ "low": os.getenv("DISTRACTOR_LEXICAL_FAMILY_LOW_SCORE") |0.3
82
+ }
83
+ }
84
+ }
85
  }
86
+
87
  }
src/interfaces/choice.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import Optional
3
+
4
class IChoice(BaseModel):
    """A single answer option belonging to a multiple-choice question."""

    # Text of the option.
    content: str
    # True when this option is a correct answer.
    is_correct: bool
    # Optional explanation attached to the option; omitted by default.
    explanation: Optional[str] = None
src/interfaces/evaluation.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass, field, asdict
3
+ from typing import List, Optional, Dict, Any
4
+
5
+ from src.enums import QuestionTypeEnum
6
+ from src.interfaces.choice import IChoice
7
+
8
+
9
@dataclass
class GeneratedQuestion:
    """A generated question together with the inputs it was generated from.

    Fields without a default are declared first: ``dataclass`` rejects a
    required field that follows a defaulted one (it raises ``TypeError`` while
    the class is being created), which is what the previous field order did.
    Construct instances with keyword arguments.
    """

    # --- Required fields -------------------------------------------------
    # Source paragraph the question refers to, if any.
    paragraph: Optional[str]
    # Number of answer options expected per question.
    num_ans_per_question: int
    # Number of questions requested in the generation call.
    num_question: int

    # Question text.
    content: str
    # Kind of question (see QuestionTypeEnum).
    type: QuestionTypeEnum

    # --- Optional fields (each instance gets its own fresh container) ----
    # Words the question was generated from.
    list_words: List[str] = field(default_factory=list)
    # Answer options.
    choices: List[IChoice] = field(default_factory=list)
    # Free-form tags.
    tags: List[str] = field(default_factory=list)

    # Optional extra metadata (CEFR level, grade, ...).
    metadata: Dict[str, Any] = field(default_factory=dict)
src/services/AI/false_ans_generator.py CHANGED
@@ -87,6 +87,12 @@ class FalseAnswerGenerator:
87
  tuple[list[str], list[str]]: sentence model embedding of answer and distractors.
88
  """
89
  return self._sentence_model.encode([answer]), self._sentence_model.encode(distractors)
 
 
 
 
 
 
90
 
91
  def filter_output(self, orig, dummies):
92
  """Filter out final answers.
@@ -273,7 +279,7 @@ class FalseAnswerGenerator:
273
  correct_words: list[str],
274
  num_distractors: int = 3,
275
  sim_min: float = 0.25,
276
- sim_max: float = 0.75,
277
  balance_threshold: float = 0.2
278
  ):
279
  """
 
87
  tuple[list[str], list[str]]: sentence model embedding of answer and distractors.
88
  """
89
  return self._sentence_model.encode([answer]), self._sentence_model.encode(distractors)
90
+
91
def get_embedding_list_word(self, word_list: list[str]):
    """Encode every string in ``word_list`` with the sentence model.

    Args:
        word_list: Words or sentences to embed.

    Returns:
        Whatever ``self._sentence_model.encode`` returns for a list of
        strings (one embedding per input string).
    """
    # Pass the list itself: wrapping it in another list would hand the model
    # a single nested item instead of one string per word.
    return self._sentence_model.encode(word_list)
96
 
97
  def filter_output(self, orig, dummies):
98
  """Filter out final answers.
 
279
  correct_words: list[str],
280
  num_distractors: int = 3,
281
  sim_min: float = 0.25,
282
+ sim_max: float = 0.8,
283
  balance_threshold: float = 0.2
284
  ):
285
  """
src/services/eval.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import spacy
5
+ from loaders.elastic import Elastic
6
+ from env import config
7
+ import language_tool_python
8
+ import re
9
+ from collections import defaultdict
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+
12
+ from services.AI.false_ans_generator import FalseAnswerGenerator
13
+ from src.interfaces.evaluation import GeneratedQuestion
14
+ from src.enums import QuestionTypeEnum
15
+
16
+
17
class QuestionQualityEvaluator:
    """Score a generated multiple-choice question on three axes.

    The axes are structure (missing parts, duplicates, grammar), popularity
    (average CEFR level of the words used, looked up in Elasticsearch) and
    distractor quality (POS / lexical family, embedding similarity, paragraph
    difficulty). ``evaluate`` combines them with the configured weights.
    """

    INDEX = "vocabulary"

    # Forms of "to be" removed from a choice before POS / lemma analysis.
    # Compiled once instead of on every call.
    _TO_BE_RE = re.compile(
        r'\b(has been|have been|had been|will be|am|is|are|was|were|be|being|been|\'s|\'re|\'m)\b',
        flags=re.IGNORECASE,
    )

    # (threshold key / level name, score) pairs, in ascending order. An
    # average similarity above the last threshold is "excellent" and scores 1.0.
    _EMBEDDING_BANDS = (
        ("too_different", 0.2),
        ("moderate", 0.4),
        ("good", 0.6),
        ("strong", 0.8),
    )

    def __init__(self, config: dict):
        """Load the NLP tools and cache the evaluation settings.

        Args:
            config: Application config. The evaluation section is read from
                ``config["evaluation"]``; the misspelled key ``"evalution"``
                (as currently written in env.py) is accepted as a fallback.

        Raises:
            KeyError: If the evaluation section or one of its parts is missing.
        """
        self.config = config
        self._grammar_tool = language_tool_python.LanguageTool('en-US')
        self.nlp = spacy.load("en_core_web_sm")

        eval_cfg = config.get("evaluation") or config.get("evalution")
        if eval_cfg is None:
            raise KeyError("evaluation")

        # Cache the sub-sections for readability.
        self.weights = eval_cfg["weights"]
        self.penalties = eval_cfg["penalty_for_error"]["structure"]
        self.distractor_cfg = eval_cfg["distractor"]

        # Created lazily by _get_embedder().
        self._embedder = None

    def evaluate(self, q: "GeneratedQuestion", check_by_ai: bool = False) -> Dict[str, Any]:
        """Evaluate ``q`` and return its score, per-axis issues and suggestions.

        Args:
            q: Question to evaluate.
            check_by_ai: When True, the ``ai_adjust_factor`` weight is added
                to the normalising denominator.

        Returns:
            A dict with ``score`` (rounded up to one decimal), ``issues`` (one
            entry per axis) and ``suggestions``.
        """
        # 1. Structure
        s_score, s_issues, s_suggestions = self._check_structure(q)
        # 2. Popularity
        p_score = self._check_popularity(q)
        # 3. Distractor
        d_score, d_issues = self._check_distractors(q)

        all_issues: List[Dict[str, Any]] = [
            {"field": "structure", "score": s_score, "issues": s_issues},
            {"field": "popularity", "score": p_score, "issues": []},
            {"field": "distractor", "score": d_score, "issues": d_issues},
        ]

        final_score = self._weighted_final_score(s_score, p_score, d_score, check_by_ai)

        # Round away float noise first so e.g. 0.7000000001 is not bumped to 0.8.
        rounded_score = math.ceil(round(final_score * 10, 6)) / 10

        return {
            "score": min(round(rounded_score, 1), 10.0),
            "issues": all_issues,
            "suggestions": sorted(set(s_suggestions)),
        }

    def _weighted_final_score(self, s_score: float, p_score: float, d_score: float,
                              check_by_ai: bool = False) -> float:
        """Return the weighted mean of the three axis scores.

        The denominator is the sum of the three axis weights, plus
        ``ai_adjust_factor`` only when ``check_by_ai`` is True. Returns 0.0
        when the total weight is not positive instead of dividing by zero.
        """
        w = self.weights
        total_weight = w["structure"] + w["popularity"] + w["distractor"]
        if check_by_ai:
            total_weight += w["ai_adjust_factor"]
        if total_weight <= 0:
            return 0.0

        weighted = (
            s_score * w["structure"]
            + p_score * w["popularity"]
            + d_score * w["distractor"]
        )
        return weighted / total_weight

    def _check_structure(self, q: "GeneratedQuestion"):
        """Check the question text and choices for structural problems.

        Starts from 1.0 and subtracts the configured penalty for each problem
        found. Penalty keys follow the names defined in env.py
        (``missing_choice``, ``empty_choice``, ``grammar_error``, ...).

        Returns:
            ``(score, issues, suggestions)`` with ``score`` clamped at 0.0.
        """
        issues: List[Any] = []
        suggestions: List[str] = []
        score = 1.0

        # Question text
        if not q.content or not q.content.strip():
            issues.append("missing_question_text")
            score -= self.penalties["missing_question_text"]
        else:
            grammar_count, grammar_msgs = self._check_grammar(q.content)
            if grammar_count > 0:
                issues.append({
                    "type": "question_grammar_error",
                    "count": grammar_count,
                    "details": grammar_msgs,
                })
                score -= grammar_count * self.penalties["grammar_error"]

        # Choices
        if not q.choices:
            issues.append("missing_choices")
            score -= self.penalties["missing_choice"]
            return max(score, 0.0), issues, suggestions

        empty_count = 0
        contents: List[str] = []
        has_correct = False

        for choice in q.choices:
            content = (choice.content or "").strip()
            if not content:
                empty_count += 1
                continue
            contents.append(content)
            # Only non-empty choices can count as the correct answer.
            if choice.is_correct:
                has_correct = True

        if empty_count > 0:
            issues.append(f"{empty_count}_empty_choices")
            # Penalty scales with the share of empty choices.
            score -= self.penalties["empty_choice"] * (empty_count / len(q.choices))

        if len(set(contents)) < len(contents):
            issues.append("duplicated_choices")
            score -= self.penalties["duplicated_choices"]

        if not has_correct:
            issues.append("no_correct_answer")
            score -= self.penalties["no_correct_answer"]

        for content in contents:
            grammar_count, grammar_msgs = self._check_grammar(content)
            if grammar_count > 0:
                issues.append({
                    "type": "choice_grammar_error",
                    "choice": content,
                    "count": grammar_count,
                    "details": grammar_msgs,
                })
                score -= grammar_count * self.penalties["grammar_error"]

        return max(score, 0.0), issues, suggestions

    def _check_popularity(self, q: "GeneratedQuestion") -> float:
        """Score the vocabulary of the question by its average CEFR level.

        Whitespace-separated, lower-cased words from the question text and all
        choices are looked up in the ``vocabulary`` index; words that are not
        found (or have no CEFR value) count as level 4.0.

        Returns:
            ``max(0, (avg_cefr - 1) / 5)`` rounded to 3 decimals, or 0.0 when
            there are no words at all.
        """
        # The question text may be missing; the structure check reports that.
        unique_words = set((q.content or "").lower().split())
        for choice in q.choices or []:
            unique_words.update((choice.content or "").lower().split())

        if not unique_words:
            return 0.0

        es = Elastic()
        resp = es.search(
            index=self.INDEX,
            size=0,
            query={"terms": {"word.keyword": list(unique_words)}},
            aggs={
                "by_word": {
                    "terms": {"field": "word.keyword", "size": len(unique_words)},
                    "aggs": {"cefr_level": {"avg": {"field": "cefr"}}},
                }
            },
        )

        word_cefr_map = {
            bucket["key"].lower(): bucket["cefr_level"]["value"] or 4.0
            for bucket in resp["aggregations"]["by_word"]["buckets"]
        }

        total = sum(word_cefr_map.get(word, 4.0) for word in unique_words)
        avg_cefr = total / len(unique_words)

        # Harder words (higher CEFR) give a higher score.
        popularity_score = max(0.0, (avg_cefr - 1) / 5.0)
        return round(popularity_score, 3)

    @staticmethod
    def _level_for_embedding_score(emb_score: float) -> str:
        """Map a bucketed embedding score (0.2 ... 1.0) back to its level name."""
        for level, band_score in QuestionQualityEvaluator._EMBEDDING_BANDS:
            if emb_score <= band_score:
                return level
        return "excellent"

    def _check_distractors(self, q: "GeneratedQuestion"):
        """Average the distractor sub-scores that apply to this question type.

        Returns:
            ``(score, issues)`` where ``score`` is the mean of the applicable
            sub-scores (0.0 if none apply), rounded to 3 decimals, and
            ``issues`` holds one entry per sub-score plus a summary.
        """
        issues: List[Dict[str, Any]] = []
        scores: List[float] = []

        # 1. POS & lexical family
        pos_score = self._check_pos_and_meaning_of_choice(q)
        if pos_score is not None:
            scores.append(pos_score)
            issues.append({"type": "pos_lexical_family", "score": round(pos_score, 3)})

        # 2. Embedding similarity
        emb_score = self._cal_score_embedding_similarity(q)
        if emb_score is not None:
            scores.append(emb_score)
            issues.append({
                "type": "embedding_similarity",
                "score": round(emb_score, 3),
                # emb_score is already bucketed, so derive the label from the
                # bucket rather than comparing it with similarity thresholds.
                "level": self._level_for_embedding_score(emb_score),
            })

        # 3. Paragraph difficulty
        para_score = self._cal_score_for_paragraph(q)
        if para_score is not None:
            scores.append(para_score)
            p_cfg = self.distractor_cfg["paragraph"]
            # Approximate the difficulty level (on the 1-5 scale) back from the
            # combined score.
            diff_part = (para_score - p_cfg["length_weight"]) / p_cfg["difficulty_weight"] * 5
            level = "direct_match" if diff_part < 2 else "paraphrase" if diff_part < 4 else "inference"
            issues.append({
                "type": "paragraph_difficulty",
                "score": round(para_score, 3),
                "level": level,
            })

        final_score = sum(scores) / len(scores) if scores else 0.0
        if scores:
            issues.append({
                "type": "distractor_summary",
                "score": round(final_score, 3),
                "components": len(scores),
            })

        return round(final_score, 3), issues

    def _check_grammar(self, text: str, max_errors: int = 2):
        """Run the grammar tool on ``text`` and report the serious matches.

        Texts shorter than 5 characters (after stripping) are not checked.
        Only matches whose issue type is ``grammar`` or ``misspelling`` are
        kept, ignoring the sentence-start capitalisation rule.

        Args:
            text: Text to check.
            max_errors: Maximum number of matches reported (and counted).

        Returns:
            ``(count, details)`` where ``details`` is a list of dicts with the
            message, rule id, offending text and up to three suggestions.
        """
        if not text or len(text.strip()) < 5:
            return 0, []

        matches = self._grammar_tool.check(text)
        serious_matches = [
            m for m in matches
            if m.ruleIssueType in {"grammar", "misspelling"}
            and not m.ruleId.startswith("UPPERCASE_SENTENCE_START")
        ]

        error_messages = [
            {
                "message": m.message,
                "rule": m.ruleId,
                "error_text": text[m.offset:m.offset + m.errorLength],
                "suggestions": m.replacements[:3],
            }
            for m in serious_matches[:max_errors]
        ]
        return len(error_messages), error_messages

    def _check_pos_and_meaning_of_choice(self, q: "GeneratedQuestion") -> Optional[float]:
        """Score how well the choices match in part of speech / word family.

        Pronunciation and stress questions always score 1.0. Each empty choice
        deducts ``empty_choice_deduction``. If any choice is still more than
        one word after removing forms of "to be", only the deductions apply;
        otherwise the result is multiplied by ``lexical_family_difficulty``.
        """
        if q.type in {QuestionTypeEnum.PRONUNCIATION, QuestionTypeEnum.STRESS}:
            return 1.0

        cleaned_choices: List[str] = []
        score = 1.0

        for c in q.choices or []:
            content = (c.content or "").strip()
            if not content:
                score -= self.distractor_cfg["empty_choice_deduction"]
                continue
            cleaned = self._TO_BE_RE.sub("", content)
            cleaned_choices.append(" ".join(cleaned.split()).lower())

        # A score below zero is meaningless; many empty choices bottom out at 0.
        score = max(score, 0.0)

        if any(len(t.split()) > 1 for t in cleaned_choices):
            return score

        tokens = [token for text in cleaned_choices for token in self.nlp(text)]

        return score * self.lexical_family_difficulty(tokens, q.num_ans_per_question or 4)

    def _get_embedder(self):
        """Return a shared FalseAnswerGenerator, creating it on first use."""
        embedder = getattr(self, "_embedder", None)
        if embedder is None:
            embedder = FalseAnswerGenerator()
            self._embedder = embedder
        return embedder

    def _cal_score_embedding_similarity(self, q: "GeneratedQuestion") -> Optional[float]:
        """Score distractors by their embedding similarity to the correct answer.

        Applies to synonym, antonym and vocabulary questions only (``None``
        otherwise). The mean cosine similarity over every (correct, distractor)
        pair is bucketed with ``embedding_similarity_thresholds`` into
        0.2 / 0.4 / 0.6 / 0.8 / 1.0. Returns 0.0 when either side is missing.
        """
        if q.type not in {QuestionTypeEnum.SYNONYM, QuestionTypeEnum.ANTONYM, QuestionTypeEnum.VOCAB}:
            return None

        choices = q.choices or []
        correct = [c.content for c in choices if c.is_correct]
        distractors = [c.content for c in choices if not c.is_correct]
        if not correct or not distractors:
            return 0.0

        ai = self._get_embedder()
        emb_correct = ai.get_embedding_list_word(correct)
        emb_dist = ai.get_embedding_list_word(distractors)

        similarities = [
            cosine_similarity(c.reshape(1, -1), d.reshape(1, -1))[0][0]
            for c in emb_correct for d in emb_dist
        ]
        if not similarities:
            return 0.0

        avg_sim = sum(similarities) / len(similarities)
        thresholds = self.distractor_cfg["embedding_similarity_thresholds"]

        for level, band_score in self._EMBEDDING_BANDS:
            if avg_sim <= thresholds[level]:
                return band_score
        return 1.0

    def _cal_score_for_paragraph(self, q: "GeneratedQuestion") -> Optional[float]:
        """Score paragraph-based questions by paragraph length and answer difficulty.

        Applies to vocab, fact, main-idea, inference and purpose questions
        (``None`` otherwise). Returns 0.0 when there is no correct answer or no
        paragraph. The difficulty part depends on the highest cosine similarity
        between the correct answer and any sentence of the paragraph: a high
        similarity means the answer is stated almost verbatim (easiest level).
        """
        if q.type not in {
            QuestionTypeEnum.VOCAB, QuestionTypeEnum.FACT,
            QuestionTypeEnum.MAIN_IDEA, QuestionTypeEnum.INFERENCE,
            QuestionTypeEnum.PURPOSE,
        }:
            return None

        correct_answer = next((c.content for c in q.choices or [] if c.is_correct), None)
        if not correct_answer or not q.paragraph:
            return 0.0

        word_count = len(q.paragraph.lower().split())
        p_cfg = self.distractor_cfg["paragraph"]

        # Length score
        if q.type == QuestionTypeEnum.VOCAB:
            thresholds = p_cfg["vocab_length_thresholds"]
            length_scores = [0.2, 0.3, 0.4, 0.5]
        else:
            thresholds = p_cfg["other_length_thresholds"]
            length_scores = [0.3, 0.5, 0.7, 0.9, 1.0]

        # Paragraphs longer than every threshold get the last score.
        length_score = next(
            (sc for thresh, sc in zip(thresholds, length_scores) if word_count <= thresh),
            length_scores[-1],
        )

        # Difficulty score
        doc = self.nlp(q.paragraph)
        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
        if not sentences:
            return length_score * p_cfg["length_weight"]

        ai = self._get_embedder()
        sent_embs = ai.get_embedding_list_word(sentences)
        ans_emb = ai.get_embedding_list_word([correct_answer])

        cos_scores = cosine_similarity(ans_emb, sent_embs)[0]
        max_sim = float(max(cos_scores)) if cos_scores.size else 0.0

        levels = p_cfg["difficulty_levels"]
        if max_sim >= p_cfg["direct_match_sim"]:
            diff_val = levels[0]
        elif max_sim >= p_cfg["paraphrase_sim"]:
            diff_val = levels[1]
        else:
            diff_val = levels[2]

        diff_score = diff_val / 5.0

        return p_cfg["length_weight"] * length_score + p_cfg["difficulty_weight"] * diff_score

    def group_by_lemma(self, tokens):
        """Group tokens by their lower-cased lemma."""
        groups = defaultdict(list)
        for t in tokens:
            groups[t.lemma_.lower()].append(t)
        return groups

    def group_by_pos(self, tokens):
        """Group tokens by their coarse part-of-speech tag."""
        groups = defaultdict(list)
        for t in tokens:
            groups[t.pos_].append(t)
        return groups

    def lexical_family_difficulty(self, tokens, num_ans_per_question: int = 4) -> float:
        """Rate how closely the choice tokens share a lemma or part of speech.

        ``lemma_ratio`` is the share of tokens that sit in a lemma group of at
        least 3; ``pos_ratio`` is the share in a POS group of at least
        ``min(num_ans_per_question, 3)``. The ratios are compared with the
        configured ``lexical_family`` thresholds, first match wins.

        Returns:
            The configured score for the matching band; the ``low`` score when
            there are no tokens or nothing matches.
        """
        cfg = self.distractor_cfg["lexical_family"]
        s = cfg["scores"]
        if not tokens:
            return s["low"]

        lemma_groups = self.group_by_lemma(tokens)
        pos_groups = self.group_by_pos(tokens)
        n = len(tokens)

        lemma_ratio = sum(len(v) for v in lemma_groups.values() if len(v) >= 3) / n

        min_pos_group = min(num_ans_per_question, 3)
        pos_ratio = sum(len(v) for v in pos_groups.values() if len(v) >= min_pos_group) / n

        t = cfg["thresholds"]

        if lemma_ratio >= t["high_lemma"]:
            return s["high_lemma"]
        if pos_ratio >= t["high_pos"]:
            return s["high_pos"]
        if pos_ratio >= t["medium_high_pos"]:
            return s["medium_high_pos"]
        if lemma_ratio >= t["medium_lemma"]:
            return s["medium_lemma"]
        if pos_ratio >= t["medium_both"] and lemma_ratio >= t["medium_both"]:
            return s["medium_both"]
        return s["low"]