Spaces:

kbl2810
/

gen-question

Sleeping

App Files Files Community

linhnguyen02 committed on 19 days ago

Commit

8e72e5b

1 Parent(s): 48a5849

sen2vec false ans

Browse files

Files changed (5) hide show

src/factories/gen_question/types/antonym_question.py +48 -35
src/factories/gen_question/types/synonym_question.py +53 -39
src/factories/gen_question_for_paragraph/types/synthetic.py +2 -3
src/interfaces/question.py +1 -1
src/services/AI/false_ans_generator.py +147 -0

src/factories/gen_question/types/antonym_question.py CHANGED Viewed

@@ -5,11 +5,17 @@ from src.factories.gen_question.types.base import Question, nltk_words
 from src.enums import QuestionTypeEnum
 from src.loaders.elastic import Elastic
 class AntonymsQuestion(Question):
     INDEX = "vocabulary"
     def generate_questions(self, list_words: List[str] = None, num_question: int = 1,
                            num_ans_per_question: int = 4, cefr: int = 3):
@@ -22,48 +28,55 @@ class AntonymsQuestion(Question):
         used_choices = set()
         for _ in range(num_question):
-            max_loop = 100
-            question_word, correct_answer, antonym_set = \
                 self._pick_question_word(list_unique_words, used_words, cefr)
-            pos = self.get_pos({
-                "bool": {
-                    "must": [
-                        {"term": {"word.keyword": question_word.lower()}},
-                        {"term": {"antonyms.keyword": correct_answer.lower()}}
-                    ]
-                }
-            })
-            used_words.update([question_word, correct_answer])
             choices = [correct_answer]
-            while len(choices) < num_ans_per_question and max_loop > 0:
-                doc = self.get_random(self.INDEX, None, cefr=cefr, pos=pos)
-                if not doc:
-                    continue
-                candidate = doc["word"]
-                # Loại trừ điều kiện chung
-                if (
-                    candidate in used_choices or
-                    candidate in used_words or
-                    candidate in antonym_set or
-                    candidate == question_word or
-                    candidate == correct_answer
-                ):
-                    continue
-                # Loại distractor có nghĩa trùng với đáp án
-                syns = set(self.get_list_antonym(candidate))
-                if correct_answer in syns:
-                    continue
-                choices.append(candidate)
-                used_choices.add(candidate)
-                max_loop -= 1
             random.shuffle(choices)
             final_choices = []
@@ -122,7 +135,7 @@ class AntonymsQuestion(Question):
                 continue
             correct = random.choice(valid_syns)
-            return source, correct, set(syns)
         # FALLBACK ES
         while True:
@@ -140,4 +153,4 @@ class AntonymsQuestion(Question):
             if not valid_syns:
                 continue
-            return source, random.choice(valid_syns), set(syns)

 from src.enums import QuestionTypeEnum
 from src.loaders.elastic import Elastic
+from src.services.AI.false_ans_generator import FalseAnswerGenerator
 class AntonymsQuestion(Question):
     INDEX = "vocabulary"
+    false_ans_gen: FalseAnswerGenerator = None
+    def __init__(self):
+        if self.false_ans_gen is None:
+            self.false_ans_gen = FalseAnswerGenerator()
     def generate_questions(self, list_words: List[str] = None, num_question: int = 1,
                            num_ans_per_question: int = 4, cefr: int = 3):
         used_choices = set()
         for _ in range(num_question):
+            question_word, correct_answer = \
                 self._pick_question_word(list_unique_words, used_words, cefr)
+            # max_loop = 100
+            # pos = self.get_pos({
+            #     "bool": {
+            #         "must": [
+            #             {"term": {"word.keyword": question_word.lower()}},
+            #             {"term": {"antonyms.keyword": correct_answer.lower()}}
+            #         ]
+            #     }
+            # })
+            # used_words.update([question_word, correct_answer])
             choices = [correct_answer]
+            # while len(choices) < num_ans_per_question and max_loop > 0:
+            #     doc = self.get_random(self.INDEX, None, cefr=cefr, pos=pos)
+            #     if not doc:
+            #         continue
+            #     candidate = doc["word"]
+            #     # Loại trừ điều kiện chung
+            #     if (
+            #         candidate in used_choices or
+            #         candidate in used_words or
+            #         candidate in antonym_set or
+            #         candidate == question_word or
+            #         candidate == correct_answer
+            #     ):
+            #         continue
+            #     # Loại distractor có nghĩa trùng với đáp án
+            #     syns = set(self.get_list_antonym(candidate))
+            #     if correct_answer in syns:
+            #         continue
+            #     choices.append(candidate)
+            #     used_choices.add(candidate)
+            #     max_loop -= 1
+            distractors = self.false_ans_gen.generate_distractors_from_antonyms(
+                target_word=[correct_answer, question_word],
+                num_false_answers=num_ans_per_question - 1
+            )
+            choices.extend(distractors)
             random.shuffle(choices)
             final_choices = []
                 continue
             correct = random.choice(valid_syns)
+            return source, correct
         # FALLBACK ES
         while True:
             if not valid_syns:
                 continue
+            return source, random.choice(valid_syns)

src/factories/gen_question/types/synonym_question.py CHANGED Viewed

@@ -5,9 +5,16 @@ from src.factories.gen_question.types.base import Question, nltk_words
 from src.enums import QuestionTypeEnum
 from src.loaders.elastic import Elastic
 class SynonymsQuestion(Question):
     INDEX = "vocabulary"
     def generate_questions(self, list_words: List[str] = None, num_question: int = 1,
                            num_ans_per_question: int = 4, cefr: int = 3):
@@ -20,48 +27,53 @@ class SynonymsQuestion(Question):
         used_choices = set()
         for _ in range(num_question):
-            max_loop = 100
-            question_word, correct_answer, synonym_set = \
                 self._pick_question_word(list_unique_words, used_words, cefr)
-            pos = self.get_pos({
-                "bool": {
-                    "must": [
-                        {"term": {"word.keyword": question_word.lower()}},
-                        {"term": {"synonyms.keyword": correct_answer.lower()}}
-                    ]
-                }
-            })
-            used_words.update([question_word, correct_answer])
             choices = [correct_answer]
-            while len(choices) < num_ans_per_question and max_loop > 0:
-                doc = self.get_random(self.INDEX, None, cefr=cefr, pos=pos)
-                if not doc:
-                    continue
-                candidate = doc["word"]
-                # Loại trừ điều kiện chung
-                if (
-                    candidate in used_choices or
-                    candidate in used_words or
-                    candidate in synonym_set or
-                    candidate == question_word or
-                    candidate == correct_answer
-                ):
-                    continue
-                # Loại distractor có nghĩa trùng với đáp án
-                syns = set(self.get_list_synonym(candidate))
-                if correct_answer in syns:
-                    continue
-                choices.append(candidate)
-                used_choices.add(candidate)
-                max_loop -= 1
             random.shuffle(choices)
             final_choices = []
@@ -120,7 +132,8 @@ class SynonymsQuestion(Question):
                 continue
             correct = random.choice(valid_syns)
-            return source, correct, set(syns)
         # FALLBACK ES
         while True:
@@ -138,4 +151,5 @@ class SynonymsQuestion(Question):
             if not valid_syns:
                 continue
-            return source, random.choice(valid_syns), set(syns)

 from src.enums import QuestionTypeEnum
 from src.loaders.elastic import Elastic
+from src.services.AI.false_ans_generator import FalseAnswerGenerator
 class SynonymsQuestion(Question):
     INDEX = "vocabulary"
+    false_ans_gen: FalseAnswerGenerator = None
+    def __init__(self):
+        if self.false_ans_gen is None:
+            self.false_ans_gen = FalseAnswerGenerator()
     def generate_questions(self, list_words: List[str] = None, num_question: int = 1,
                            num_ans_per_question: int = 4, cefr: int = 3):
         used_choices = set()
         for _ in range(num_question):
+            question_word, correct_answer = \
                 self._pick_question_word(list_unique_words, used_words, cefr)
+            # max_loop = 100
+            # pos = self.get_pos({
+            #     "bool": {
+            #         "must": [
+            #             {"term": {"word.keyword": question_word.lower()}},
+            #             {"term": {"synonyms.keyword": correct_answer.lower()}}
+            #         ]
+            #     }
+            # })
+            # used_words.update([question_word, correct_answer])
             choices = [correct_answer]
+            # while len(choices) < num_ans_per_question and max_loop > 0:
+            #     doc = self.get_random(self.INDEX, None, cefr=cefr, pos=pos)
+            #     if not doc:
+            #         continue
+            #     candidate = doc["word"]
+            #     # Loại trừ điều kiện chung
+            #     if (
+            #         candidate in used_choices or
+            #         candidate in used_words or
+            #         candidate in synonym_set or
+            #         candidate == question_word or
+            #         candidate == correct_answer
+            #     ):
+            #         continue
+            #     # Loại distractor có nghĩa trùng với đáp án
+            #     syns = set(self.get_list_synonym(candidate))
+            #     if correct_answer in syns:
+            #         continue
+            #     choices.append(candidate)
+            #     used_choices.add(candidate)
+            #     max_loop -= 1
+            distractors = self.false_ans_gen.generate_distractors_from_synonyms(
+                target_word=[correct_answer, question_word],
+                num_false_answers=num_ans_per_question - 1
+            )
+            choices.extend(distractors)
             random.shuffle(choices)
             final_choices = []
                 continue
             correct = random.choice(valid_syns)
+            return source, correct
+        # return source, correct, set(syns)
         # FALLBACK ES
         while True:
             if not valid_syns:
                 continue
+            return source, random.choice(valid_syns)
+            # return source, random.choice(valid_syns), set(syns)

src/factories/gen_question_for_paragraph/types/synthetic.py CHANGED Viewed

@@ -26,14 +26,13 @@ class ParagraphQuestion(Question):
             num = question_data.num_question
             type_to_total_count[qtype] = type_to_total_count.get(qtype, 0) + num
-        final_output = {}
         for qtype, total_count in type_to_total_count.items():
             prompt = type_to_prompt_map.get(qtype)
             if not prompt:
                 continue
             content_user = (
-                f"PARAGRAPH: {data.description}\n"
                 f"QUESTION_COUNT: {total_count}\n"
                 f"OPTIONS_PER_QUESTION: {data.num_ans_per_question}\n"
             )
@@ -60,7 +59,7 @@ class ParagraphQuestion(Question):
             for question in data.get("list_questions", []):
                 result.append({
-                    "question": question.get("question"),
                     "type": qtype,
                     "choices": question.get("choices", []),
                     "answer": question.get("answer"),

             num = question_data.num_question
             type_to_total_count[qtype] = type_to_total_count.get(qtype, 0) + num
         for qtype, total_count in type_to_total_count.items():
             prompt = type_to_prompt_map.get(qtype)
             if not prompt:
                 continue
             content_user = (
+                f"PARAGRAPH: {data.paragraph}\n"
                 f"QUESTION_COUNT: {total_count}\n"
                 f"OPTIONS_PER_QUESTION: {data.num_ans_per_question}\n"
             )
             for question in data.get("list_questions", []):
                 result.append({
+                    "content": question.get("question"),
                     "type": qtype,
                     "choices": question.get("choices", []),
                     "answer": question.get("answer"),

src/interfaces/question.py CHANGED Viewed

@@ -15,7 +15,7 @@ class IQuestionConfig(BaseModel):
     num_question: int = Field(..., ge=1, le=5)
 class ICreateQuestionForParagraph(BaseModel):
-    description: Text
     num_ans_per_question: int = Field(..., ge=2, le=6)
     list_create_question: List[IQuestionConfig]

     num_question: int = Field(..., ge=1, le=5)
 class ICreateQuestionForParagraph(BaseModel):
+    paragraph: Text
     num_ans_per_question: int = Field(..., ge=2, le=6)
     list_create_question: List[IQuestionConfig]

src/services/AI/false_ans_generator.py CHANGED Viewed

@@ -198,3 +198,150 @@ class FalseAnswerGenerator:
                     all_answers.append(results)
         return crct_ans, sum(all_answers, [])

                     all_answers.append(results)
         return crct_ans, sum(all_answers, [])
+    def generate_distractors_from_synonyms(
+        self,
+        correct_words: list[str],
+        num_distractors: int = 3,
+        sim_min: float = 0.35,
+        sim_max: float = 0.75
+    ):
+        """
+        Generate distractors for synonym questions.
+        Input: 2 correct synonymous words
+        Output: distractors semantically related but NOT synonyms
+        """
+        assert len(correct_words) == 2, "Require exactly 2 correct synonyms"
+        w1, w2 = [w.lower().strip() for w in correct_words]
+        candidates = set()
+        # -------- 1. Collect candidates from sense2vec ----------
+        for w in [w1, w2]:
+            sense = self._s2v.get_best_sense(w.replace(" ", "_"))
+            if sense and sense in self._s2v:
+                sims = self._s2v.most_similar(sense, n=30)
+                formatted = change_format(sims)
+                candidates.update(formatted)
+        # Remove originals
+        candidates = {
+            c for c in candidates
+            if c.lower() not in {w1, w2}
+        }
+        if not candidates:
+            return []
+        candidates = list(candidates)
+        # -------- 2. Sentence embedding ----------
+        emb_correct = self._sentence_model.encode(correct_words)
+        emb_candidates = self._sentence_model.encode(candidates)
+        # similarity to each correct word
+        sim_1 = cosine_similarity(emb_candidates, emb_correct[0].reshape(1, -1))
+        sim_2 = cosine_similarity(emb_candidates, emb_correct[1].reshape(1, -1))
+        final_candidates = []
+        for idx, word in enumerate(candidates):
+            s1 = sim_1[idx][0]
+            s2 = sim_2[idx][0]
+            # loại bỏ các từ quá giống
+            if max(s1, s2) > sim_max:
+                continue
+            # loại bỏ các từ quá khác
+            if max(s1, s2) < sim_min:
+                continue
+            final_candidates.append((word, max(s1, s2)))
+        chosen = random.sample(
+            final_candidates,
+            k=min(num_distractors, len(final_candidates))
+        )
+        return [w.capitalize() for w, _ in chosen]
+    def generate_distractors_from_antonyms(
+        self,
+        correct_words: list[str],
+        num_distractors: int = 3,
+        sim_min: float = 0.25,
+        sim_max: float = 0.75,
+        balance_threshold: float = 0.2
+    ):
+        """
+        Generate distractors for antonym questions.
+        Input: 2 opposite words
+        Output: neutral / intermediate distractors
+        """
+        assert len(correct_words) == 2, "Require exactly 2 antonyms"
+        w1, w2 = [w.lower().strip() for w in correct_words]
+        candidates = set()
+        # -------- 1. Collect candidates from both antonyms ----------
+        for w in [w1, w2]:
+            sense = self._s2v.get_best_sense(w.replace(" ", "_"))
+            if sense and sense in self._s2v:
+                sims = self._s2v.most_similar(sense, n=40)
+                candidates.update(change_format(sims))
+        # Remove originals
+        candidates = {
+            c for c in candidates
+            if c.lower() not in {w1, w2}
+        }
+        if not candidates:
+            return []
+        candidates = list(candidates)
+        # -------- 2. Sentence embedding ----------
+        emb_correct = self._sentence_model.encode(correct_words)
+        emb_candidates = self._sentence_model.encode(candidates)
+        sim_1 = cosine_similarity(emb_candidates, emb_correct[0].reshape(1, -1))
+        sim_2 = cosine_similarity(emb_candidates, emb_correct[1].reshape(1, -1))
+        final_candidates = []
+        for idx, word in enumerate(candidates):
+            s1 = sim_1[idx][0]
+            s2 = sim_2[idx][0]
+            # quá gần một cực → loại
+            if max(s1, s2) > sim_max:
+                continue
+            # quá xa cả hai → loại
+            if max(s1, s2) < sim_min:
+                continue
+            # không cân bằng → nghiêng hẳn về 1 phía
+            if abs(s1 - s2) > balance_threshold:
+                continue
+            final_candidates.append(
+                (word, (s1 + s2) / 2)
+            )
+        if not final_candidates:
+            return []
+        chosen = random.sample(
+            final_candidates,
+            k=min(num_distractors, len(final_candidates))
+        )
+        return [w.capitalize() for w, _ in chosen]