Spaces:

kbl2810
/

gen-question

Running

App Files Files Community

linhnguyen02 committed on Nov 17, 2025

Commit

7a13735

1 Parent(s): ee409a4

update tu dong nghia va trai nghia

Browse files

Files changed (5) hide show

env.py +4 -0
src/factories/gen_question/types/antonym_question.py +100 -75
src/factories/gen_question/types/base.py +83 -19
src/factories/gen_question/types/synonym_question.py +97 -74
src/loaders/elastic.py +15 -0

env.py CHANGED Viewed

@@ -24,5 +24,9 @@ config = {
     },
     "google": {
         "api_key": os.getenv("GOOGLE_API_KEY"),
     }
 }

     },
     "google": {
         "api_key": os.getenv("GOOGLE_API_KEY"),
+    },
+    "elastic": {
+        "url": os.getenv("ELASTIC_URL"),
+        "api_key": os.getenv("ELASTIC_API_KEY")
     }
 }

src/factories/gen_question/types/antonym_question.py CHANGED Viewed

@@ -4,110 +4,135 @@ import random
 from src.factories.gen_question.types.base import Question, nltk_words
 from src.enums import QuestionTypeEnum
-class AntonymsQuestion(Question):
-    """
-    This class generates multiple-choice questions that ask the user
-    to select an antonym for a given word.
-    It uses dictionary data (from fetch_word_data) to retrieve
-    meanings and antonyms. If the input list is empty or invalid,
-    it falls back to randomly chosen words from a built-in word list (nltk_words).
-    """
-    def generate_questions(self, list_words: List[str] = None, num_question: int = 1, num_ans_per_question: int = 4):
-        if list_words is None:
-            list_words = []
-        result = []
         list_unique_words = set(list_words)
-        # Internal helper function to get a valid question/answer pair
-        def get_question_and_answer():
-            """
-            Randomly selects a word and finds one of its antonyms.
-            Returns:
-                tuple(str, str): question_word, antonym_answer
-            """
-            # Try from provided list word
-            while list_unique_words:
-                source_word = random.sample(list(list_unique_words), 1)[0]
-                list_unique_words.remove(source_word)
-                antonym_word = self.get_antonym(source_word)
-                if antonym_word in list_unique_words:
-                    list_unique_words.remove(antonym_word)
-                if antonym_word:
-                    return source_word, antonym_word
-            # Fallback: use nltk_words
-            while True:
-                source_word = random.choice(nltk_words)
-                antonym_word = self.get_antonym(source_word)
-                if antonym_word:
-                    return source_word, antonym_word
         for _ in range(num_question):
-            question_word, correct_answer = get_question_and_answer()
             choices = [correct_answer]
-            distractor_set = set()
-            while len(choices) < num_ans_per_question:
-                distractor_word = random.choice(nltk_words)
-                if (distractor_word.lower() != correct_answer.lower() and
-                    distractor_word.lower() != question_word.lower() and
-                    distractor_word.lower() not in distractor_set):
-                    distractor_set.add(distractor_word)
-                    choices.append(distractor_word)
             random.shuffle(choices)
             result.append({
                 "question": question_word,
-                "type": QuestionTypeEnum.ANTONYM,
                 "choices": choices,
                 "answer": choices.index(correct_answer),
-                "explain": [],
             })
         return result
-    def get_antonym(self, word: str):
         """
-        Retrieves a random antonym for the given word using dictionary API data.
-        It checks both the 'meanings.antonyms' and 'meanings.definitions.antonyms' fields.
-        Args:
-            word (str): The input word to find an antonym for.
-        Returns:
-            str or None: An antonym if found, else None.
-        """
-        data = self.fetch_word_data(word)
-        if not data:
-            return None
-        meanings = data.get("meanings", [])
-        # Randomly search for antonyms in the meaning entries
-        while meanings:
-            meaning = random.sample(meanings, 1)[0]
-            # Try top-level antonyms
-            antonyms = meaning.get("antonyms", [])
-            # Also check antonyms inside definitions
-            if not antonyms:
-                definitions = meaning.get("definitions", [])
-                for definition in definitions:
-                    antonyms.extend(definition.get("antonyms", []))
-            if antonyms:
-                return random.choice(antonyms)
-            meanings.remove(meaning)
-        return None

 from src.factories.gen_question.types.base import Question, nltk_words
 from src.enums import QuestionTypeEnum
+from src.loaders.elastic import Elastic
+class AntonymsQuestion(Question):
+    INDEX = "vocabulary"
+    def generate_questions(self, list_words: List[str] = None, num_question: int = 1,
+                           num_ans_per_question: int = 4, cefr: int = 3):
+        list_words = list_words or []
         list_unique_words = set(list_words)
+        result = []
+        used_words = set()
+        used_choices = set()
         for _ in range(num_question):
+            max_loop = 100
+            question_word, correct_answer, antonym_set = \
+                self._pick_question_word(list_unique_words, used_words, cefr)
+            pos = self.get_pos({
+                "bool": {
+                    "must": [
+                        {"term": {"word.keyword": question_word.lower()}},
+                        {"term": {"antonyms.keyword": correct_answer.lower()}}
+                    ]
+                }
+            })
+            used_words.update([question_word, correct_answer])
             choices = [correct_answer]
+            while len(choices) < num_ans_per_question and max_loop > 0:
+                doc = self.get_random(self.INDEX, None, cefr=cefr, pos=pos)
+                if not doc:
+                    continue
+                candidate = doc["word"]
+                # Loại trừ điều kiện chung
+                if (
+                    candidate in used_choices or
+                    candidate in used_words or
+                    candidate in antonym_set or
+                    candidate == question_word or
+                    candidate == correct_answer
+                ):
+                    continue
+                # Loại distractor có nghĩa trùng với đáp án
+                syns = set(self.get_list_antonym(candidate))
+                if correct_answer in syns:
+                    continue
+                choices.append(candidate)
+                used_choices.add(candidate)
+                max_loop -= 1
             random.shuffle(choices)
             result.append({
                 "question": question_word,
+                "type": QuestionTypeEnum.SYNONYM,
                 "choices": choices,
                 "answer": choices.index(correct_answer),
+                "explain": []
             })
         return result
+    # -----------------------------------------------------
+    # Lấy tất cả antonym của 1 từ từ ES (nhiều nghĩa)
+    # -----------------------------------------------------
+    def get_list_antonym(self, word: str):
+        es = Elastic()
+        query = {"term": {"word.keyword": word.lower()}}
+        resp = es.search(index=self.INDEX, query=query, size=1000)
+        hits = resp["hits"]["hits"]
+        if not hits:
+            return []
+        antonyms = set()
+        for h in hits:
+            s = h["_source"].get("antonyms", [])
+            antonyms.update(s)
+        return list(antonyms)
+    # -----------------------------------------------------
+    # Lấy 1 từ làm câu hỏi và 1 antonym làm đáp án
+    # -----------------------------------------------------
+    def _pick_question_word(self, list_unique_words, used_words, cefr):
+        """
+        - Ưu tiên lấy từ danh sách đầu vào
+        - Nếu hết → lấy từ ES random theo CEFR
         """
+        # ƯU TIÊN INPUT LIST
+        while list_unique_words:
+            source = list_unique_words.pop()
+            if source in used_words:
+                continue
+            syns = self.get_list_antonym(source)
+            valid_syns = [s for s in syns if s not in used_words]
+            if not valid_syns:
+                continue
+            correct = random.choice(valid_syns)
+            return source, correct, set(syns)
+        # FALLBACK ES
+        while True:
+            doc = self.get_random(self.INDEX, None, cefr=cefr)
+            if not doc:
+                continue
+            source = doc["word"]
+            if source in used_words:
+                continue
+            syns = self.get_list_antonym(source)
+            valid_syns = [s for s in syns if s not in used_words]
+            if not valid_syns:
+                continue
+            return source, random.choice(valid_syns), set(syns)

src/factories/gen_question/types/base.py CHANGED Viewed

@@ -1,13 +1,10 @@
 from abc import ABC, abstractmethod
 from typing import Set, Optional
-import requests
-import nltk
-nltk.download('words')
-from nltk.corpus import words
-nltk_words = words.words()
 class Question(ABC):
@@ -15,6 +12,86 @@ class Question(ABC):
     def generate_questions(self, list_words: Set[str], num_questions: int = 1, num_ans_per_question: int = 4):
         pass
     @staticmethod
     def cal_num_word_in_list_available_per_question(
             len_list_words: int,
@@ -22,17 +99,4 @@ class Question(ABC):
             num_ans_per_question: int = 4
     ) -> int:
         return min(len_list_words//num_questions, num_ans_per_question)
-    @staticmethod
-    def fetch_word_data(word: str) -> Optional[dict]:
-        """API get data of word"""
-        try:
-            base_url = "https://api.dictionaryapi.dev/api/v2/entries/en/"
-            resp = requests.get(base_url + word)
-            if resp.status_code == 200:
-                data = resp.json()
-                return data[0]
-            else:
-                return None
-        except Exception as e:
-            return None

 from abc import ABC, abstractmethod
 from typing import Set, Optional
+import random
+from src.loaders.elastic import Elastic
 class Question(ABC):
     def generate_questions(self, list_words: Set[str], num_questions: int = 1, num_ans_per_question: int = 4):
         pass
+    def _build_query(self, query: dict = None, cefr: Optional[int] = None, pos: str = None):
+        must = []
+        if query:
+            must.append(query)
+        if cefr is not None:
+            must.append({
+                "range": {
+                    "cefr": {"gte": cefr - 1, "lte": cefr + 1}
+                }
+            })
+        if pos is not None:
+            must.append({
+                "term": {"pos.keyword": pos}
+            })
+        return {"bool": {"must": must}} if must else {"match_all": {}}
+    # ---------------------------------------
+    # Get 1 random doc from ES using CEFR + filter
+    # ---------------------------------------
+    def get_random(self, index: str, query: dict = None, cefr: int = None, pos: str = None):
+        es = Elastic()
+        q = self._build_query(query, cefr, pos)
+        count = es.count(index=index, query=q)["count"]
+        if count == 0:
+            return None
+        offset = random.randint(0, count - 1)
+        resp = es.search(index=index, query=q, size=1, from_=offset)
+        hits = resp["hits"]["hits"]
+        return hits[0]["_source"] if hits else None
+    # ---------------------------------------
+    # Get first matched doc
+    # ---------------------------------------
+    def get_detail_word(self, index: str, query: dict = None):
+        es = Elastic()
+        q = self._build_query(query)
+        resp = es.search(index=index, query=q, size=1)
+        hits = resp["hits"]["hits"]
+        return hits[0]["_source"] if hits else None
+    def get_cefr_word(self, index: str, word: str):
+        doc = self.get_detail_word(index, {"term": {"word.keyword": word}})
+        return doc.get("cefr") if doc else None
+    # ---------------------------------------
+    def check_valid_cefr(self, target_cefr: int, candidate_cefr: Optional[int]):
+        if candidate_cefr is None:
+            return False
+        return abs(target_cefr - candidate_cefr) <= 1
+    # ---------------------------------------
+    # Get pos of word
+    # ---------------------------------------
+    def get_pos(self, index: str, query: dict = None):
+        list_docs = self.get_list_word(index, query)
+        if len(list_docs) > 0:
+            doc = random.choice(list_docs)
+            if doc and "pos" in doc:
+                return doc["pos"]
+        return "noun"
+    def get_list_word(self, index: str, query: dict = None):
+        es = Elastic()
+        resp = es.search(
+            index=index,
+            query=query,
+            size=1000
+        )
+        hits = resp["hits"]["hits"]
+        return [hit["_source"] for hit in hits] if hits else []
     @staticmethod
     def cal_num_word_in_list_available_per_question(
             len_list_words: int,
             num_ans_per_question: int = 4
     ) -> int:
         return min(len_list_words//num_questions, num_ans_per_question)

src/factories/gen_question/types/synonym_question.py CHANGED Viewed

@@ -3,64 +3,64 @@ import random
 from src.factories.gen_question.types.base import Question, nltk_words
 from src.enums import QuestionTypeEnum
 class SynonymsQuestion(Question):
-    """
-    This class generates multiple-choice questions that ask the user
-    to select a synonym for a given word.
-    It uses dictionary data (from fetch_word_data) to retrieve
-    meanings and synonyms. If the input list is empty or invalid,
-    it falls back to randomly chosen words from a built-in word list (nltk_words).
-    """
-    def generate_questions(self, list_words: List[str] = None, num_question: int = 1, num_ans_per_question: int = 4):
-        if list_words is None:
-            list_words = []
-        result = []
         list_unique_words = set(list_words)
-        # Internal helper function to get a valid question/answer pair
-        def get_question_and_answer():
-            """
-            Randomly selects a word and finds one of its synonyms.
-            Returns:
-                tuple(str, str): question_word, synonym_answer
-            """
-            # Try from provided list word
-            while list_unique_words:
-                source_word = random.sample(list(list_unique_words), 1)[0]
-                list_unique_words.remove(source_word)
-                synonym_word = self.get_synonym(source_word)
-                if synonym_word in list_unique_words:
-                    list_unique_words.remove(source_word)
-                if synonym_word:
-                    return source_word, synonym_word
-            # Fallback: use nltk_words
-            while True:
-                source_word = random.choice(nltk_words)
-                synonym_word = self.get_synonym(source_word)
-                if synonym_word:
-                    return source_word, synonym_word
         for _ in range(num_question):
-            question_word, correct_answer = get_question_and_answer()
             choices = [correct_answer]
-            distractor_set = set()
-            while len(choices) < num_ans_per_question:
-                distractor_word = random.choice(nltk_words)
-                if (distractor_word.lower() != correct_answer.lower() and
-                    distractor_word.lower() != question_word.lower() and
-                    distractor_word.lower() not in distractor_set):
-                    distractor_set.add(distractor_word)
-                    choices.append(distractor_word)
             random.shuffle(choices)
@@ -69,45 +69,68 @@ class SynonymsQuestion(Question):
                 "type": QuestionTypeEnum.SYNONYM,
                 "choices": choices,
                 "answer": choices.index(correct_answer),
-                "explain": [],
             })
         return result
-    def get_synonym(self, word: str):
         """
-        Retrieves a random synonym for the given word using dictionary API data.
-        It checks both the 'meanings.synonyms' and 'meanings.definitions.synonyms' fields.
-        Args:
-            word (str): The input word to find a synonym for.
-        Returns:
-            str or None: A synonym if found, else None.
-        """
-        data = self.fetch_word_data(word)
-        if not data:
-            return None
-        meanings = data.get("meanings", [])
-        # Randomly search for synonyms in the meaning entries
-        while meanings:
-            meaning = random.sample(meanings, 1)[0]
-            # Try top-level synonyms
-            synonyms = meaning.get("synonyms", [])
-            # Also check synonyms inside definitions
-            if not synonyms:
-                definitions = meaning.get("definitions", [])
-                for definition in definitions:
-                    synonyms.extend(definition.get("synonyms", []))
-            if synonyms:
-                return random.choice(synonyms)
-            meanings.remove(meaning)
-        return None

 from src.factories.gen_question.types.base import Question, nltk_words
 from src.enums import QuestionTypeEnum
+from src.loaders.elastic import Elastic
 class SynonymsQuestion(Question):
+    INDEX = "vocabulary"
+    def generate_questions(self, list_words: List[str] = None, num_question: int = 1,
+                           num_ans_per_question: int = 4, cefr: int = 3):
+        list_words = list_words or []
         list_unique_words = set(list_words)
+        result = []
+        used_words = set()
+        used_choices = set()
         for _ in range(num_question):
+            max_loop = 100
+            question_word, correct_answer, synonym_set = \
+                self._pick_question_word(list_unique_words, used_words, cefr)
+            pos = self.get_pos({
+                "bool": {
+                    "must": [
+                        {"term": {"word.keyword": question_word.lower()}},
+                        {"term": {"synonyms.keyword": correct_answer.lower()}}
+                    ]
+                }
+            })
+            used_words.update([question_word, correct_answer])
             choices = [correct_answer]
+            while len(choices) < num_ans_per_question and max_loop > 0:
+                doc = self.get_random(self.INDEX, None, cefr=cefr, pos=pos)
+                if not doc:
+                    continue
+                candidate = doc["word"]
+                # Loại trừ điều kiện chung
+                if (
+                    candidate in used_choices or
+                    candidate in used_words or
+                    candidate in synonym_set or
+                    candidate == question_word or
+                    candidate == correct_answer
+                ):
+                    continue
+                # Loại distractor có nghĩa trùng với đáp án
+                syns = set(self.get_list_synonym(candidate))
+                if correct_answer in syns:
+                    continue
+                choices.append(candidate)
+                used_choices.add(candidate)
+                max_loop -= 1
             random.shuffle(choices)
                 "type": QuestionTypeEnum.SYNONYM,
                 "choices": choices,
                 "answer": choices.index(correct_answer),
+                "explain": []
             })
         return result
+    # -----------------------------------------------------
+    # Lấy tất cả synonym của 1 từ từ ES (nhiều nghĩa)
+    # -----------------------------------------------------
+    def get_list_synonym(self, word: str):
+        es = Elastic()
+        query = {"term": {"word.keyword": word.lower()}}
+        resp = es.search(index=self.INDEX, query=query, size=1000)
+        hits = resp["hits"]["hits"]
+        if not hits:
+            return []
+        synonyms = set()
+        for h in hits:
+            s = h["_source"].get("synonyms", [])
+            synonyms.update(s)
+        return list(synonyms)
+    # -----------------------------------------------------
+    # Lấy 1 từ làm câu hỏi và 1 synonym làm đáp án
+    # -----------------------------------------------------
+    def _pick_question_word(self, list_unique_words, used_words, cefr):
+        """
+        - Ưu tiên lấy từ danh sách đầu vào
+        - Nếu hết → lấy từ ES random theo CEFR
         """
+        # ƯU TIÊN INPUT LIST
+        while list_unique_words:
+            source = list_unique_words.pop()
+            if source in used_words:
+                continue
+            syns = self.get_list_synonym(source)
+            valid_syns = [s for s in syns if s not in used_words]
+            if not valid_syns:
+                continue
+            correct = random.choice(valid_syns)
+            return source, correct, set(syns)
+        # FALLBACK ES
+        while True:
+            doc = self.get_random(self.INDEX, None, cefr=cefr)
+            if not doc:
+                continue
+            source = doc["word"]
+            if source in used_words:
+                continue
+            syns = self.get_list_synonym(source)
+            valid_syns = [s for s in syns if s not in used_words]
+            if not valid_syns:
+                continue
+            return source, random.choice(valid_syns), set(syns)

src/loaders/elastic.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from elasticsearch import Elasticsearch
+from env import config
+import random
+class Elastic:
+    _instance = None
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = Elasticsearch(
+                config["elastic"]["url"],
+                api_key=config["elastic"]["api_key"]
+            )
+        return cls._instance