File size: 11,821 Bytes
300f197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
import random
import re

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

class OptionGenerator:
    """Build multiple-choice options (correct answer plus distractors).

    Distractors come from three sources, tried in this order: WordNet lemmas
    related to the correct answer, caller-supplied document keywords, and a
    fixed list of generic answers.
    """

    # NLTK data packages requested every time an instance is constructed.
    _NLTK_RESOURCES = (
        'punkt',
        'averaged_perceptron_tagger',
        'wordnet',
        'stopwords',
        'universal_tagset',
        'tagsets',
    )

    # (question prefix, verb to locate in the context) pairs used by
    # extract_answer_from_context.
    _QUESTION_PATTERNS = (
        ('what is', 'is'),
        ('what are', 'are'),
        ('what was', 'was'),
        ('what were', 'were'),
        ('who is', 'is'),
        ('who are', 'are'),
        ('who was', 'was'),
        ('who were', 'were'),
        ('where is', 'is'),
        ('where are', 'are'),
        ('when is', 'is'),
        ('when was', 'was'),
    )

    # Last-resort distractors, consumed in this order.
    _GENERIC_DISTRACTORS = (
        'True', 'False', 'Yes', 'No', 'Maybe', 'Always', 'Never',
        'Sometimes', 'Often', 'Rarely', 'All of the above', 'None of the above',
    )

    # Number of placeholder options used when num_options is not usable.
    _DEFAULT_OPTION_COUNT = 4

    def __init__(self):
        """Initialize the option generator with NLTK resources.

        Requests the NLTK data packages listed in ``_NLTK_RESOURCES``, then
        loads the English stop-word list and a WordNet lemmatizer.

        Raises:
            Exception: Any error raised while downloading or loading the
                resources is printed and then re-raised unchanged.
        """
        try:
            for resource in self._NLTK_RESOURCES:
                nltk.download(resource, quiet=True)

            self.stop_words = set(nltk.corpus.stopwords.words('english'))
            self.word_net_lemmatizer = nltk.WordNetLemmatizer()

            # Penn Treebank POS tag -> WordNet POS letter.
            self.pos_mapping = {
                'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
                'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
                'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
                'RB': 'r', 'RBR': 'r', 'RBS': 'r'
            }

        except Exception as e:
            print(f"Error initializing OptionGenerator: {str(e)}")
            raise

    @staticmethod
    def _lemma_names(synsets, exclude, single_word_only):
        """Yield lowercased lemma names from *synsets*.

        Underscores in lemma names are replaced by spaces. Names equal to
        *exclude* (expected to be lowercase) are skipped, as are multi-word
        names when *single_word_only* is true.
        """
        for syn in synsets:
            for lemma in syn.lemmas():
                name = lemma.name().replace('_', ' ').lower()
                if name == exclude:
                    continue
                if single_word_only and len(name.split()) != 1:
                    continue
                yield name

    @staticmethod
    def _add_unique(collected, seen, candidate):
        """Append *candidate* to *collected* unless it is a case-insensitive duplicate.

        *seen* holds the stripped, lowercased keys already taken (including
        the correct answer). Non-string and blank candidates are ignored.

        Returns:
            bool: True when the candidate was appended.
        """
        if not isinstance(candidate, str):
            return False
        key = candidate.strip().lower()
        if not key or key in seen:
            return False
        seen.add(key)
        collected.append(candidate)
        return True

    @classmethod
    def _fallback_options(cls, num_options=4):
        """Return placeholder options ('Option A', 'Option B', ...).

        The number of placeholders follows *num_options* (at least one); a
        non-integer value falls back to four. The first placeholder is
        reported as the correct answer.
        """
        if isinstance(num_options, int):
            count = max(1, num_options)
        else:
            count = cls._DEFAULT_OPTION_COUNT
        options = [f"Option {chr(65 + i)}" for i in range(count)]
        return {
            'options': options,
            'correct_index': 0,
            'correct_answer': options[0]
        }

    def _get_synonyms(self, word, pos=None):
        """Get single-word synonyms for a word using WordNet.

        Args:
            word (str): Word to look up.
            pos (str, optional): Penn Treebank tag; translated through
                ``self.pos_mapping`` to restrict the WordNet lookup.

        Returns:
            list: At most 10 lowercase synonyms, in the order WordNet
                returned them. Empty for short words, stop words, or when
                the lookup fails (the error is printed).
        """
        # Skip if word is missing, too short or a stop word
        if not word or len(word) < 3 or word.lower() in self.stop_words:
            return []

        # Lemma names are lowercased, so compare against the lowercased word;
        # otherwise a capitalized word would be returned as its own synonym.
        target = word.lower()
        # dict used as an insertion-ordered set so results are deterministic.
        synonyms = {}

        try:
            wordnet_pos = self.pos_mapping.get(pos) if pos else None

            # Try with the provided POS tag first
            if wordnet_pos:
                for name in self._lemma_names(
                        wordnet.synsets(word, pos=wordnet_pos), target, True):
                    synonyms.setdefault(name)

            # If no synonyms found, try without POS tag
            if not synonyms:
                for name in self._lemma_names(wordnet.synsets(word), target, True):
                    synonyms.setdefault(name)

            # If still no synonyms, try the verb's base form
            if not synonyms and pos and pos.startswith('VB'):
                base = self.word_net_lemmatizer.lemmatize(word, pos='v')
                if base != word:
                    for name in self._lemma_names(
                            wordnet.synsets(base, pos='v'), target, True):
                        synonyms.setdefault(name)

        except Exception as e:
            # Best effort: a failed lookup yields whatever was collected so far.
            print(f"Error getting synonyms for '{word}': {str(e)}")

        return list(synonyms)[:10]  # Return at most 10 synonyms

    def _get_distractors(self, word, pos=None, num=3):
        """Generate distractors for a given word.

        Single-word synonyms are used first; if fewer than *num* were found,
        every other lemma name (multi-word ones included) from the word's
        synsets is considered.

        Args:
            word (str): Word to generate distractors for.
            pos (str, optional): Penn Treebank tag forwarded to the lookups.
            num (int): Maximum number of distractors to return.

        Returns:
            list: Up to *num* unique lowercase distractors, never the word
                itself (compared case-insensitively). Empty on failure.
        """
        distractors = []

        try:
            if num <= 0:
                return []

            seen = {word.strip().lower()}

            # Get synonyms first
            for synonym in self._get_synonyms(word, pos)[:num]:
                self._add_unique(distractors, seen, synonym)

            # If not enough synonyms, add similar words
            if len(distractors) < num:
                wordnet_pos = self.pos_mapping.get(pos) if pos else None
                related = self._lemma_names(
                    wordnet.synsets(word, pos=wordnet_pos), word.lower(), False)
                for name in related:
                    self._add_unique(distractors, seen, name)
                    if len(distractors) >= num:
                        break
        except Exception as e:
            # Best effort: return whatever was collected before the failure.
            print(f"Error generating distractors for '{word}': {str(e)}")

        return distractors[:num]

    def extract_answer_from_context(self, question, context):
        """
        Extract the most likely answer from the context based on the question.
        This version uses simple string matching instead of POS tagging.

        For questions starting with one of the known prefixes ("what is",
        "who was", ...), the verb is located in the context as a whole word
        and the phrase following it (up to the first comma or period) is the
        candidate answer. If that phrase is empty or already appears in the
        question, the phrase just before the verb is returned instead.
        Otherwise a series of fallbacks picks the first suitable word of the
        context.

        Args:
            question (str): Generated question
            context (str): Source sentence/context

        Returns:
            str: Extracted answer, or "Unknown" when the context is empty,
                whitespace-only or not a string
        """
        if not isinstance(context, str) or not context.strip():
            return "Unknown"

        words = context.split()

        try:
            q_lower = (question or "").lower()

            # Try to find a direct answer using common patterns
            for q_pattern, verb in self._QUESTION_PATTERNS:
                if not q_lower.startswith(q_pattern):
                    continue

                # Word boundaries keep "is" from matching inside "Paris".
                match = re.search(rf'\b{verb}\b', context, flags=re.IGNORECASE)
                if match is None:
                    continue

                # Phrase after the verb, cut at the first comma or period
                after = context[match.end():].strip(' ,.?!')
                answer = after.split(',')[0].split('.')[0].strip()

                # Last clause before the verb (the sentence subject)
                before = context[:match.start()].strip(' ,.?!')
                subject = re.split(r'[,.]', before)[-1].strip() if before else ""

                # "X is <text of the question>": the answer is X, not the
                # restated question.
                if subject and (not answer or answer.lower() in q_lower):
                    return subject
                if answer:
                    return answer

            # Strip punctuation before comparing, so "France," is recognized
            # as a word that already occurs in the question.
            tokens = [t for t in (w.strip(',.!?;:') for w in words) if t]

            # Fallback: return the first capitalized word not in the question
            for token in tokens:
                lowered = token.lower()
                if (len(token) > 2 and token[0].isupper() and
                        lowered not in q_lower and
                        lowered not in self.stop_words):
                    return token

            # Last resort: return the first longer word not in the question
            for token in tokens:
                lowered = token.lower()
                if (len(token) > 3 and lowered not in q_lower and
                        lowered not in self.stop_words):
                    return token

            # If all else fails, return the first word that's not a stop word
            for token in tokens:
                if token.lower() not in self.stop_words and len(token) > 2:
                    return token

            # Final fallback
            return tokens[0] if tokens else words[0]

        except Exception as e:
            print(f"Error extracting answer: {str(e)}")
            # Return the first word as fallback (words is non-empty here)
            return words[0]

    def create_mcq_options(self, question, context, num_options=4, correct_answer=None, global_keywords=None):
        """
        Create multiple choice options for a given question and context.

        Args:
            question (str): The question text
            context (str): The context from which the question was generated
            num_options (int): Number of options to generate (including correct answer)
            correct_answer (str, optional): The correct answer if known
            global_keywords (list, optional): List of keywords from the entire document to use as distractors

        Returns:
            dict: Dictionary with keys 'options' (shuffled list),
                'correct_index' (position of the correct answer in that list)
                and 'correct_answer'. When no answer can be determined or an
                error occurs, placeholder options ('Option A', ...) are
                returned with index 0 marked correct.
        """
        try:
            # Extract the correct answer from context if not provided
            if not correct_answer:
                correct_answer = self.extract_answer_from_context(question, context)

            # If we couldn't extract a good answer, use a fallback
            if not correct_answer or correct_answer == "Unknown":
                return self._fallback_options(num_options)

            # Never negative, so the final slice cannot wrap around.
            needed = max(num_options - 1, 0)

            # Case-insensitive keys already used; seeded with the answer so
            # no distractor can duplicate it.
            seen = {correct_answer.strip().lower()}
            distractors = []

            # Generate more WordNet distractors than needed to allow filtering
            wordnet_candidates = self._get_distractors(
                correct_answer,
                num=min(10, num_options * 2)
            )
            for candidate in wordnet_candidates:
                self._add_unique(distractors, seen, candidate)

            # If we don't have enough distractors, try using global keywords
            if len(distractors) < needed and global_keywords:
                # Shuffle a copy so the caller's list is left untouched
                pool = list(global_keywords)
                random.shuffle(pool)

                for keyword in pool:
                    self._add_unique(distractors, seen, keyword)
                    if len(distractors) >= num_options + 2:  # Get a few extra
                        break

            # If we still don't have enough distractors, add some generic ones
            for generic in self._GENERIC_DISTRACTORS:
                if len(distractors) >= needed:
                    break
                self._add_unique(distractors, seen, generic)

            # Select the final set of options
            options = [correct_answer] + distractors[:needed]
            random.shuffle(options)

            return {
                'options': options,
                'correct_index': options.index(correct_answer),
                'correct_answer': correct_answer
            }

        except Exception as e:
            print(f"Error generating options: {str(e)}")
            # Fallback options
            return self._fallback_options(num_options)

# Demo: build options for one sample question and print them, marking the
# option at the reported correct index.
if __name__ == "__main__":
    generator = OptionGenerator()

    sample_question = "What is the capital of France?"
    sample_context = "Paris is the capital of France, known for its art, fashion, and culture."

    print(f"Question: {sample_question}")
    print(f"Context: {sample_context}")

    result = generator.create_mcq_options(sample_question, sample_context)
    print("\nOptions:")
    for position, choice in enumerate(result['options']):
        # Tick the correct option, pad the others with a space
        if position == result['correct_index']:
            tick = "✓"
        else:
            tick = " "
        letter = chr(65 + position)  # 0 -> 'A', 1 -> 'B', ...
        print(f"{tick} {letter}. {choice}")