File size: 17,269 Bytes
300f197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
import json
import os
from datetime import datetime
from text_processor import TextProcessor
from keyword_extractor import KeywordExtractor
from question_generator import QuestionGenerator
from option_generator import OptionGenerator
from syllabus_processor import SyllabusProcessor

class ExamQuestionSystem:
    """Orchestrates the full exam-question generation pipeline.

    Wires together text preprocessing, keyword extraction, question
    generation, MCQ option generation, and (optionally) syllabus-driven
    topic-based generation.
    """

    def __init__(self, use_transformers=True):
        """Initialize the complete exam question generation system.

        Args:
            use_transformers (bool): Whether to use transformer models for
                question generation.
        """
        print("Initializing Exam Question Generation System...")
        self.text_processor = TextProcessor()
        self.keyword_extractor = KeywordExtractor()
        # Use rule-based generation by default for faster web deployment
        self.question_generator = QuestionGenerator(use_transformers=use_transformers)
        self.option_generator = OptionGenerator()
        self.syllabus_processor = SyllabusProcessor()
        print("System initialized successfully!")

    def process_text_file(self, file_path):
        """
        Process a text file and return its content.

        Args:
            file_path (str): Path to the text file

        Returns:
            str: File content

        Raises:
            Exception: If the file cannot be read; the original error is
                chained as ``__cause__`` for debugging.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise Exception(f"Error reading file {file_path}: {e}") from e

    def generate_exam_questions(self, input_text, max_questions=5, include_mcq=True, syllabus_text=None):
        """
        Complete pipeline to generate exam questions from input text.

        Args:
            input_text (str): Input text to generate questions from
            max_questions (int): Maximum number of questions to generate
            include_mcq (bool): Whether to include multiple choice options
            syllabus_text (str, optional): Syllabus text for topic-based question generation

        Returns:
            dict: Generated questions and metadata. On failure, a minimal
                payload with an ``error`` key in the metadata and a single
                ``type='error'`` question is returned instead of raising.
        """
        print("Starting question generation pipeline...")

        try:
            if not input_text or not input_text.strip():
                raise ValueError("Input text cannot be empty")

            # If syllabus text is provided, try syllabus-based generation
            # first; fall back to the standard pipeline if it yields nothing.
            if syllabus_text and syllabus_text.strip():
                syllabus_results = self._generate_syllabus_based_questions(
                    input_text, syllabus_text, max_questions, include_mcq)
                if syllabus_results and syllabus_results.get('questions'):
                    return syllabus_results
                print("Warning: Syllabus-based generation produced no questions. Falling back to standard generation.")

            print(f"Input text length: {len(input_text)} characters")

            # Step 1: Text preprocessing
            print("1. Processing and cleaning text...")
            processed_data = self.text_processor.preprocess_text(input_text)

            if not processed_data or 'sentences' not in processed_data or not processed_data['sentences']:
                raise ValueError("Failed to process input text into sentences")

            print(f"Extracted {len(processed_data['sentences'])} sentences from input")

            # Step 2: Extract keywords and important sentences
            print("2. Extracting keywords and important sentences...")
            key_concepts = self.keyword_extractor.extract_key_concepts(
                processed_data['cleaned_text'],
                processed_data['sentences'],
                top_n_sentences=max(10, max_questions)
            )

            # Guard against a falsy return before assigning into it below.
            if not key_concepts:
                key_concepts = {}
            if not key_concepts.get('important_sentences'):
                # If no important sentences found, use the first few sentences
                print("Warning: No important sentences found, using first few sentences")
                key_concepts['important_sentences'] = processed_data['sentences'][:max_questions]

            print(f"Found {len(key_concepts.get('important_sentences', []))} important sentences")

            # Prepare sentences and keywords for generation.  Items may be
            # (score, sentence[, keyword]) tuples or plain sentence strings.
            generation_inputs = []
            for item in key_concepts['important_sentences']:
                if isinstance(item, tuple) and len(item) >= 2:
                    sentence = item[1]
                    keyword = item[2] if len(item) > 2 else None
                    generation_inputs.append({'context': sentence, 'answer': keyword})
                elif isinstance(item, str):
                    generation_inputs.append({'context': item, 'answer': None})

            # Step 3: Generate questions
            print("3. Generating questions...")
            questions = []

            # Generate more questions than requested to ensure we have enough
            # valid ones and to cover all sections (MCQ, Short, Long)
            generation_target = max(max_questions * 2, 10)

            try:
                questions = self.question_generator.generate_multiple_questions(
                    generation_inputs,
                    generation_target
                )

                if not questions:
                    raise ValueError("No questions were generated")

                # Normalize every generated question to a dict with the
                # fields downstream code relies on.
                formatted_questions = []
                for i, q in enumerate(questions):
                    if isinstance(q, str):
                        formatted_questions.append({
                            'question': q,
                            'context': 'Generated from input text',
                            'score': 1.0,
                            'type': 'short_answer',
                            'id': f'q_{i+1}'
                        })
                    elif isinstance(q, dict):
                        # Ensure required fields exist
                        q['question'] = q.get('question', f'Question {i+1}')
                        q['context'] = q.get('context', 'No context provided')
                        q['score'] = q.get('score', 1.0)
                        q['type'] = q.get('type', 'short_answer')
                        q['id'] = q.get('id', f'q_{i+1}')
                        formatted_questions.append(q)

                questions = formatted_questions

                # Step 4: Generate MCQ options if requested
                if include_mcq and questions:
                    print("4. Generating multiple choice options...")
                    # Extract global keywords for distractors;
                    # keywords are (score, keyword) tuples.
                    global_keywords = [k[1] for k in key_concepts.get('keywords', [])]

                    for question_data in questions[:max_questions]:  # Limit to max_questions
                        try:
                            mcq_data = self.option_generator.create_mcq_options(
                                question_data['question'],
                                question_data['context'],
                                correct_answer=question_data.get('correct_answer'),
                                global_keywords=global_keywords
                            )
                            if mcq_data and 'options' in mcq_data and len(mcq_data['options']) >= 2:
                                question_data.update(mcq_data)
                                question_data['type'] = 'mcq'
                                print(f"Generated {len(mcq_data['options'])} options for question")
                            else:
                                print("Not enough options generated, keeping as short answer")
                        except Exception as e:
                            # Best-effort: a failed MCQ conversion keeps the
                            # question as short answer rather than aborting.
                            print(f"Error generating MCQ options: {str(e)}"
                                  " (continuing with short answer)")

            except Exception as e:
                import traceback
                print(f"Error in question generation: {str(e)}\n{traceback.format_exc()}")
                # Create fallback questions so the caller still gets a payload
                questions = [{
                    'question': f"Sample question {i+1} (error: {str(e)[:50]}...)",
                    'context': 'Error occurred during question generation',
                    'score': 0.0,
                    'type': 'error',
                    'id': f'error_{i}'
                } for i in range(min(3, max_questions))]

            # Compile results
            results = {
                'metadata': {
                    'input_word_count': processed_data.get('word_count', 0),
                    'input_sentence_count': len(processed_data.get('sentences', [])),
                    'questions_generated': len(questions),
                    'keywords_extracted': len(key_concepts.get('keywords', [])),
                    'named_entities': len(key_concepts.get('named_entities', []))
                },
                'keywords': key_concepts.get('keywords', [])[:10],
                'named_entities': key_concepts.get('named_entities', [])[:10],
                'questions': questions[:max_questions]  # Ensure we don't return more than requested
            }

            print(f"Successfully generated {len(results['questions'])} questions")
            return results

        except Exception as e:
            import traceback
            error_msg = f"Error in generate_exam_questions: {str(e)}\n{traceback.format_exc()}"
            print(error_msg)

            # Return a minimal response with error information
            return {
                'metadata': {
                    'error': str(e),
                    'input_length': len(input_text) if input_text else 0,
                    'questions_generated': 0
                },
                'keywords': [],
                'named_entities': [],
                'questions': [{
                    'question': f"Error generating questions: {str(e)[:100]}",
                    'context': 'An error occurred during question generation',
                    'score': 0.0,
                    'type': 'error',
                    'id': 'error_0'
                }]
            }

    def save_questions_to_json(self, questions_data, output_file):
        """
        Save generated questions to a JSON file.

        Args:
            questions_data (dict): Generated questions data
            output_file (str): Output file path
        """
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                json.dump(questions_data, file, indent=2, ensure_ascii=False)
            print(f"βœ… Questions saved to {output_file}")
        except Exception as e:
            print(f"❌ Error saving to file: {e}")

    def display_questions_console(self, questions_data):
        """
        Display generated questions in a formatted console output.

        Uses ``.get`` with defaults throughout because syllabus-based and
        error payloads carry a different metadata schema than the standard
        pipeline output.

        Args:
            questions_data (dict): Generated questions data
        """
        print("\n" + "="*80)
        print("GENERATED EXAM QUESTIONS")
        print("="*80)

        # Display metadata (keys may be absent for syllabus/error payloads)
        metadata = questions_data.get('metadata', {})
        print(f"\nπŸ“Š STATISTICS:")
        print(f"   β€’ Input text: {metadata.get('input_word_count', 0)} words, {metadata.get('input_sentence_count', 0)} sentences")
        print(f"   β€’ Keywords extracted: {metadata.get('keywords_extracted', 0)}")
        print(f"   β€’ Named entities found: {metadata.get('named_entities', 0)}")
        print(f"   β€’ Questions generated: {metadata.get('questions_generated', 0)}")

        # Display top keywords (stored as (score, keyword) tuples)
        print(f"\nπŸ”‘ TOP KEYWORDS:")
        for score, keyword in questions_data.get('keywords', [])[:5]:
            print(f"   β€’ {keyword} (score: {score:.2f})")

        # Display questions
        print(f"\n❓ QUESTIONS:")
        for i, q in enumerate(questions_data.get('questions', []), 1):
            print(f"\n{i}. {q.get('question', '')}")

            if 'options' in q:
                print("   Options:")
                correct_index = q.get('correct_index')
                for j, option in enumerate(q['options'], 1):
                    marker = "βœ“" if j - 1 == correct_index else " "
                    print(f"   {marker} {chr(64+j)}. {option}")

            print(f"   Context: {q.get('context', '')[:100]}...")
            print(f"   Confidence: {q.get('score', 0.0):.2f}")

        print("\n" + "="*80)

    def _generate_syllabus_based_questions(self, content_text, syllabus_text, max_questions=10, include_mcq=True):
        """
        Generate questions based on syllabus topics.

        Args:
            content_text (str): The content text to generate questions from
            syllabus_text (str): The syllabus text with units and topics
            max_questions (int): Maximum number of questions to generate
            include_mcq (bool): Whether to include multiple choice options

        Returns:
            dict or None: Generated questions and metadata, or None on
                failure so the caller can fall back to standard generation.
        """
        print("Generating syllabus-based questions...")

        try:
            # Generate questions by topic
            questions_by_topic = self.syllabus_processor.generate_topic_based_questions(
                syllabus_text=syllabus_text,
                content_text=content_text,
                questions_per_topic=3  # Will be adjusted based on max_questions
            )

            # Flatten questions from all topics, tagging each with its topic
            all_questions = []
            for topic, questions in questions_by_topic.items():
                for q in questions:
                    q['topic'] = topic
                    all_questions.append(q)

            # Limit to max_questions
            all_questions = all_questions[:max_questions]

            # Generate options for MCQs if needed
            if include_mcq:
                for question in all_questions:
                    if 'options' not in question and 'context' in question:
                        try:
                            mcq_data = self.option_generator.create_mcq_options(
                                question['question'],
                                question['context'],
                                num_options=4
                            )
                            if mcq_data and 'options' in mcq_data and len(mcq_data['options']) >= 2:
                                question.update(mcq_data)
                        except Exception as e:
                            # Best-effort: keep the question without options.
                            print(f"Error generating options: {e}")

            # Prepare results
            results = {
                'metadata': {
                    'total_questions': len(all_questions),
                    'topics_covered': list(questions_by_topic.keys()),
                    'generated_at': str(datetime.now())
                },
                'questions': all_questions
            }

            return results

        except Exception as e:
            # Return None instead of re-raising so generate_exam_questions
            # can fall back to the standard pipeline as documented.
            print(f"Error in syllabus-based question generation: {e}")
            return None

# Example usage and testing
if __name__ == "__main__":
    # Sample passage used to exercise the full pipeline end to end.
    sample_text = """
    Artificial Intelligence (AI) is a branch of computer science that aims to create intelligent machines 
    that work and react like humans. Machine learning is a subset of AI that provides systems the ability 
    to automatically learn and improve from experience without being explicitly programmed. Deep learning 
    is a subset of machine learning that uses neural networks with three or more layers. These neural 
    networks attempt to simulate the behavior of the human brain to learn from large amounts of data. 
    Python is one of the most popular programming languages for AI development due to its simplicity 
    and extensive libraries like TensorFlow and PyTorch.
    """

    try:
        # Build the system, generate a small question set, then show and
        # persist the results.
        exam_system = ExamQuestionSystem()
        generated = exam_system.generate_exam_questions(sample_text, max_questions=3)
        exam_system.display_questions_console(generated)
        exam_system.save_questions_to_json(generated, "sample_questions.json")
    except Exception as e:
        print(f"❌ Error: {e}")