Omnamdev02 committed on
Commit
a0d93f7
·
unverified ·
1 Parent(s): a9d0548

Delete question_generator_old.py

Browse files
Files changed (1) hide show
  1. question_generator_old.py +0 -600
question_generator_old.py DELETED
@@ -1,600 +0,0 @@
1
- import re
2
- import random
3
- import os
4
- import sys
5
- import nltk
6
- from nltk.tokenize import sent_tokenize, word_tokenize
7
- from nltk.corpus import stopwords
8
- from nltk.probability import FreqDist
9
- from nltk.tag import pos_tag
10
- from collections import defaultdict
11
- import torch
12
- from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
13
- import numpy as np
14
-
15
- # Simple NLTK data setup
16
def setup_nltk():
    """Check that the NLTK resources used by this module are usable.

    The tokenizers, POS tagger and stopword corpus are probed first. If a
    probe raises ``LookupError`` (resource not installed), the required
    packages are downloaded quietly.

    Note: a later definition with the same name in this module rebinds
    ``setup_nltk``; this version is the one in effect until that point.

    Returns:
        bool: True when every probe succeeds, or when every download
        reports success; False when a download fails or raises.
    """
    try:
        # Each call raises LookupError when its backing data is missing.
        sent_tokenize("Test")
        word_tokenize("Test")
        pos_tag(["test"])
        stopwords.words('english')
        return True
    except LookupError:
        packages = (
            'punkt',
            'stopwords',
            'averaged_perceptron_tagger',
            'wordnet',
            'omw-1.4',
        )
        try:
            # nltk.download() returns a success flag; attempt every package
            # before judging, so one failure does not skip the rest.
            results = [nltk.download(package, quiet=True) for package in packages]
            return all(results)
        except Exception:
            # Narrowed from a bare ``except`` so Ctrl-C / SystemExit still
            # propagate; any ordinary failure is reported as False.
            return False
35
-
36
- # Initialize NLTK
37
# Run the availability check once at import time. A failure only produces a
# warning so the module can still be imported without the NLTK data.
_nltk_ready = setup_nltk()
if not _nltk_ready:
    print("Warning: Could not initialize NLTK. Some features may not work properly.")
39
-
40
- # Set up NLTK data path
41
def setup_nltk():
    """Ensure the NLTK data lives in ``./nltk_data`` and smoke-test it.

    A ``nltk_data`` directory under the current working directory is created
    (if needed) and appended to ``nltk.data.path``. Each required resource is
    then looked up and downloaded into that directory when missing. Finally
    the tokenizers, POS tagger and stopword list are exercised once.

    Returns:
        bool: True when setup and the smoke test complete; False when any
        step raises, after printing manual download instructions.
    """
    # (package, resource path, "present" message, "fetching" message, "fetched" message)
    resources = (
        ('punkt', 'tokenizers/punkt',
         "✓ punkt tokenizer is already available",
         "Downloading punkt tokenizer...",
         "✓ Downloaded punkt tokenizer"),
        ('stopwords', 'corpora/stopwords',
         "✓ Stopwords are already available",
         "Downloading stopwords...",
         "✓ Downloaded stopwords"),
        ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger',
         "✓ POS tagger is already available",
         "Downloading POS tagger...",
         "✓ Downloaded POS tagger"),
        ('wordnet', 'corpora/wordnet',
         "✓ WordNet is already available",
         "Downloading WordNet...",
         "✓ Downloaded WordNet"),
        ('omw-1.4', 'corpora/omw-1.4',
         "✓ OMW-1.4 is already available",
         "Downloading OMW-1.4...",
         "✓ Downloaded OMW-1.4"),
    )

    try:
        # Keep the data next to the application instead of the user profile.
        data_dir = os.path.join(os.getcwd(), 'nltk_data')
        os.makedirs(data_dir, exist_ok=True)
        nltk.data.path.append(data_dir)

        print("\n=== Downloading NLTK Data ===")
        for package, resource, present_msg, fetching_msg, fetched_msg in resources:
            try:
                nltk.data.find(resource)
                print(present_msg)
            except LookupError:
                print(fetching_msg)
                nltk.download(package, download_dir=data_dir)
                print(fetched_msg)

        # Smoke test: any missing piece surfaces here as an exception.
        print("\n=== Testing NLTK Components ===")
        sent_tokenize("This is a test.")
        word_tokenize("This is a test.")
        pos_tag(["test", "this", "is", "a", "sentence"])
        stopwords.words('english')

        print("\n=== NLTK Setup Completed Successfully ===\n")
        return True

    except Exception as err:
        print(f"\n⚠ Error during NLTK setup: {str(err)}")
        manual_steps = (
            "\nPlease try running these commands manually in a Python shell:",
            "import nltk",
            "nltk.download('punkt')",
            "nltk.download('stopwords')",
            "nltk.download('averaged_perceptron_tagger')",
            "nltk.download('wordnet')",
            "nltk.download('omw-1.4')\n",
        )
        for line in manual_steps:
            print(line)
        return False
116
-
117
- # Initialize NLTK
118
# Second initialization pass; a failure is reported but does not stop the import.
if not setup_nltk():
    print(
        "Failed to initialize NLTK. Some features may not work properly.",
        "Trying to continue with limited functionality...\n",
        sep="\n",
    )
121
def download_nltk_data():
    """Download any missing NLTK resources into ``./nltk_data``.

    For each required package the resource is looked up first. When the
    lookup fails the package is downloaded and the lookup is repeated to
    verify the download. Failures are reported on stdout and never raised,
    so a missing package degrades functionality instead of aborting import.
    """
    nltk_data = os.path.join(os.getcwd(), 'nltk_data')
    os.makedirs(nltk_data, exist_ok=True)
    if nltk_data not in nltk.data.path:
        nltk.data.path.append(nltk_data)

    # (package name passed to nltk.download, resource path for nltk.data.find)
    required = (
        ('punkt', 'tokenizers/punkt'),
        ('stopwords', 'corpora/stopwords'),
        ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger'),
        ('wordnet', 'corpora/wordnet'),
        ('omw-1.4', 'corpora/omw-1.4'),
    )

    for package, path in required:
        try:
            nltk.data.find(path)
            print(f"✓ {package} is already downloaded")
        except LookupError:
            print(f"Downloading {package}...")
            try:
                nltk.download(package, download_dir=nltk_data, quiet=False)
                # Verify download
                try:
                    nltk.data.find(path)
                    print(f"✓ Successfully downloaded {package}")
                except LookupError:
                    print(f"⚠ Warning: {package} download verification failed")
            except Exception as e:
                print(f"⚠ Error downloading {package}: {str(e)}")
                if package == 'averaged_perceptron_tagger':
                    print("⚠ This is a critical package. The application may not work properly.")

    print("\n=== NLTK Data Setup Complete ===\n")


# Initialize NLTK data
download_nltk_data()
142
-
143
- # Initialize NLTK components
144
# Exercise each NLTK component once at import time so a missing resource is
# reported here rather than in the middle of question generation.
try:
    from nltk.tag import pos_tag

    for _probe, _sample in (
        (sent_tokenize, "Initializing..."),
        (word_tokenize, "Initializing..."),
        (pos_tag, ["test"]),
    ):
        _probe(_sample)

    # Stopword corpus
    stopwords.words('english')

    print("✓ NLTK components initialized successfully")
except Exception as err:
    print(f"⚠ Error initializing NLTK components: {str(err)}")
159
-
160
class QuestionGenerator:
    """Generate questions from free text.

    Two strategies are combined: a Hugging Face ``text2text-generation``
    pipeline (used when it loads successfully) and rule-based generation
    driven by NLTK tokenisation / POS tags and keyword templates.

    The class relies on names imported at module level: ``re``, ``nltk``,
    ``torch``, ``pipeline``, ``sent_tokenize``, ``word_tokenize``,
    ``stopwords``, ``FreqDist`` and ``defaultdict``.
    """

    # Rule templates, tried in order. Each entry is
    # (keywords, uses_subject, pattern). A template applies when any keyword
    # occurs as a substring of the lowercased sentence; an empty keyword
    # tuple always applies. ``uses_subject`` selects _extract_main_subject
    # instead of _extract_predicate to fill the pattern.
    _RULE_TEMPLATES = (
        (('is', 'are', 'means', 'refers', 'definition', 'concept'), False, "What {}?"),
        (('definition', 'meaning', 'concept', 'term'), True, "Define {}."),
        (('process', 'method', 'way', 'procedure', 'algorithm'), False, "How {}?"),
        (('because', 'reason', 'cause', 'purpose', 'important'), False, "Why {}?"),
        (('year', 'century', 'time', 'date', 'period', 'era'), False, "When {}?"),
        (('place', 'location', 'country', 'city', 'region'), False, "Where {}?"),
        (('person', 'people', 'scientist', 'author', 'researcher'), False, "Who {}?"),
        (('method', 'process', 'way', 'how'), False, "How {}?"),
        (('reason', 'because', 'cause', 'why'), False, "Why {}?"),
        ((), True, "What can you tell me about {}?"),
    )

    # Words that mark a string as a question in is_valid_question.
    _QUESTION_WORDS = frozenset([
        'what', 'who', 'when', 'where', 'why', 'how', 'which', 'is', 'are',
        'do', 'does', 'did', 'can', 'could', 'would', 'should',
    ])

    _EDGE_PUNCTUATION = '.,!?;:'

    def __init__(self, model_name="deepset/roberta-base-squad2", use_transformers=True):
        """
        Initialize the question generator with improved context understanding.

        Args:
            model_name (str): Name of the pre-trained model to use. Accepted
                for interface compatibility; the pipeline below always loads
                ``valhalla/t5-base-qa-qg-hl``.
            use_transformers (bool): Whether to use transformer models for better quality
        """
        print("Initializing question generator with enhanced context understanding...")
        self.use_transformers = use_transformers
        self.stop_words = set(stopwords.words('english'))
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Populated only by _load_t5_model(); defined here so that
        # _generate_with_transformers can test them safely.
        self.model = None
        self.tokenizer = None

        # The rule tables are needed in both modes: _extract_key_phrases
        # reads important_pos_tags even when the transformer path is active.
        self._init_rule_based_system()

        if use_transformers:
            try:
                print("Loading question generation model...")
                self.qg_model = pipeline("text2text-generation",
                                         model="valhalla/t5-base-qa-qg-hl",
                                         device=0 if self.device == 'cuda' else -1)
                print("Question generation model loaded successfully!")
            except Exception as e:
                print(f"Error loading transformer model: {str(e)}")
                print("Falling back to rule-based generation.")
                self.use_transformers = False

        if not self.use_transformers:
            print("Using rule-based question generation.")

        print("Question generator initialized successfully!")

    def _init_rule_based_system(self):
        """Initialize the rule-based question generation system."""
        self.wh_words = ['what', 'when', 'where', 'who', 'whom', 'whose', 'which', 'why', 'how']
        self.aux_verbs = ['is', 'are', 'was', 'were', 'do', 'does', 'did', 'have', 'has', 'had', 'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must']
        self.important_pos_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VBG', 'VBN', 'JJ', 'JJR', 'JJS'}

    def _load_t5_model(self):
        """Load the ``t5-base`` model and tokenizer used by _generate_with_transformers.

        This is not called by ``__init__``; call it explicitly before using
        ``_generate_with_transformers`` with a real model. On success it sets
        ``self.tokenizer``, ``self.model``, moves the model to CPU and sets
        ``self.use_transformers`` to True. On failure it sets
        ``self.use_transformers`` to False and clears any half-loaded state.

        Returns:
            bool: True when the model was loaded, False otherwise.
        """
        try:
            print("Attempting to load T5 model... This may take a few minutes on first run.")

            from transformers import T5ForConditionalGeneration, T5Tokenizer

            model_name = "t5-base"

            print(f"Loading {model_name} model...")
            self.tokenizer = T5Tokenizer.from_pretrained(model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(model_name)

            # Use CPU for more reliable deployment (avoid CUDA issues)
            self.device = torch.device("cpu")
            self.model.to(self.device)
            self.model.eval()  # Set to evaluation mode

            self.use_transformers = True
            print(f"T5 model loaded successfully on {self.device}")
            return True

        except ImportError as e:
            print(f"Transformers library not installed: {e}")
            print("Install with: pip install transformers torch")
            self.use_transformers = False
        except Exception as e:
            print(f"Failed to load T5 model: {e}")
            print("Falling back to rule-based generation.")
            self.use_transformers = False

        # Do not leave a tokenizer without a model (or vice versa) behind.
        self.model = None
        self.tokenizer = None
        return False

    def _extract_key_phrases(self, text):
        """Extract key phrases from text based on POS tagging.

        A key phrase is a run of at least two consecutive tokens whose POS
        tag is in ``self.important_pos_tags``. Phrases are lowercased and
        returned without duplicates, in order of first appearance.
        """
        tagged = nltk.pos_tag(word_tokenize(text))

        phrases = []
        run = []
        for word, tag in tagged:
            if tag in self.important_pos_tags:
                run.append(word.lower())
                continue
            if len(run) > 1:  # Only consider phrases with at least 2 words
                phrases.append(' '.join(run))
            run = []

        # Flush a run that reaches the end of the text; without this the
        # final phrase of the input is silently lost.
        if len(run) > 1:
            phrases.append(' '.join(run))

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(phrases))

    def generate_question_from_sentence(self, sentence):
        """Generate a question from a given sentence using rule-based approach.

        The first verb that has a noun somewhere before it yields
        ``"What <verb> <noun..verb span>?"``. Failing that, the first noun
        yields ``"What is <noun>?"``. Otherwise a generic question quoting
        the first 50 characters of the sentence is returned.
        """
        tagged = nltk.pos_tag(word_tokenize(sentence))

        for verb_idx, (verb, tag) in enumerate(tagged):
            if not tag.startswith('VB'):
                continue
            # Walk backwards from the verb to the nearest noun.
            for noun_idx in range(verb_idx - 1, -1, -1):
                if tagged[noun_idx][1].startswith('NN'):
                    subject = ' '.join(word for word, _ in tagged[noun_idx:verb_idx])
                    return f"What {verb} {subject}?"

        # Fallback: create a what question about the first noun
        for word, tag in tagged:
            if tag.startswith('NN'):
                return f"What is {word}?"

        # Final fallback
        return f"What is the main idea of: {sentence[:50]}...?"

    def _analyze_text_structure(self, text):
        """Analyze text structure to identify important concepts and relationships.

        Returns:
            dict: ``sentences`` (list of str), ``key_phrases`` (list of str),
            ``top_terms`` (up to 10 most frequent alphanumeric non-stopword
            tokens) and ``concept_map`` (see _build_concept_map).
        """
        sentences = sent_tokenize(text)
        key_phrases = self._extract_key_phrases(text)

        content_words = [
            token.lower()
            for token in word_tokenize(text)
            if token.isalnum() and token.lower() not in self.stop_words
        ]
        freq_dist = FreqDist(content_words)

        return {
            'sentences': sentences,
            'key_phrases': key_phrases,
            'top_terms': [word for word, _ in freq_dist.most_common(10)],
            'concept_map': self._build_concept_map(sentences, key_phrases)
        }

    def _build_concept_map(self, sentences, key_phrases):
        """Build a simple concept map showing relationships between key phrases.

        Two phrases are related when both occur (case-insensitively, as
        substrings) in the same sentence.

        Returns:
            dict: phrase -> sorted list of related phrases. A phrase appears
            as a key only if it occurs in at least one sentence.
        """
        lowered_sentences = [sentence.lower() for sentence in sentences]
        related = defaultdict(set)

        for phrase in key_phrases:
            for sentence in lowered_sentences:
                if phrase in sentence:
                    related[phrase].update(
                        other for other in key_phrases
                        if other != phrase and other in sentence
                    )

        # Sorted so the result does not depend on set iteration order.
        return {phrase: sorted(others) for phrase, others in related.items()}

    def generate_questions(self, text, num_questions=5, context_window=3):
        """
        Generate meaningful questions from the given text with better context understanding.

        Args:
            text (str): Input text to generate questions from
            num_questions (int): Number of questions to generate
            context_window (int): Number of sentences to consider as context

        Returns:
            list: List of dicts with keys ``question``, ``context`` and
            ``type`` (``comprehension``, ``factual`` or ``conceptual``)
        """
        if not text or not text.strip():
            return []

        # Analyze the text structure first
        analysis = self._analyze_text_structure(text)
        sentences = analysis['sentences']

        if not sentences:
            return []

        # range() rejects a zero step and a negative one would yield nothing.
        context_window = max(1, context_window)
        questions = []

        if self.use_transformers and hasattr(self, 'qg_model'):
            # Use transformer-based generation for better quality
            for start in range(0, len(sentences), context_window):
                if len(questions) >= num_questions:
                    break
                context = ' '.join(sentences[start:start + context_window])
                try:
                    generated = self.qg_model(context, max_length=128, num_return_sequences=1)
                    if generated and len(generated) > 0:
                        question = generated[0]['generated_text'].strip()
                        if question and question[-1] != '?':
                            question += '?'
                        questions.append({
                            'question': question,
                            'context': context,
                            'type': 'comprehension'
                        })
                except Exception as e:
                    print(f"Error in transformer-based generation: {str(e)}")
                    continue

        # Fallback to rule-based generation if needed
        if len(questions) < num_questions:
            for i, sentence in enumerate(sentences):
                if len(sentence.split()) < 5:  # Skip very short sentences
                    continue

                question = self.generate_question_from_sentence(sentence)

                # Get context (previous and next sentences)
                start = max(0, i - 1)
                end = min(len(sentences), i + 2)
                context = ' '.join(sentences[start:end])

                questions.append({
                    'question': question,
                    'context': context,
                    'type': 'factual'
                })

                if len(questions) >= num_questions:
                    break

        # Top up with conceptual questions based on key phrases
        if len(questions) < num_questions:
            for phrase in analysis['key_phrases'][:num_questions - len(questions)]:
                questions.append({
                    'question': f"Explain the concept of {phrase} in detail.",
                    'context': f"The concept of {phrase} is important in this context.",
                    'type': 'conceptual'
                })

        return questions[:num_questions]

    def _generate_with_transformers(self, sentence, max_length):
        """Generate question using T5 model.

        Falls back to _generate_with_rules when transformers are disabled,
        when no model/tokenizer has been loaded (see _load_t5_model), when
        the generated text looks too poor, or when generation raises.
        """
        model = getattr(self, 'model', None)
        tokenizer = getattr(self, 'tokenizer', None)
        if not self.use_transformers or model is None or tokenizer is None:
            return self._generate_with_rules(sentence)

        try:
            # Prepare input for T5 model
            input_text = f"generate question: {sentence[:300]}"  # Limit input length

            inputs = tokenizer.encode(
                input_text,
                return_tensors="pt",
                max_length=256,
                truncation=True,
                padding=True
            )

            if self.device:
                inputs = inputs.to(self.device)

            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_length=min(max_length, 64),
                    num_beams=4,
                    early_stopping=True,
                    do_sample=False,  # Deterministic for consistency
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode and clean question
            question = tokenizer.decode(outputs[0], skip_special_tokens=True)
            cleaned_question = self.clean_question(question)

            # Validate the generated question
            if len(cleaned_question) < 10 or not cleaned_question.endswith('?'):
                print("Generated question quality low, using rule-based fallback")
                return self._generate_with_rules(sentence)

            return cleaned_question

        except Exception as e:
            print(f"Transformer generation failed: {e}")
            print("Falling back to rule-based generation")
            return self._generate_with_rules(sentence)

    def _generate_with_rules(self, sentence):
        """Generate question using rule-based approach.

        The first entry of ``_RULE_TEMPLATES`` whose keywords match the
        sentence is applied; the result is passed through clean_question.
        """
        sentence = sentence.strip()
        lowered = sentence.lower()

        for keywords, uses_subject, pattern in self._RULE_TEMPLATES:
            if keywords and not any(keyword in lowered for keyword in keywords):
                continue
            try:
                extract = self._extract_main_subject if uses_subject else self._extract_predicate
                return self.clean_question(pattern.format(extract(sentence)))
            except Exception:
                # Best effort: a template that cannot be filled is skipped.
                continue

        # Fallback, reached only if every applicable template failed
        words = sentence.split()
        return f"What is the main point about {words[0] if words else 'this topic'}?"

    def _extract_main_subject(self, sentence):
        """Extract the main subject from a sentence.

        Returns the first capitalized word longer than two characters
        (stripped of surrounding punctuation), else the first three words,
        else the sentence itself.
        """
        words = sentence.split()
        for word in words:
            if word[0].isupper() and len(word) > 2:
                return word.strip(self._EDGE_PUNCTUATION)
        # Fallback to first few words
        return ' '.join(words[:3]) if len(words) >= 3 else sentence

    def _extract_predicate(self, sentence):
        """Extract predicate for question formation.

        The sentence is lowercased and a leading determiner is removed. If
        it contains `` is `` or `` are `` (checked in that order) the text
        from that verb onward is returned; otherwise everything after the
        first word (for sentences longer than three words) or the whole
        sentence. Edge punctuation is stripped from the result.
        """
        sentence = sentence.lower()
        # Remove common sentence starters
        sentence = re.sub(r'^(the|this|that|these|those|a|an)\s+', '', sentence)

        for verb in ('is', 'are'):
            _, found, rest = sentence.partition(f' {verb} ')
            if found:
                return f"{verb} {rest.strip(self._EDGE_PUNCTUATION)}"

        # Default fallback
        words = sentence.split()
        if len(words) > 3:
            return ' '.join(words[1:]).strip(self._EDGE_PUNCTUATION)
        return sentence.strip(self._EDGE_PUNCTUATION)

    def clean_question(self, question):
        """
        Clean and format the generated question.

        Whitespace is collapsed, the text is made to end with exactly one
        added ``?`` (replacing any other trailing punctuation such as a
        full stop), and the first character is upper-cased.

        Args:
            question (str): Raw generated question

        Returns:
            str: Cleaned question
        """
        # Remove extra spaces
        question = re.sub(r'\s+', ' ', question.strip())

        # Ensure question ends with question mark. Trailing punctuation is
        # dropped first so "Define X." becomes "Define X?" and not "Define X.?".
        if not question.endswith('?'):
            question = question.rstrip('.,;:! ') + '?'

        # Capitalize first letter
        if question:
            question = question[0].upper() + question[1:]

        return question

    def generate_multiple_questions(self, sentences, max_questions=5):
        """
        Generate multiple questions from a list of sentences.

        Args:
            sentences (list): List of ``(score, sentence)`` pairs
            max_questions (int): Maximum number of questions to generate

        Returns:
            list: List of dicts with keys ``question``, ``context``,
            ``score`` and ``question_id`` (1-based position in the input)
        """
        questions = []

        for position, (score, sentence) in enumerate(sentences[:max_questions], start=1):
            try:
                question = self.generate_question_from_sentence(sentence)

                # Filter out low-quality questions
                if self.is_valid_question(question):
                    questions.append({
                        'question': question,
                        'context': sentence,
                        'score': score,
                        'question_id': position
                    })
            except Exception as e:
                print(f"Error generating question from sentence: {sentence[:50]}... Error: {e}")
                continue

        return questions

    def is_valid_question(self, question):
        """
        Check if a generated question is valid.

        A valid question is 10 to 200 characters long and either contains a
        question word as a whole word or ends with a question mark.

        Args:
            question (str): Generated question

        Returns:
            bool: True if question is valid
        """
        if not question:
            return False

        if len(question) < 10:  # Too short
            return False

        if len(question) > 200:  # Too long
            return False

        # Compare whole words: substring matching would accept "this" as
        # "is" or "window" as "do" and make the check meaningless.
        tokens = set(re.findall(r"[a-z]+", question.lower()))
        has_question_word = not tokens.isdisjoint(self._QUESTION_WORDS)

        return has_question_word or question.endswith('?')