Spaces:
Sleeping
Sleeping
Add files via upload
Browse files- .env +1 -0
- PROJECT_REVIEW.md +250 -0
- README.md +50 -2
- app.py +497 -0
- download_nltk_data.py +29 -0
- exam_question_system.py +376 -0
- generated_questions.json +22 -0
- keyword_extractor.py +138 -0
- local_question_generator.py +69 -0
- nltk_install.py +69 -0
- nltk_setup.py +56 -0
- option_generator.py +272 -0
- question_generator.py +392 -0
- question_generator_old.py +600 -0
- requirements.txt +13 -0
- sample_text.txt +9 -0
- setup_nltk.py +40 -0
- simple_nltk_test.py +32 -0
- simple_question_generator.py +76 -0
- syllabus_processor.py +175 -0
- test_imports.py +113 -0
- test_local_generator.py +27 -0
- test_nltk.py +43 -0
- test_question_generator.py +28 -0
- test_syllabus.py +37 -0
- text_processor.py +118 -0
- verify_generation.py +124 -0
.env
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# SECURITY: a real API key was committed on this line. Revoke/rotate that key immediately
# and keep secrets out of version control (add .env to .gitignore, use the host's secret store).
OPENAI_API_KEY=your-openai-api-key-here
|
PROJECT_REVIEW.md
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project Review - AutoExamGen
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
This is a comprehensive **Exam Question Generator** system built with Python and Flask. The system automatically generates exam questions (MCQ, Short Answer, Long Answer) from input text using NLP techniques.
|
| 5 |
+
|
| 6 |
+
## Project Structure
|
| 7 |
+
|
| 8 |
+
### Core Modules
|
| 9 |
+
|
| 10 |
+
1. **`app.py`** - Flask web application (main entry point)
|
| 11 |
+
- Handles file uploads (PDF, DOCX, TXT)
|
| 12 |
+
- Multi-step form flow (Input → Configuration → Results)
|
| 13 |
+
- Session management
|
| 14 |
+
- Question paper generation and download
|
| 15 |
+
|
| 16 |
+
2. **`exam_question_system.py`** - Main orchestration module
|
| 17 |
+
- Coordinates all components
|
| 18 |
+
- Handles question generation pipeline
|
| 19 |
+
- Supports syllabus-based generation
|
| 20 |
+
|
| 21 |
+
3. **`question_generator.py`** - Question generation engine
|
| 22 |
+
- Rule-based question generation (default)
|
| 23 |
+
- Optional transformer-based generation (T5 model)
|
| 24 |
+
- Multiple question generation strategies
|
| 25 |
+
|
| 26 |
+
4. **`keyword_extractor.py`** - Keyword and concept extraction
|
| 27 |
+
- RAKE algorithm for keyword extraction
|
| 28 |
+
- Named entity recognition
|
| 29 |
+
- Important sentence identification
|
| 30 |
+
|
| 31 |
+
5. **`text_processor.py`** - Text preprocessing
|
| 32 |
+
- Text cleaning and normalization
|
| 33 |
+
- Sentence and word tokenization
|
| 34 |
+
- Stopword removal and lemmatization
|
| 35 |
+
|
| 36 |
+
6. **`option_generator.py`** - MCQ option generation
|
| 37 |
+
- Distractor generation using WordNet
|
| 38 |
+
- Synonym-based options
|
| 39 |
+
- Answer extraction from context
|
| 40 |
+
|
| 41 |
+
7. **`syllabus_processor.py`** - Syllabus-based question generation
|
| 42 |
+
- Parses syllabus structure
|
| 43 |
+
- Topic-based question generation
|
| 44 |
+
- Unit and topic extraction
|
| 45 |
+
|
| 46 |
+
8. **`local_question_generator.py`** - Alternative transformer-based generator
|
| 47 |
+
- Uses T5-base model for question generation
|
| 48 |
+
|
| 49 |
+
## Issues Found and Fixed
|
| 50 |
+
|
| 51 |
+
### ✅ Fixed Issues
|
| 52 |
+
|
| 53 |
+
1. **`app.py` - Line 27: Duplicate Variable Assignment**
|
| 54 |
+
- **Issue**: `system_loading = False` was declared twice
|
| 55 |
+
- **Fix**: Removed duplicate assignment
|
| 56 |
+
|
| 57 |
+
2. **`app.py` - Lines 382-529: Unreachable Code**
|
| 58 |
+
- **Issue**: Dead code after return statement (lines 374, 380)
|
| 59 |
+
- **Fix**: Removed the entire unreachable code block
|
| 60 |
+
- **Impact**: Cleaned up ~150 lines of dead code
|
| 61 |
+
|
| 62 |
+
3. **`option_generator.py` - Lines 175-184: Unreachable Code**
|
| 63 |
+
- **Issue**: Code after return statement on line 174
|
| 64 |
+
- **Fix**: Removed unreachable exception handling block
|
| 65 |
+
|
| 66 |
+
4. **`exam_question_system.py` - Line 172: Syntax Error**
|
| 67 |
+
- **Issue**: Missing proper indentation in multi-line print statement
|
| 68 |
+
- **Fix**: Fixed indentation for string continuation
|
| 69 |
+
|
| 70 |
+
## Code Quality Assessment
|
| 71 |
+
|
| 72 |
+
### Strengths ✅
|
| 73 |
+
|
| 74 |
+
1. **Well-Structured Architecture**
|
| 75 |
+
- Clear separation of concerns
|
| 76 |
+
- Modular design with single responsibility
|
| 77 |
+
- Good use of classes and methods
|
| 78 |
+
|
| 79 |
+
2. **Error Handling**
|
| 80 |
+
- Try-except blocks throughout
|
| 81 |
+
- Graceful fallbacks (rule-based when transformers fail)
|
| 82 |
+
- User-friendly error messages
|
| 83 |
+
|
| 84 |
+
3. **Documentation**
|
| 85 |
+
- Docstrings for classes and methods
|
| 86 |
+
- Type hints in some modules
|
| 87 |
+
- README with usage instructions
|
| 88 |
+
|
| 89 |
+
4. **Feature Completeness**
|
| 90 |
+
- Multiple question types (MCQ, Short, Long)
|
| 91 |
+
- File upload support (PDF, DOCX, TXT)
|
| 92 |
+
- Web interface with multi-step flow
|
| 93 |
+
- Session management
|
| 94 |
+
- Download functionality
|
| 95 |
+
|
| 96 |
+
5. **NLP Integration**
|
| 97 |
+
- Multiple NLTK components
|
| 98 |
+
- RAKE for keyword extraction
|
| 99 |
+
- WordNet for synonyms/distractors
|
| 100 |
+
- Optional transformer models
|
| 101 |
+
|
| 102 |
+
### Areas for Improvement 🔧
|
| 103 |
+
|
| 104 |
+
1. **Code Duplication**
|
| 105 |
+
- Some repeated patterns in question formatting
|
| 106 |
+
- Similar error handling in multiple places
|
| 107 |
+
- **Recommendation**: Extract common functions
|
| 108 |
+
|
| 109 |
+
2. **Configuration Management**
|
| 110 |
+
- Hardcoded values scattered throughout
|
| 111 |
+
- Secret key in code (`app.secret_key`)
|
| 112 |
+
- **Recommendation**: Use config file or environment variables
|
| 113 |
+
|
| 114 |
+
3. **Testing**
|
| 115 |
+
- No visible test files for core functionality
|
| 116 |
+
- **Recommendation**: Add unit tests for each module
|
| 117 |
+
|
| 118 |
+
4. **Type Hints**
|
| 119 |
+
- Inconsistent use of type hints
|
| 120 |
+
- **Recommendation**: Add type hints throughout
|
| 121 |
+
|
| 122 |
+
5. **Logging**
|
| 123 |
+
- Mix of `print()` and `logging`
|
| 124 |
+
- **Recommendation**: Standardize on logging module
|
| 125 |
+
|
| 126 |
+
6. **Error Messages**
|
| 127 |
+
- Some generic error messages
|
| 128 |
+
- **Recommendation**: More specific error handling
|
| 129 |
+
|
| 130 |
+
7. **Session Management**
|
| 131 |
+
- Large content stored in session
|
| 132 |
+
- **Recommendation**: Consider database for production
|
| 133 |
+
|
| 134 |
+
8. **Security**
|
| 135 |
+
- Secret key should be in environment variable
|
| 136 |
+
- File upload validation could be stricter
|
| 137 |
+
- **Recommendation**: Add file type validation, size limits
|
| 138 |
+
|
| 139 |
+
## Dependencies Review
|
| 140 |
+
|
| 141 |
+
### Current Dependencies (`requirements.txt`)
|
| 142 |
+
- ✅ Well-maintained packages
|
| 143 |
+
- ✅ Appropriate versions
|
| 144 |
+
- ✅ Good coverage of NLP needs
|
| 145 |
+
|
| 146 |
+
### Recommendations
|
| 147 |
+
- Consider pinning exact versions for production
|
| 148 |
+
- Add `python-dotenv` for environment variable management
|
| 149 |
+
- Consider adding `gunicorn` or `waitress` for production deployment
|
| 150 |
+
|
| 151 |
+
## Functionality Review
|
| 152 |
+
|
| 153 |
+
### Working Features ✅
|
| 154 |
+
1. Text preprocessing and cleaning
|
| 155 |
+
2. Keyword extraction (RAKE)
|
| 156 |
+
3. Question generation (rule-based)
|
| 157 |
+
4. MCQ option generation
|
| 158 |
+
5. Web interface with file upload
|
| 159 |
+
6. Session management
|
| 160 |
+
7. Question paper download
|
| 161 |
+
|
| 162 |
+
### Potential Issues ⚠️
|
| 163 |
+
|
| 164 |
+
1. **Transformer Models**
|
| 165 |
+
- Optional transformer loading may fail silently
|
| 166 |
+
- Large model downloads on first use
|
| 167 |
+
- **Recommendation**: Add model download progress indicator
|
| 168 |
+
|
| 169 |
+
2. **File Processing**
|
| 170 |
+
- PDF extraction may have issues with complex layouts
|
| 171 |
+
- DOCX parsing is basic
|
| 172 |
+
- **Recommendation**: Add better error handling for file parsing
|
| 173 |
+
|
| 174 |
+
3. **Question Quality**
|
| 175 |
+
- Rule-based questions may be simplistic
|
| 176 |
+
- **Recommendation**: Add question quality scoring
|
| 177 |
+
|
| 178 |
+
4. **Performance**
|
| 179 |
+
- Synchronous processing may timeout on large files
|
| 180 |
+
- **Recommendation**: Consider async processing or background jobs
|
| 181 |
+
|
| 182 |
+
## Recommendations for Production
|
| 183 |
+
|
| 184 |
+
1. **Environment Configuration**
|
| 185 |
+
```python
|
| 186 |
+
# Use environment variables
|
| 187 |
+
app.secret_key = os.environ.get('SECRET_KEY', 'dev-secret-key')
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
2. **Database Integration**
|
| 191 |
+
- Store generated questions in database
|
| 192 |
+
- User session management
|
| 193 |
+
- Question history
|
| 194 |
+
|
| 195 |
+
3. **Caching**
|
| 196 |
+
- Cache NLTK data downloads
|
| 197 |
+
- Cache processed text
|
| 198 |
+
- Cache generated questions
|
| 199 |
+
|
| 200 |
+
4. **API Rate Limiting**
|
| 201 |
+
- Add rate limiting for API endpoints
|
| 202 |
+
- Prevent abuse
|
| 203 |
+
|
| 204 |
+
5. **Monitoring**
|
| 205 |
+
- Add logging to file
|
| 206 |
+
- Error tracking (e.g., Sentry)
|
| 207 |
+
- Performance monitoring
|
| 208 |
+
|
| 209 |
+
6. **Testing**
|
| 210 |
+
- Unit tests for each module
|
| 211 |
+
- Integration tests for web flow
|
| 212 |
+
- Test file uploads
|
| 213 |
+
|
| 214 |
+
7. **Documentation**
|
| 215 |
+
- API documentation
|
| 216 |
+
- Deployment guide
|
| 217 |
+
- Configuration guide
|
| 218 |
+
|
| 219 |
+
## Overall Assessment
|
| 220 |
+
|
| 221 |
+
**Grade: B+**
|
| 222 |
+
|
| 223 |
+
The project is well-structured and functional. The codebase shows good understanding of NLP concepts and Flask web development. The main issues were code cleanup (unreachable code) and minor syntax errors, which have been fixed.
|
| 224 |
+
|
| 225 |
+
### Key Strengths
|
| 226 |
+
- Comprehensive feature set
|
| 227 |
+
- Good architecture
|
| 228 |
+
- Error handling
|
| 229 |
+
- User-friendly interface
|
| 230 |
+
|
| 231 |
+
### Key Weaknesses
|
| 232 |
+
- Some code duplication
|
| 233 |
+
- Missing tests
|
| 234 |
+
- Configuration management
|
| 235 |
+
- Production readiness concerns
|
| 236 |
+
|
| 237 |
+
## Next Steps
|
| 238 |
+
|
| 239 |
+
1. ✅ **Completed**: Fixed code issues
|
| 240 |
+
2. 🔄 **Recommended**: Add unit tests
|
| 241 |
+
3. 🔄 **Recommended**: Improve configuration management
|
| 242 |
+
4. 🔄 **Recommended**: Add logging standardization
|
| 243 |
+
5. 🔄 **Recommended**: Security improvements
|
| 244 |
+
6. 🔄 **Recommended**: Performance optimization
|
| 245 |
+
|
| 246 |
+
---
|
| 247 |
+
|
| 248 |
+
**Review Date**: February 5, 2026
|
| 249 |
+
**Reviewed By**: AI Code Reviewer
|
| 250 |
+
**Status**: Issues Fixed ✅
|
README.md
CHANGED
|
@@ -1,2 +1,50 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Exam Question Generator
|
| 2 |
+
|
| 3 |
+
An intelligent Python-based system that automatically generates exam questions from input text using NLP techniques.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Text Preprocessing**: Cleans and preprocesses input text using NLTK
|
| 8 |
+
- **Keyword Extraction**: Identifies important concepts using RAKE algorithm
|
| 9 |
+
- **Question Generation**: Generates questions with a rule-based engine by default, with an optional HuggingFace T5 model
|
| 10 |
+
- **Option Generation**: Creates multiple-choice options with distractors
|
| 11 |
+
- **Multiple Interfaces**: CLI and Web interface (Flask) support
|
| 12 |
+
- **JSON Output**: Structured output format for easy integration
|
| 13 |
+
|
| 14 |
+
## Installation
|
| 15 |
+
|
| 16 |
+
1. Clone or download this project
|
| 17 |
+
2. Install dependencies:
|
| 18 |
+
```bash
|
| 19 |
+
pip install -r requirements.txt
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
3. Download required NLTK data:
|
| 23 |
+
```bash
|
| 24 |
+
python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')"
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Usage
|
| 28 |
+
|
| 29 |
+
### Web Interface
|
| 30 |
+
```bash
|
| 31 |
+
python app.py
|
| 32 |
+
```
|
| 33 |
+
Then open http://localhost:5000 in your browser.
|
| 34 |
+
|
| 35 |
+
### Python API
|
| 36 |
+
```python
|
| 37 |
+
from question_generator import QuestionGenerator
|
| 38 |
+
|
| 39 |
+
generator = QuestionGenerator()
|
| 40 |
+
questions = generator.generate_questions("Your input text here")
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Project Structure
|
| 44 |
+
|
| 45 |
+
- `question_generator.py` - Core question generation logic
|
| 46 |
+
- `text_processor.py` - Text cleaning and preprocessing
|
| 47 |
+
- `keyword_extractor.py` - Keyword and important sentence extraction
|
| 48 |
+
- `option_generator.py` - Multiple choice option generation
|
| 49 |
+
- `app.py` - Flask web application
|
| 50 |
+
- `
|
app.py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request, jsonify, send_file, session, redirect, url_for, flash
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
+
from werkzeug.utils import secure_filename
|
| 6 |
+
from exam_question_system import ExamQuestionSystem
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import uuid
|
| 9 |
+
import threading
|
| 10 |
+
import time
|
| 11 |
+
from docx import Document
|
| 12 |
+
import PyPDF2
|
| 13 |
+
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['TEMP_INPUT_FOLDER'] = os.path.join(tempfile.gettempdir(), 'eqg_inputs')
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0  # Disable caching for development

# The session-signing key must never be a literal in source control: anyone who
# can read it can forge session cookies. Read it from the environment; when it
# is not configured, fall back to a random per-process key so nothing guessable
# is ever used. NOTE: with the random fallback, sessions do not survive a
# restart and are not shared between worker processes, so set SECRET_KEY in
# any real deployment.
app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(32).hex()

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['TEMP_INPUT_FOLDER'], exist_ok=True)

# Global state for the lazily created question system (see get_question_system)
question_system = None      # the ExamQuestionSystem instance once loaded
system_loading = False      # True while an initialization attempt is running
system_load_error = None    # message of the last failed initialization, if any
|
| 28 |
+
|
| 29 |
+
def read_file_content(filepath):
    """Read and return the text content of a file, chosen by its extension.

    ``.pdf`` files are read page by page with PyPDF2, ``.docx`` files
    paragraph by paragraph with python-docx, and anything else is read as
    UTF-8 text (undecodable bytes are replaced rather than raising).

    Args:
        filepath: Path of the file to read.

    Returns:
        The extracted text as a single string.

    Raises:
        Exception: If the file cannot be opened or parsed. The message names
            the extension and the original error is chained as the cause.
    """
    ext = os.path.splitext(filepath)[1].lower()

    try:
        if ext == '.pdf':
            with open(filepath, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # Defensive: treat a page with no extractable text (None or
                # empty result) as an empty page instead of failing on
                # `None + "\n"`. join() also avoids repeated concatenation.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )

        if ext == '.docx':
            doc = Document(filepath)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)

        # Default to text file
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()

    except Exception as e:
        # Keep the generic Exception type callers already catch, but preserve
        # the original traceback for debugging.
        raise Exception(f"Error reading {ext} file: {str(e)}") from e
|
| 53 |
+
|
| 54 |
+
def get_question_system():
    """Get or lazily initialize the shared question generation system.

    The instance is created on first use and cached in the module-level
    ``question_system`` global. A failed initialization is remembered in
    ``system_load_error`` and reported on every later call instead of being
    retried.

    Returns:
        The cached ``ExamQuestionSystem`` instance.

    Raises:
        Exception: If a previous initialization failed, if initialization
            fails now (the original error is re-raised), or if another
            initialization attempt is currently in progress.
    """
    global question_system, system_loading, system_load_error

    if question_system is None and not system_loading:
        if system_load_error:
            raise Exception(f"System failed to load: {system_load_error}")

        system_loading = True
        try:
            print("Initializing question generation system...")
            question_system = ExamQuestionSystem()
            print("Question generation system loaded successfully!")
        except Exception as e:
            system_load_error = str(e)
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            # Single place where the flag is reset, on success and on failure.
            system_loading = False

    if system_loading:
        raise Exception("System is still loading, please wait...")

    return question_system
|
| 78 |
+
|
| 79 |
+
# Utility: parse human-readable duration text into minutes when possible
def parse_duration_to_minutes(duration_text):
    """Convert a human-readable duration into a whole number of minutes.

    Understands hour and minute amounts with the usual unit spellings
    (``"2h 30m"``, ``"3 hours"``, ``"1 hr 15 mins"``), decimal amounts
    (``"1.5 hours"`` -> 90) and a bare integer, which is taken as minutes
    (``"90"`` -> 90).

    Args:
        duration_text: The text to parse.

    Returns:
        The duration in minutes as an ``int`` (fractions are rounded), or
        ``None`` when the input is empty, not a string, or unparseable.
    """
    import re

    if not duration_text or not isinstance(duration_text, str):
        return None

    text = duration_text.strip().lower()
    # Normalize unit spellings to 'h' / 'm'. Longer spellings of each pair are
    # listed before their prefixes so they are replaced as a whole.
    for spelling, unit in (
        ('hrs', 'h'), ('hr', 'h'), ('hours', 'h'), ('hour', 'h'),
        ('minutes', 'm'), ('minute', 'm'), ('mins', 'm'), ('min', 'm'),
    ):
        text = text.replace(spelling, unit)

    def to_number(token):
        # Keep integers exact; only decimal amounts go through float.
        return float(token) if '.' in token else int(token)

    # Accept an optional decimal part so "1.5 h" is read as 1.5 hours rather
    # than matching only the trailing "5 h".
    amount = r"(\d+(?:\.\d+)?)"
    h_match = re.search(amount + r"\s*h", text)
    m_match = re.search(amount + r"\s*m", text)

    try:
        if h_match or m_match:
            hours = to_number(h_match.group(1)) if h_match else 0
            minutes = to_number(m_match.group(1)) if m_match else 0
            return int(round(hours * 60 + minutes))

        # If only a number, treat as minutes
        just_num = re.fullmatch(r"\s*(\d+)\s*", duration_text)
        if just_num:
            return int(just_num.group(1))
    except (ValueError, OverflowError):
        # Absurdly long digit strings cannot be converted; treat as unparseable.
        return None

    return None
|
| 110 |
+
|
| 111 |
+
@app.route('/')
def index():
    """Render the welcome page shown before the step-by-step flow.

    Whatever a previous run left in the session is discarded first, so the
    flow always starts from a clean state.
    """
    session.clear()
    welcome_context = {'project_name': 'AutoExamGen'}
    return render_template('welcome.html', **welcome_context)
|
| 117 |
+
|
| 118 |
+
@app.route('/step1', methods=['GET'])
def step1_input():
    """Step 1: render the syllabus/content input form."""
    template_name = 'step1_input.html'
    return render_template(template_name)
|
| 122 |
+
|
| 123 |
+
@app.route('/step2', methods=['GET', 'POST'])
def step2_configuration():
    """Step 2: accept the content from step 1 and show the configuration page.

    A GET request is redirected to step 1. A POST request takes its content
    from an uploaded file (preferred) or from the ``text_input`` form field,
    records where that content lives in the session (``content_file`` or
    ``content_text``, never both), makes sure the question system can be
    loaded, and renders ``step2_config.html``.

    On any failure an error is flashed and the user is sent back to step 1.
    """
    if request.method == 'GET':
        # If user tries to access /step2 directly, redirect to step1
        return redirect(url_for('step1_input'))

    # Handle POST request (form submission from step1)
    content = None

    try:
        # Get text input (from textarea)
        text_input = request.form.get('text_input', '').strip()

        # Check if file was uploaded
        upload = request.files.get('file')
        if upload and upload.filename:
            try:
                filename = secure_filename(upload.filename)
                if not filename:
                    # secure_filename() strips unsafe characters and can leave
                    # nothing; saving would then target the folder itself.
                    raise ValueError('Invalid file name')

                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                # Prefix with a random token so two users uploading files with
                # the same name cannot overwrite each other's content. The
                # sanitized name stays at the end to keep the extension.
                stored_name = f"{uuid.uuid4().hex}_{filename}"
                filepath = os.path.join(app.config['UPLOAD_FOLDER'], stored_name)
                upload.save(filepath)

                # Store file path in session
                session['content_file'] = filepath
                session.pop('content_text', None)  # Clear any text content if it exists

                # Read the file content for processing
                content = read_file_content(filepath)
                print(f"File uploaded successfully: {filename}, Content length: {len(content)}")

            except Exception as e:
                error_msg = f'Error processing file: {str(e)}'
                print(error_msg)
                flash(error_msg, 'error')
                return redirect(url_for('step1_input'))

        # If no file (or an empty one) but text content is provided
        if not content and text_input:
            if len(text_input) < 2000:
                # For small text content, store directly in session
                session['content_text'] = text_input
                # Drop any file reference from an earlier submission so later
                # steps cannot pick up stale content.
                session.pop('content_file', None)
            else:
                # For large content, save to a uniquely named file; a
                # timestamp alone collides for submissions in the same second.
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                temp_file = os.path.join(
                    app.config['UPLOAD_FOLDER'], f'temp_{uuid.uuid4().hex}.txt'
                )
                with open(temp_file, 'w', encoding='utf-8') as f:
                    f.write(text_input)
                session['content_file'] = temp_file
                session.pop('content_text', None)
            content = text_input
            print(f"Text input processed, Content length: {len(content)}")

        # Validate that we have content
        if not content or not content.strip():
            flash('Please provide either a file or paste content', 'error')
            return redirect(url_for('step1_input'))

        # Process the content for question generation
        try:
            # Initialize question system if not already done; only the side
            # effect (and a possible load error) matters at this step.
            print("Initializing question system...")
            get_question_system()
            print("Question system initialized successfully")

            # Store word count for progress display
            word_count = len(content.split())
            session['word_count'] = word_count
            print(f"Content processed: {word_count} words")

            # Render the configuration page
            return render_template('step2_config.html',
                                   word_count=word_count,
                                   has_syllabus=bool(session.get('syllabus_text', '')))

        except Exception as e:
            error_msg = f'Error initializing question system: {str(e)}'
            print(error_msg)
            import traceback
            traceback.print_exc()
            flash(error_msg, 'error')
            return redirect(url_for('step1_input'))

    except Exception as e:
        error_msg = f'An error occurred: {str(e)}'
        print(error_msg)
        import traceback
        traceback.print_exc()
        flash(error_msg, 'error')
        return redirect(url_for('step1_input'))
|
| 215 |
+
|
| 216 |
+
@app.route('/generate', methods=['POST'])
def step3_generate():
    """Step 3: generate the question paper and redirect to the results page.

    Reads the content recorded in the session by step 2, builds the exam
    configuration from the submitted form, asks the question system for
    questions, and splits them into long-answer, short-answer and MCQ
    sections. The configuration, the questions and the total marks are stored
    in the session under ``exam_config``, ``generated_questions`` and
    ``total_marks``.

    On any failure an error is flashed and the user is sent back to step 1.
    """
    def get_int(form, key, default=0):
        # Safely read an integer form field, falling back to the default on
        # missing or non-numeric input.
        try:
            return int(form.get(key, default))
        except (ValueError, TypeError):
            return default

    try:
        # Load the content exactly once. Pasted text takes precedence over a
        # stored file, which is the source the generation step has always
        # effectively used.
        content = session.get('content_text') or ''
        if not content:
            content_file = session.get('content_file')
            if content_file and os.path.exists(content_file):
                try:
                    content = read_file_content(content_file)
                except Exception as e:
                    flash(f'Error reading uploaded file: {str(e)}', 'error')
                    return redirect(url_for('step1_input'))

        # If no content found, redirect to step 1
        if not content or not content.strip():
            flash('No content found. Please provide content first.', 'error')
            return redirect(url_for('step1_input'))

        try:
            question_system = get_question_system()

            # Store configuration in session with all required fields and safe defaults
            config = {
                'exam_name': request.form.get('exam_name', 'Sample Exam'),
                'subject_name': request.form.get('subject_name', 'Subject'),
                'duration': get_int(request.form, 'duration', 60),
                'short_questions': get_int(request.form, 'short_questions', 2),
                'short_marks': get_int(request.form, 'short_marks', 2),
                'long_questions': get_int(request.form, 'long_questions', 1),
                'long_marks': get_int(request.form, 'long_marks', 5),
                'long_attempt': get_int(request.form, 'long_attempt', 1),
                'mcq_questions': get_int(request.form, 'mcq_questions', 2),
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }
            session['exam_config'] = config

            print("Generating questions from content...")

            num_mcq = config['mcq_questions']
            num_short = config['short_questions']
            num_long = config['long_questions']

            # Request enough questions for all sections, without automatic MCQ
            # generation, so the categorization can be done here.
            total_questions_needed = num_mcq + num_short + num_long

            results = question_system.generate_exam_questions(
                input_text=content,
                max_questions=total_questions_needed,
                include_mcq=False,  # options are generated below for specific questions
                syllabus_text=content,
            )

            all_questions = results.get('questions', [])

            generated_questions = {
                'mcq_questions': [],
                'short_questions': [],
                'long_questions': [],
            }

            # Long and short sections are filled first; MCQs use what remains.
            # Questions with a context of more than 10 words are the
            # long-answer candidates; the rest are short-answer candidates.
            long_candidates = [
                q for q in all_questions
                if len(q.get('context', '').split()) > 10
            ]
            short_candidates = [q for q in all_questions if q not in long_candidates]

            # If we don't have enough long candidates, take from short
            if len(long_candidates) < num_long:
                needed = num_long - len(long_candidates)
                long_candidates.extend(short_candidates[:needed])
                short_candidates = short_candidates[needed:]

            # Long questions
            for _ in range(num_long):
                if not long_candidates:
                    break
                q = long_candidates.pop(0)
                q['type'] = 'long_answer'
                generated_questions['long_questions'].append(q)
                # Remove from the common pool so the question is not reused
                if q in all_questions:
                    all_questions.remove(q)

            # Short questions
            for _ in range(num_short):
                if short_candidates:
                    q = short_candidates.pop(0)
                    q['type'] = 'short_answer'
                    generated_questions['short_questions'].append(q)
                    if q in all_questions:
                        all_questions.remove(q)
                elif all_questions:  # Fallback to any remaining
                    q = all_questions.pop(0)
                    q['type'] = 'short_answer'
                    generated_questions['short_questions'].append(q)

            # MCQs (use remaining questions); the extracted keywords are
            # passed along for distractor generation.
            global_keywords = [k[1] for k in results.get('keywords', [])]

            for _ in range(num_mcq):
                if not all_questions:
                    break
                q = all_questions.pop(0)
                try:
                    mcq_data = question_system.option_generator.create_mcq_options(
                        q['question'],
                        q['context'],
                        correct_answer=q.get('correct_answer'),
                        global_keywords=global_keywords,
                    )
                    if mcq_data and 'options' in mcq_data:
                        q.update(mcq_data)
                        q['type'] = 'mcq'
                        generated_questions['mcq_questions'].append(q)
                    else:
                        # Fallback if option generation fails
                        q['type'] = 'short_answer'
                        generated_questions['short_questions'].append(q)
                except Exception as e:
                    print(f"Error generating options: {e}")
                    q['type'] = 'short_answer'
                    generated_questions['short_questions'].append(q)

            # Store the generated questions
            session['generated_questions'] = generated_questions

            # Calculate and store total marks (1 mark per MCQ)
            total_marks = (
                len(generated_questions['short_questions']) * config['short_marks']
                + len(generated_questions['long_questions']) * config['long_marks']
                + len(generated_questions['mcq_questions'])
            )
            session['total_marks'] = total_marks

            # Redirect to results page
            return redirect(url_for('show_results'))

        except Exception as e:
            error_msg = f'Error generating questions: {str(e)}'
            print(error_msg)
            flash(error_msg, 'error')
            return redirect(url_for('step1_input'))

    except Exception as e:
        error_msg = f'An error occurred: {str(e)}'
        print(error_msg)
        flash(error_msg, 'error')
        return redirect(url_for('step1_input'))
|
| 398 |
+
|
| 399 |
+
@app.route('/download')
def download_paper():
    """Download the generated question paper as an HTML attachment.

    Redirects to the index page when the session holds no ``question_paper``.
    Otherwise renders ``printable_paper.html`` from session data and sends it
    back as an attachment with a randomly suffixed filename.

    Returns:
        A redirect, the attachment response, or a ``(json, 500)`` pair when
        anything raises while the paper is being built.
    """
    # Local import: the rendered page is streamed from memory instead of being
    # written to a temp file that was never deleted afterwards.
    import io

    try:
        # NOTE(review): this route reads session['question_paper'] / ['config'] /
        # ['exam_date'], while show_results reads 'generated_questions' /
        # 'exam_config' -- confirm which keys the generation step populates.
        if 'question_paper' not in session:
            return redirect(url_for('index'))

        # Generate a unique filename
        filename = f"question_paper_{uuid.uuid4().hex[:8]}.html"

        # Use the configured duration when set, else 1.5 minutes per mark.
        cfg = session['config']
        display_duration = cfg.get('exam_duration') or int(round(session['total_marks'] * 1.5))

        html_content = render_template(
            'printable_paper.html',
            question_paper=session['question_paper'],
            config=cfg,
            total_marks=session['total_marks'],
            exam_date=session['exam_date'],
            display_duration=display_duration,
        )

        # Serve from an in-memory buffer so nothing accumulates on disk.
        buffer = io.BytesIO(html_content.encode('utf-8'))
        return send_file(
            buffer,
            mimetype='text/html',
            as_attachment=True,
            download_name=filename,
        )

    except Exception as e:
        return jsonify({'error': f'Error downloading paper: {str(e)}'}), 500
|
| 430 |
+
|
| 431 |
+
@app.route('/results')
def show_results():
    """Display the generated questions.

    Requires both ``generated_questions`` and ``exam_config`` in the session;
    otherwise flashes an error and redirects to ``step1_input``.

    Returns:
        The rendered ``step3_result.html`` page, or a redirect when the
        session is missing the required data.
    """
    if 'generated_questions' not in session or 'exam_config' not in session:
        flash('No questions generated yet. Please start from the beginning.', 'error')
        return redirect(url_for('step1_input'))

    config = session.get('exam_config', {})
    questions = session.get('generated_questions', {})

    # Ensure all required question types exist in the questions dictionary
    for qtype in ('mcq_questions', 'short_questions', 'long_questions'):
        questions.setdefault(qtype, [])

    # Use the configured marks per question (as the generation step does) and
    # fall back to the previous fixed values of 2 and 5 when they are absent.
    short_marks = config.get('short_marks', 2)
    long_marks = config.get('long_marks', 5)
    total_marks = (
        len(questions['mcq_questions'])  # 1 mark per MCQ
        + len(questions['short_questions']) * short_marks
        + len(questions['long_questions']) * long_marks
    )

    # Get exam date from config or use current date
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    raw_date = config.get('timestamp', datetime.now().strftime(timestamp_format))
    try:
        exam_date = datetime.strptime(raw_date, timestamp_format).strftime('%B %d, %Y')
    except (TypeError, ValueError):
        # A timestamp stored in another format should not break the page;
        # show it as-is instead.
        exam_date = str(raw_date)

    return render_template(
        'step3_result.html',
        question_paper=questions,
        config=config,
        total_marks=total_marks,
        exam_date=exam_date,
        display_duration=config.get('duration', 60),  # default to 60 minutes
    )
|
| 465 |
+
|
| 466 |
+
@app.route('/api/health')
def health_check():
    """Report that the service is up as a small JSON document."""
    payload = {
        'status': 'healthy',
        'service': 'Exam Question Generator',
    }
    return jsonify(payload)
|
| 470 |
+
|
| 471 |
+
@app.route('/api/warmup')
def warmup():
    """Initialize the question system and report whether it is ready.

    Returns:
        A JSON "ready" document on success. On failure, a ``(json, code)``
        pair: 202 with status "loading" when the error text contains
        "still loading", otherwise 500 with status "error".
    """
    try:
        system = get_question_system()
        ready_payload = {
            'status': 'ready',
            'message': 'Question generation system is ready',
            'uses_transformers': system.question_generator.use_transformers,
        }
        return jsonify(ready_payload)
    except Exception as e:
        message = str(e)
        if 'still loading' in message:
            status, http_code = 'loading', 202
        else:
            status, http_code = 'error', 500
        return jsonify({'status': status, 'message': message}), http_code
|
| 486 |
+
|
| 487 |
+
if __name__ == '__main__':
    print("🌐 Starting Flask Web Application...")
    print("📝 Exam Question Generator Web Interface")
    print("🔗 Access the application at: http://localhost:5000")
    print("💡 Using rule-based question generation for faster startup")
    print("⚡ System will initialize on first use")

    # Configure Flask for better timeout handling
    app.config['PERMANENT_SESSION_LIFETIME'] = 1800  # 30 minutes

    # The server binds to every interface, so the interactive debugger (which
    # allows arbitrary code execution) must be opt-in: set FLASK_DEBUG=1 to
    # enable it for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')

    app.run(debug=debug_enabled, host='0.0.0.0', port=5000, threaded=True)
|
download_nltk_data.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
|
| 3 |
+
def download_nltk_data():
    """Download all required NLTK data packages.

    Each package is attempted independently so one failure does not stop the
    rest. ``nltk.download`` signals failure through its return value rather
    than an exception, so both are checked before reporting success.
    """
    print("Downloading NLTK data packages...")

    # List of NLTK data packages to download
    packages = [
        'punkt',
        'averaged_perceptron_tagger',
        'wordnet',
        'stopwords',
        'universal_tagset',
        'tagsets',
        'omw-1.4',  # Open Multilingual WordNet
    ]

    failed = []
    for package in packages:
        print(f"Downloading {package}...")
        try:
            succeeded = nltk.download(package, quiet=False)
        except Exception as e:
            print(f"Error downloading {package}: {str(e)}")
            failed.append(package)
            continue

        if succeeded:
            print(f"Successfully downloaded {package}")
        else:
            print(f"Error downloading {package}: download did not complete")
            failed.append(package)

    if failed:
        print(f"\nNLTK data download finished with errors: {', '.join(failed)}")
    else:
        print("\nNLTK data download complete!")


if __name__ == "__main__":
    download_nltk_data()
|
exam_question_system.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from text_processor import TextProcessor
|
| 5 |
+
from keyword_extractor import KeywordExtractor
|
| 6 |
+
from question_generator import QuestionGenerator
|
| 7 |
+
from option_generator import OptionGenerator
|
| 8 |
+
from syllabus_processor import SyllabusProcessor
|
| 9 |
+
|
| 10 |
+
class ExamQuestionSystem:
    """Pipeline that turns study text into exam questions.

    Wires together the text processor, keyword extractor, question generator,
    option generator and syllabus processor created in ``__init__``.
    """

    def __init__(self, use_transformers=True):
        """Initialize the complete exam question generation system.

        Args:
            use_transformers: Forwarded to ``QuestionGenerator`` to choose
                whether transformer models are used for question generation.
        """
        print("Initializing Exam Question Generation System...")
        self.text_processor = TextProcessor()
        self.keyword_extractor = KeywordExtractor()
        self.question_generator = QuestionGenerator(use_transformers=use_transformers)
        self.option_generator = OptionGenerator()
        self.syllabus_processor = SyllabusProcessor()
        print("System initialized successfully!")

    def process_text_file(self, file_path):
        """
        Read a UTF-8 text file and return its content.

        Args:
            file_path (str): Path to the text file

        Returns:
            str: File content

        Raises:
            Exception: If the file cannot be read; the original error is
                chained as the cause.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            raise Exception(f"Error reading file {file_path}: {e}") from e

    @staticmethod
    def _build_generation_inputs(important_sentences):
        """Convert ranked sentences into ``{'context', 'answer'}`` dicts."""
        generation_inputs = []
        for item in important_sentences:
            if isinstance(item, tuple) and len(item) >= 2:
                # item is (score, sentence, keyword) or (score, sentence)
                keyword = item[2] if len(item) > 2 else None
                generation_inputs.append({'context': item[1], 'answer': keyword})
            elif isinstance(item, str):
                generation_inputs.append({'context': item, 'answer': None})
        return generation_inputs

    @staticmethod
    def _normalize_questions(raw_questions):
        """Give every generated question the fields the pipeline relies on."""
        formatted_questions = []
        for i, q in enumerate(raw_questions):
            if isinstance(q, str):
                formatted_questions.append({
                    'question': q,
                    'context': 'Generated from input text',
                    'score': 1.0,
                    'type': 'short_answer',
                    'id': f'q_{i+1}'
                })
            elif isinstance(q, dict):
                # Fill in only the fields the generator left out.
                q.setdefault('question', f'Question {i+1}')
                q.setdefault('context', 'No context provided')
                q.setdefault('score', 1.0)
                q.setdefault('type', 'short_answer')
                q.setdefault('id', f'q_{i+1}')
                formatted_questions.append(q)
        return formatted_questions

    def generate_exam_questions(self, input_text, max_questions=5, include_mcq=True, syllabus_text=None):
        """
        Complete pipeline to generate exam questions from input text.

        When ``syllabus_text`` is given, syllabus-based generation is tried
        first and its result returned if it contains questions; otherwise the
        standard keyword-driven pipeline runs. Errors never propagate: a
        failure in question generation yields placeholder questions of type
        ``'error'``, and any other failure yields a minimal result whose
        metadata carries the error message.

        Args:
            input_text (str): Input text to generate questions from
            max_questions (int): Maximum number of questions to generate
            include_mcq (bool): Whether to include multiple choice options
            syllabus_text (str, optional): Syllabus text for topic-based question generation

        Returns:
            dict: Generated questions and metadata
        """
        import traceback  # local import: only needed for error reporting

        print("Starting question generation pipeline...")

        try:
            if not input_text or not input_text.strip():
                raise ValueError("Input text cannot be empty")

            # If syllabus text is provided, try syllabus-based generation
            if syllabus_text and syllabus_text.strip():
                syllabus_results = self._generate_syllabus_based_questions(
                    input_text, syllabus_text, max_questions, include_mcq
                )
                if syllabus_results and syllabus_results.get('questions'):
                    return syllabus_results
                print("Warning: Syllabus-based generation produced no questions. Falling back to standard generation.")

            print(f"Input text length: {len(input_text)} characters")

            # Step 1: Text preprocessing
            print("1. Processing and cleaning text...")
            processed_data = self.text_processor.preprocess_text(input_text)

            if not processed_data or not processed_data.get('sentences'):
                raise ValueError("Failed to process input text into sentences")

            print(f"Extracted {len(processed_data['sentences'])} sentences from input")

            # Step 2: Extract keywords and important sentences
            print("2. Extracting keywords and important sentences...")
            key_concepts = self.keyword_extractor.extract_key_concepts(
                processed_data['cleaned_text'],
                processed_data['sentences'],
                top_n_sentences=max(10, max_questions)
            ) or {}  # tolerate an extractor that returns nothing

            if not key_concepts.get('important_sentences'):
                # If no important sentences found, use the first few sentences
                print("Warning: No important sentences found, using first few sentences")
                key_concepts['important_sentences'] = processed_data['sentences'][:max_questions]

            print(f"Found {len(key_concepts.get('important_sentences', []))} important sentences")

            generation_inputs = self._build_generation_inputs(key_concepts['important_sentences'])

            # Step 3: Generate questions
            print("3. Generating questions...")

            # Generate more questions than requested to ensure we have enough valid ones
            # and to cover all sections (MCQ, Short, Long)
            generation_target = max(max_questions * 2, 10)

            try:
                raw_questions = self.question_generator.generate_multiple_questions(
                    generation_inputs,
                    generation_target
                )
                if not raw_questions:
                    raise ValueError("No questions were generated")

                questions = self._normalize_questions(raw_questions)

                # Step 4: Generate MCQ options if requested and we have enough questions
                if include_mcq and questions:
                    print("4. Generating multiple choice options...")
                    # Extract global keywords for distractors
                    global_keywords = [k[1] for k in key_concepts.get('keywords', [])]

                    for question_data in questions[:max_questions]:  # Limit to max_questions
                        try:
                            mcq_data = self.option_generator.create_mcq_options(
                                question_data['question'],
                                question_data['context'],
                                correct_answer=question_data.get('correct_answer'),
                                global_keywords=global_keywords
                            )
                            if mcq_data and 'options' in mcq_data and len(mcq_data['options']) >= 2:
                                question_data.update(mcq_data)
                                question_data['type'] = 'mcq'
                                print(f"Generated {len(mcq_data['options'])} options for question")
                            else:
                                print("Not enough options generated, keeping as short answer")
                        except Exception as e:
                            print(f"Error generating MCQ options: {str(e)}"
                                  " (continuing with short answer)")

            except Exception as e:
                print(f"Error in question generation: {str(e)}\n{traceback.format_exc()}")
                # Create fallback questions
                questions = [{
                    'question': f"Sample question {i+1} (error: {str(e)[:50]}...)",
                    'context': 'Error occurred during question generation',
                    'score': 0.0,
                    'type': 'error',
                    'id': f'error_{i}'
                } for i in range(min(3, max_questions))]

            # Compile results
            results = {
                'metadata': {
                    'input_word_count': processed_data.get('word_count', 0),
                    'input_sentence_count': len(processed_data.get('sentences', [])),
                    'questions_generated': len(questions),
                    'keywords_extracted': len(key_concepts.get('keywords', [])),
                    'named_entities': len(key_concepts.get('named_entities', []))
                },
                'keywords': key_concepts.get('keywords', [])[:10],
                'named_entities': key_concepts.get('named_entities', [])[:10],
                'questions': questions[:max_questions]  # Ensure we don't return more than requested
            }

            print(f"Successfully generated {len(results['questions'])} questions")
            return results

        except Exception as e:
            error_msg = f"Error in generate_exam_questions: {str(e)}\n{traceback.format_exc()}"
            print(error_msg)

            # Return a minimal response with error information
            return {
                'metadata': {
                    'error': str(e),
                    'input_length': len(input_text) if input_text else 0,
                    'questions_generated': 0
                },
                'keywords': [],
                'named_entities': [],
                'questions': [{
                    'question': f"Error generating questions: {str(e)[:100]}",
                    'context': 'An error occurred during question generation',
                    'score': 0.0,
                    'type': 'error',
                    'id': 'error_0'
                }]
            }

    def save_questions_to_json(self, questions_data, output_file):
        """
        Save generated questions to a JSON file.

        Failures are reported on the console and not raised.

        Args:
            questions_data (dict): Generated questions data
            output_file (str): Output file path
        """
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                json.dump(questions_data, file, indent=2, ensure_ascii=False)
            print(f"✅ Questions saved to {output_file}")
        except Exception as e:
            print(f"❌ Error saving to file: {e}")

    def display_questions_console(self, questions_data):
        """
        Display generated questions in a formatted console output.

        Accepts every result shape this class produces: the standard result,
        the syllabus-based result and the error result. Fields a given shape
        does not carry are shown with neutral defaults.

        Args:
            questions_data (dict): Generated questions data
        """
        print("\n" + "="*80)
        print("GENERATED EXAM QUESTIONS")
        print("="*80)

        # Display metadata
        metadata = questions_data.get('metadata', {})
        print(f"\n📊 STATISTICS:")
        print(f" • Input text: {metadata.get('input_word_count', 0)} words, {metadata.get('input_sentence_count', 0)} sentences")
        print(f" • Keywords extracted: {metadata.get('keywords_extracted', 0)}")
        print(f" • Named entities found: {metadata.get('named_entities', 0)}")
        print(f" • Questions generated: {metadata.get('questions_generated', 0)}")

        # Display top keywords
        print(f"\n🔑 TOP KEYWORDS:")
        for score, keyword in questions_data.get('keywords', [])[:5]:
            print(f" • {keyword} (score: {score:.2f})")

        # Display questions
        print(f"\n❓ QUESTIONS:")
        for i, q in enumerate(questions_data.get('questions', []), 1):
            print(f"\n{i}. {q['question']}")

            if 'options' in q:
                print(" Options:")
                correct_index = q.get('correct_index')
                for j, option in enumerate(q['options'], 1):
                    marker = "✓" if j-1 == correct_index else " "
                    print(f" {marker} {chr(64+j)}. {option}")

            print(f" Context: {q.get('context', '')[:100]}...")
            print(f" Confidence: {q.get('score', 0.0):.2f}")

        print("\n" + "="*80)

    def _generate_syllabus_based_questions(self, content_text, syllabus_text, max_questions=10, include_mcq=True):
        """
        Generate questions based on syllabus topics.

        Args:
            content_text (str): The content text to generate questions from
            syllabus_text (str): The syllabus text with units and topics
            max_questions (int): Maximum number of questions to generate
            include_mcq (bool): Whether to include multiple choice options

        Returns:
            dict: Generated questions and metadata

        Raises:
            Exception: Any error from the syllabus processor is logged and
                re-raised.
        """
        print("Generating syllabus-based questions...")

        try:
            # Generate questions by topic; the combined list is truncated to
            # max_questions below.
            questions_by_topic = self.syllabus_processor.generate_topic_based_questions(
                syllabus_text=syllabus_text,
                content_text=content_text,
                questions_per_topic=3
            )

            # Flatten questions from all topics, tagging each with its topic
            all_questions = []
            for topic, questions in questions_by_topic.items():
                for q in questions:
                    q['topic'] = topic
                    all_questions.append(q)

            # Limit to max_questions
            all_questions = all_questions[:max_questions]

            # Generate options for MCQs if needed
            if include_mcq:
                for question in all_questions:
                    if 'options' not in question and 'context' in question:
                        try:
                            mcq_data = self.option_generator.create_mcq_options(
                                question['question'],
                                question['context'],
                                num_options=4
                            )
                            if mcq_data and 'options' in mcq_data and len(mcq_data['options']) >= 2:
                                question.update(mcq_data)
                        except Exception as e:
                            print(f"Error generating options: {e}")

            # Prepare results
            return {
                'metadata': {
                    'total_questions': len(all_questions),
                    'topics_covered': list(questions_by_topic.keys()),
                    'generated_at': str(datetime.now())
                },
                'questions': all_questions
            }

        except Exception as e:
            print(f"Error in syllabus-based question generation: {e}")
            raise
|
| 348 |
+
|
| 349 |
+
# Example usage and testing: when run as a script, generate questions from a
# short sample passage, print them, and save them to sample_questions.json.
if __name__ == "__main__":
    # Sample text for testing
    sample_text = """
Artificial Intelligence (AI) is a branch of computer science that aims to create intelligent machines
that work and react like humans. Machine learning is a subset of AI that provides systems the ability
to automatically learn and improve from experience without being explicitly programmed. Deep learning
is a subset of machine learning that uses neural networks with three or more layers. These neural
networks attempt to simulate the behavior of the human brain to learn from large amounts of data.
Python is one of the most popular programming languages for AI development due to its simplicity
and extensive libraries like TensorFlow and PyTorch.
"""

    try:
        # Initialize system (use_transformers keeps its default)
        system = ExamQuestionSystem()

        # Generate at most three questions from the sample
        results = system.generate_exam_questions(sample_text, max_questions=3)

        # Display results on the console
        system.display_questions_console(results)

        # Save to JSON in the current working directory
        system.save_questions_to_json(results, "sample_questions.json")

    except Exception as e:
        # Report any failure instead of showing a traceback
        print(f"❌ Error: {e}")
|
generated_questions.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"input_word_count": 1,
|
| 4 |
+
"input_sentence_count": 0,
|
| 5 |
+
"questions_generated": 0,
|
| 6 |
+
"keywords_extracted": 1,
|
| 7 |
+
"named_entities": 1
|
| 8 |
+
},
|
| 9 |
+
"keywords": [
|
| 10 |
+
[
|
| 11 |
+
1.0,
|
| 12 |
+
"text"
|
| 13 |
+
]
|
| 14 |
+
],
|
| 15 |
+
"named_entities": [
|
| 16 |
+
[
|
| 17 |
+
"Your",
|
| 18 |
+
1
|
| 19 |
+
]
|
| 20 |
+
],
|
| 21 |
+
"questions": []
|
| 22 |
+
}
|
keyword_extractor.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
from rake_nltk import Rake
|
| 3 |
+
from collections import Counter
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
class KeywordExtractor:
    """Keyword, entity and key-sentence extraction built on RAKE."""

    def __init__(self):
        """Create the RAKE instance used for keyword ranking."""
        self.rake = Rake()

    def extract_keywords_rake(self, text, max_keywords=10):
        """
        Rank phrases with RAKE and keep the first acceptable ones.

        Args:
            text (str): Input text
            max_keywords (int): Stop once this many keywords are collected

        Returns:
            list: ``(score, phrase)`` tuples in RAKE ranking order
        """
        self.rake.extract_keywords_from_text(text)
        ranked = self.rake.get_ranked_phrases_with_scores()

        accepted = []
        already_seen = set()

        for weight, phrase in ranked:
            # Skip phrases containing digits and phrases under four characters
            contains_digit = re.search(r'\d', phrase)
            if contains_digit or len(phrase) < 4:
                continue

            # Skip short lowercase single words (likely noise)
            single_word = ' ' not in phrase
            if single_word and phrase[0].islower() and len(phrase) < 5:
                continue

            # Skip case-insensitive duplicates
            folded = phrase.lower()
            if folded in already_seen:
                continue

            accepted.append((weight, phrase))
            already_seen.add(folded)

            if len(accepted) >= max_keywords:
                break

        return accepted

    def extract_named_entities(self, text):
        """
        Collect capitalized words as candidate named entities.

        Args:
            text (str): Input text

        Returns:
            list: Up to ten ``(word, count)`` pairs, most frequent first
        """
        excluded = ['The', 'This', 'That', 'These', 'Those']
        candidates = []

        for token in text.split():
            # Only capitalized tokens longer than two characters qualify
            if not (token[0].isupper() and len(token) > 2):
                continue
            # Strip punctuation before checking against the exclusion list
            stripped = re.sub(r'[^\w]', '', token)
            if stripped and stripped not in excluded:
                candidates.append(stripped)

        return Counter(candidates).most_common(10)

    def identify_important_sentences(self, sentences, keywords, top_n=5):
        """
        Score sentences by keyword hits plus small length/number bonuses.

        Args:
            sentences (list): List of sentences
            keywords (list): ``(score, phrase)`` keyword tuples
            top_n (int): Number of top sentences to return

        Returns:
            list: ``(score, sentence, longest_matching_keyword)`` tuples,
            highest score first; sentences scoring zero are left out
        """
        phrases = [entry[1] for entry in keywords]
        ranked = []

        for sentence in sentences:
            lowered = sentence.lower()
            hits = [phrase for phrase in phrases if phrase.lower() in lowered]

            # One point per matching keyword; remember the longest match
            total = len(hits)
            longest = max(hits, key=len, default="")

            # Bonus for sentence length (not too short, not too long)
            if 8 <= len(sentence.split()) <= 25:
                total += 0.5

            # Bonus for sentences containing numbers
            if re.search(r'\d+', sentence):
                total += 0.3

            if total > 0:
                ranked.append((total, sentence, longest))

        ranked = sorted(ranked, key=lambda entry: entry[0], reverse=True)
        return ranked[:top_n]

    def extract_key_concepts(self, text, sentences, top_n_sentences=5):
        """
        Run keyword, entity and important-sentence extraction together.

        Args:
            text (str): Input text
            sentences (list): List of sentences
            top_n_sentences (int): How many important sentences to keep

        Returns:
            dict: ``keywords``, ``named_entities`` and ``important_sentences``
        """
        found_keywords = self.extract_keywords_rake(text)
        found_entities = self.extract_named_entities(text)
        top_sentences = self.identify_important_sentences(
            sentences, found_keywords, top_n=top_n_sentences
        )

        return {
            'keywords': found_keywords,
            'named_entities': found_entities,
            'important_sentences': top_sentences
        }
|
local_question_generator.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
class LocalQuestionGenerator:
    """Question generator backed by a locally loaded seq2seq model."""

    def __init__(self):
        """Load the tokenizer and model and build the generation pipeline."""
        print("Loading local question generation model...")
        self.device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

        # Load the model and tokenizer
        model_name = "valhalla/t5-base-qa-qg-hl"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Initialize the pipeline
        self.generator = pipeline(
            "text2text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device
        )
        print("Model loaded successfully!")

    def generate_questions(self, text, num_questions=5, max_length=64):
        """Generate questions from the given text.

        Args:
            text (str): Source passage; empty, blank or ``None`` yields ``[]``.
            num_questions (int): Number of questions to request.
            max_length (int): Maximum length passed to the generator.

        Returns:
            list: Generated question strings, or ``[]`` when there is nothing
            to generate or generation fails (the error is printed).
        """
        if not text or not text.strip() or num_questions < 1:
            return []

        try:
            # Prepare the input text
            input_text = f"generate questions: {text}"

            # Beam search cannot return more sequences than it has beams, so
            # widen the beam when more than five questions are requested.
            results = self.generator(
                input_text,
                max_length=max_length,
                num_return_sequences=num_questions,
                num_beams=max(5, num_questions),
                early_stopping=True
            )

            # Extract and clean the generated questions
            return [result['generated_text'].strip() for result in results]

        except Exception as e:
            print(f"Error generating questions: {str(e)}")
            return []
|
| 48 |
+
|
| 49 |
+
# Example usage: run this file directly to try the generator on a short passage.
if __name__ == "__main__":
    # Constructing the generator loads the model.
    qg = LocalQuestionGenerator()

    # Passage the questions are generated from
    sample_text = """
Machine learning is a branch of artificial intelligence that focuses on building systems
that learn from data. These systems can improve their performance over time without being
explicitly programmed. There are three main types of machine learning: supervised learning,
unsupervised learning, and reinforcement learning.
"""

    print("\nGenerating questions...")
    questions = qg.generate_questions(sample_text, num_questions=3)

    # Numbered listing of whatever came back
    print("\nGenerated Questions:")
    for number, question in enumerate(questions, start=1):
        print(f"{number}. {question}")
|
nltk_install.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import nltk
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
def setup_nltk():
    """Download the NLTK resources this project uses and smoke-test them.

    Resources are stored in ``~/nltk_data``. Each download is reported
    individually; afterwards tokenization, POS tagging and the stopword list
    are exercised once.

    Returns:
        bool: True when the verification step ran without error, else False.
    """
    # Set NLTK data directory
    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')
    os.makedirs(nltk_dir, exist_ok=True)
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)

    print(f"NLTK data directory: {nltk_dir}")

    # List of required NLTK packages. 'punkt_tab' and
    # 'averaged_perceptron_tagger_eng' are the resource names newer NLTK
    # releases look up for tokenizing and tagging.
    packages = [
        'punkt',
        'punkt_tab',
        'stopwords',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'wordnet',
        'omw-1.4',
    ]

    # Download each package
    for package in packages:
        print(f"\nDownloading {package}...")
        try:
            # nltk.download signals failure through its return value rather
            # than an exception, so the result has to be checked.
            if nltk.download(package, download_dir=nltk_dir, quiet=False):
                print(f"✓ {package} downloaded successfully")
            else:
                print(f"✗ Error downloading {package}: download reported failure")
        except Exception as e:
            print(f"✗ Error downloading {package}: {str(e)}")

    # Verify installation
    print("\n=== Verifying NLTK Installation ===")
    try:
        from nltk.tokenize import word_tokenize, sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords

        test_text = "NLTK is working correctly if you can read this."

        # Test tokenization
        print("Testing tokenization...")
        words = word_tokenize(test_text)
        print(f"Word tokens: {words}")

        # Test sentence tokenization
        sentences = sent_tokenize(test_text)
        print(f"Sentences: {sentences}")

        # Test POS tagging
        print("\nTesting POS tagging...")
        tags = pos_tag(words)
        print(f"POS tags: {tags}")

        # Test stopwords
        print("\nTesting stopwords...")
        stop_words = stopwords.words('english')
        print(f"Sample stopwords: {stop_words[:5]}...")

        print("\n✅ NLTK is working correctly!")
        return True

    except Exception as e:
        print(f"\n❌ Error verifying NLTK: {str(e)}")
        print("\nPlease try running these commands manually:")
        print("import nltk")
        for package in packages:
            print(f"nltk.download('{package}')")
        return False
|
| 67 |
+
|
| 68 |
+
# Script entry point: perform the download and verification only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    setup_nltk()
|
nltk_setup.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def setup_nltk():
    """Make the required NLTK data available in ``./nltk_data`` and test it.

    Each resource is looked up first and downloaded only when missing. A short
    tokenize / tag / stopwords round-trip then confirms the data is usable.

    Returns:
        bool: True if setup and the component test succeeded, False otherwise.
    """
    # List of NLTK packages to download, as (download id, lookup path).
    # Defined before the try block so the except handler can always list them,
    # even when the failure happens while preparing the data directory.
    # 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
    # newer NLTK releases look up for tokenizing and tagging.
    packages = [
        ('punkt', 'tokenizers/punkt'),
        ('punkt_tab', 'tokenizers/punkt_tab'),
        ('stopwords', 'corpora/stopwords'),
        ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger'),
        ('averaged_perceptron_tagger_eng', 'taggers/averaged_perceptron_tagger_eng'),
        ('wordnet', 'corpora/wordnet'),
        ('omw-1.4', 'corpora/omw-1.4'),
    ]

    try:
        # Set NLTK data path to a local directory
        nltk_data = os.path.join(os.getcwd(), 'nltk_data')
        os.makedirs(nltk_data, exist_ok=True)
        if nltk_data not in nltk.data.path:
            nltk.data.path.append(nltk_data)

        print("\n=== Setting up NLTK data ===")

        # Download each package
        for package, path in packages:
            try:
                nltk.data.find(path)
                print(f"✓ {package} is already available")
            except LookupError:
                print(f"Downloading {package}...")
                # nltk.download reports failure via its return value.
                if nltk.download(package, download_dir=nltk_data):
                    print(f"✓ Downloaded {package}")
                else:
                    print(f"⚠ Could not download {package}")

        # Test NLTK components
        print("\n=== Testing NLTK Components ===")
        from nltk.tokenize import sent_tokenize, word_tokenize
        from nltk.tag import pos_tag
        from nltk.corpus import stopwords

        sent_tokenize("This is a test.")
        word_tokenize("This is a test.")
        pos_tag(["test", "this", "is", "a", "sentence"])
        stopwords.words('english')

        print("\n=== NLTK Setup Completed Successfully ===\n")
        return True

    except Exception as e:
        print(f"\n⚠ Error during NLTK setup: {str(e)}")
        print("\nPlease try running these commands manually in a Python shell:")
        print("import nltk")
        for package, _ in packages:
            print(f"nltk.download('{package}')")
        return False
|
| 54 |
+
|
| 55 |
+
# Script entry point: set up the NLTK data only when this file is run
# directly, not when it is imported by another module.
if __name__ == "__main__":
    setup_nltk()
|
option_generator.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import nltk
|
| 3 |
+
from nltk.corpus import wordnet
|
| 4 |
+
from nltk.tokenize import word_tokenize
|
| 5 |
+
from nltk.tag import pos_tag
|
| 6 |
+
|
| 7 |
+
class OptionGenerator:
    """Create multiple-choice options (one correct answer plus distractors).

    Distractors are taken, in order, from WordNet lemmas related to the answer,
    from caller-supplied document keywords, and finally from a fixed list of
    generic fillers.
    """

    # (question prefix, verb to locate in the context) pairs used by
    # extract_answer_from_context.
    _ANSWER_PATTERNS = (
        ('what is', 'is'),
        ('what are', 'are'),
        ('what was', 'was'),
        ('what were', 'were'),
        ('who is', 'is'),
        ('who are', 'are'),
        ('who was', 'was'),
        ('who were', 'were'),
        ('where is', 'is'),
        ('where are', 'are'),
        ('when is', 'is'),
        ('when was', 'was'),
    )

    # Fillers used when neither WordNet nor the keywords give enough options.
    _GENERIC_DISTRACTORS = (
        'True', 'False', 'Yes', 'No', 'Maybe', 'Always', 'Never',
        'Sometimes', 'Often', 'Rarely', 'All of the above', 'None of the above',
    )

    # Punctuation trimmed from the edges of context words.
    _PUNCTUATION = ',.!?;:'

    def __init__(self):
        """Initialize the option generator with NLTK resources.

        Raises:
            Exception: Whatever NLTK raised while loading resources; it is
                printed and then re-raised.
        """
        try:
            # Download required NLTK data with explicit resource names
            for resource in (
                'punkt',
                'averaged_perceptron_tagger',
                'wordnet',
                'stopwords',
                'universal_tagset',
                'tagsets',
            ):
                nltk.download(resource, quiet=True)

            self.stop_words = set(nltk.corpus.stopwords.words('english'))
            self.word_net_lemmatizer = nltk.WordNetLemmatizer()

            # POS tag mapping for WordNet
            self.pos_mapping = {
                'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
                'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
                'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
                'RB': 'r', 'RBR': 'r', 'RBS': 'r',
            }

        except Exception as e:
            print(f"Error initializing OptionGenerator: {str(e)}")
            raise

    @staticmethod
    def _placeholder_mcq():
        """Return the fixed fallback options used when nothing can be generated."""
        return {
            'options': ['Option A', 'Option B', 'Option C', 'Option D'],
            'correct_index': 0,
            'correct_answer': 'Option A',
        }

    @staticmethod
    def _single_word_lemmas(lookup, exclude, wordnet_pos=None):
        """Collect single-word lemma names for ``lookup``, skipping ``exclude``.

        Names are lower-cased, de-duplicated and returned in the order WordNet
        yields them, so repeated runs give the same result.
        """
        target = exclude.lower()
        found = {}  # used as an insertion-ordered set
        for syn in wordnet.synsets(lookup, pos=wordnet_pos):
            for lemma in syn.lemmas():
                name = lemma.name().replace('_', ' ').lower()
                if name != target and len(name.split()) == 1:
                    found[name] = None
        return list(found)

    @staticmethod
    def _split_on_word(text, word):
        """Split ``text`` around the first whole-word occurrence of ``word``.

        Returns:
            tuple[str, str] | None: Text before and after the word, or None
            when the word does not occur as a standalone token.
        """
        tokens = text.split()
        for idx, token in enumerate(tokens):
            if token.strip(',.!?;:"\'()').lower() == word:
                return ' '.join(tokens[:idx]), ' '.join(tokens[idx + 1:])
        return None

    def _get_synonyms(self, word, pos=None):
        """Get synonyms for a word using WordNet.

        Args:
            word (str): Word to look up.
            pos (str, optional): Penn Treebank tag used to narrow the lookup.

        Returns:
            list[str]: At most 10 lower-cased single-word synonyms; empty for
            short words, stop words, or when the lookup fails.
        """
        # Skip if word is too short or a stop word
        if len(word) < 3 or word.lower() in self.stop_words:
            return []

        synonyms = []
        try:
            wordnet_pos = self.pos_mapping.get(pos) if pos else None

            # Try with the provided POS tag first
            if wordnet_pos:
                synonyms = self._single_word_lemmas(word, word, wordnet_pos)

            # If no synonyms found, try without POS tag
            if not synonyms:
                synonyms = self._single_word_lemmas(word, word)

            # If still no synonyms, try with lemmatization
            if not synonyms and pos and pos.startswith('VB'):
                base = self.word_net_lemmatizer.lemmatize(word, pos='v')
                if base != word:
                    synonyms = self._single_word_lemmas(base, word, 'v')

        except Exception as e:
            print(f"Error getting synonyms for '{word}': {str(e)}")

        return synonyms[:10]  # Return at most 10 synonyms

    def _get_distractors(self, word, pos=None, num=3):
        """Generate distractors for a given word.

        Args:
            word (str): The correct answer.
            pos (str, optional): Penn Treebank tag of ``word``.
            num (int): Maximum number of distractors to return.

        Returns:
            list[str]: Up to ``num`` lower-cased candidates, never equal to
            ``word`` (compared case-insensitively).
        """
        # NOTE(review): candidates are WordNet synonyms of the answer, which
        # may themselves be acceptable answers - confirm this is intended.
        target = word.lower()
        distractors = []

        try:
            # Get synonyms first
            distractors.extend(self._get_synonyms(word, pos)[:num])

            # If not enough synonyms, add similar words (multi-word lemmas allowed)
            if len(distractors) < num:
                wordnet_pos = self.pos_mapping.get(pos) if pos else None
                for syn in wordnet.synsets(word, pos=wordnet_pos):
                    for lemma in syn.lemmas():
                        candidate = lemma.name().replace('_', ' ').lower()
                        if candidate != target and candidate not in distractors:
                            distractors.append(candidate)
                    if len(distractors) >= num:
                        break
        except Exception as e:
            print(f"Error generating distractors for '{word}': {str(e)}")

        return distractors[:num]

    def extract_answer_from_context(self, question, context):
        """
        Extract the most likely answer from the context based on the question.
        This version uses simple string matching instead of POS tagging.

        For "what/who/where/when is|are|was|were ..." questions the context is
        split at the first whole-word occurrence of the verb. The phrase after
        the verb is the answer unless it merely repeats the question, in which
        case the phrase before the verb is used. Otherwise progressively
        weaker word-level fallbacks are tried.

        Args:
            question (str): Generated question
            context (str): Source sentence/context

        Returns:
            str: Extracted answer, or "Unknown" when the context has no words
        """
        q_lower = question.lower() if isinstance(question, str) else ""
        if not isinstance(context, str) or not context.strip():
            return "Unknown"

        # Try to find a direct answer using common patterns
        for q_pattern, verb in self._ANSWER_PATTERNS:
            if not q_lower.startswith(q_pattern):
                continue
            # Whole-word match only: a plain substring search would find the
            # "is" inside words such as "Paris" or "this".
            parts = self._split_on_word(context, verb)
            if parts is None:
                continue
            before, after = parts
            candidate = after.strip(' ,.?!').split(',')[0].split('.')[0].strip()
            subject = before.split(',')[-1].strip(' ,.?!')
            if candidate and (candidate.lower() not in q_lower or not subject):
                return candidate
            if subject:
                # The text after the verb only restates the question, so the
                # answer is what precedes the verb.
                return subject

        words = [w.strip(self._PUNCTUATION) for w in context.split()]
        words = [w for w in words if w]

        # Fallback: first capitalized word not in the question
        for word in words:
            if (len(word) > 2 and word[0].isupper() and
                    word.lower() not in q_lower and
                    word.lower() not in self.stop_words):
                return word

        # Last resort: the first longer word not in the question
        for word in words:
            if len(word) > 3 and word.lower() not in q_lower and word.lower() not in self.stop_words:
                return word

        # If all else fails, return the first word that's not a stop word
        for word in words:
            if word.lower() not in self.stop_words and len(word) > 2:
                return word

        # Final fallback
        first_tokens = context.split()
        return first_tokens[0] if first_tokens else "Unknown"

    def create_mcq_options(self, question, context, num_options=4, correct_answer=None, global_keywords=None):
        """
        Create multiple choice options for a given question and context.

        Args:
            question (str): The question text
            context (str): The context from which the question was generated
            num_options (int): Number of options to generate (including correct answer)
            correct_answer (str, optional): The correct answer if known
            global_keywords (list, optional): List of keywords from the entire document to use as distractors

        Returns:
            dict: Dictionary containing options and correct index. Placeholder
            options are returned when no answer can be determined or an
            unexpected error occurs.
        """
        try:
            # Extract the correct answer from context if not provided
            if not correct_answer:
                correct_answer = self.extract_answer_from_context(question, context)

            # If we couldn't extract a good answer, use a fallback
            if not correct_answer or correct_answer == "Unknown":
                return self._placeholder_mcq()

            needed = max(num_options - 1, 0)
            seen = {correct_answer.lower()}
            distractors = []

            def add(candidate):
                # Case-insensitive de-duplication keeps the answer itself and
                # case variants of one word from appearing as distractors.
                if not isinstance(candidate, str):
                    return
                key = candidate.lower()
                if key and key not in seen:
                    seen.add(key)
                    distractors.append(candidate)

            # Generate distractors (more than needed, to allow for filtering)
            for candidate in self._get_distractors(correct_answer, num=min(10, num_options * 2)):
                add(candidate)

            # If we don't have enough distractors, try using global keywords
            if len(distractors) < needed and global_keywords:
                pool = list(global_keywords)  # copy: the caller's list is not shuffled
                random.shuffle(pool)
                for kw in pool:
                    add(kw)
                    if len(distractors) >= num_options + 2:  # Get a few extra
                        break

            # If we still don't have enough distractors, add some generic ones
            for generic in self._GENERIC_DISTRACTORS:
                if len(distractors) >= needed:
                    break
                add(generic)

            # Select the final set of options
            options = [correct_answer] + distractors[:needed]
            random.shuffle(options)

            return {
                'options': options,
                'correct_index': options.index(correct_answer),
                'correct_answer': correct_answer,
            }

        except Exception as e:
            print(f"Error generating options: {str(e)}")
            # Fallback options
            return self._placeholder_mcq()
|
| 257 |
+
|
| 258 |
+
# Example usage: build options for one sample question and print them.
if __name__ == "__main__":
    og = OptionGenerator()

    test_question = "What is the capital of France?"
    test_context = "Paris is the capital of France, known for its art, fashion, and culture."

    print(f"Question: {test_question}")
    print(f"Context: {test_context}")

    mcq = og.create_mcq_options(test_question, test_context)
    answer_position = mcq['correct_index']

    print("\nOptions:")
    for position, choice in enumerate(mcq['options']):
        # The correct option is flagged with a check mark; labels run A, B, C, ...
        if position == answer_position:
            marker = "✓"
        else:
            marker = " "
        label = chr(65 + position)
        print(f"{marker} {label}. {choice}")
|
question_generator.py
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import nltk
|
| 3 |
+
import logging
|
| 4 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 5 |
+
from nltk.corpus import stopwords
|
| 6 |
+
from nltk.tag import pos_tag
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import pipeline
|
| 9 |
+
|
| 10 |
+
# Module-level logger; messages are emitted under this module's name.
logger = logging.getLogger(__name__)
# Configure logging: root logger at INFO so the progress messages are shown.
logging.basicConfig(level=logging.INFO)
|
| 13 |
+
|
| 14 |
+
# NLTK data setup
|
| 15 |
+
def setup_nltk():
    """Ensure the required NLTK data is available, downloading what is missing.

    Every resource is looked up individually and only the missing ones are
    downloaded.

    Returns:
        bool: True when all resources are present or were downloaded
        successfully, False when any download failed.
    """
    # (download id, lookup path). 'punkt_tab' and
    # 'averaged_perceptron_tagger_eng' are the resource names newer NLTK
    # releases look up for tokenizing and tagging.
    resources = (
        ('punkt', 'tokenizers/punkt'),
        ('punkt_tab', 'tokenizers/punkt_tab'),
        ('stopwords', 'corpora/stopwords'),
        ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger'),
        ('averaged_perceptron_tagger_eng', 'taggers/averaged_perceptron_tagger_eng'),
        ('wordnet', 'corpora/wordnet'),
        ('omw-1.4', 'corpora/omw-1.4'),
    )

    missing = []
    for package, path in resources:
        try:
            nltk.data.find(path)
        except LookupError:
            missing.append(package)

    if not missing:
        logger.info("NLTK data is already set up.")
        return True

    logger.info("Downloading required NLTK data...")
    try:
        # nltk.download reports failure through its return value, so a falsy
        # result must not be treated as success.
        failed = [package for package in missing if not nltk.download(package, quiet=True)]
    except Exception as e:
        logger.error(f"Error downloading NLTK data: {str(e)}")
        return False

    if failed:
        logger.error(f"Error downloading NLTK data: {', '.join(failed)}")
        return False

    logger.info("NLTK data downloaded successfully.")
    return True
|
| 38 |
+
|
| 39 |
+
# Initialize NLTK at import time. A failed setup is only logged, so importing
# this module still succeeds when the data cannot be fetched.
if not setup_nltk():
    logger.warning("NLTK data not available. Some features may not work properly.")
|
| 42 |
+
|
| 43 |
+
class QuestionGenerator:
|
| 44 |
+
def __init__(self, model_name="valhalla/t5-small-qa-qg-hl", use_transformers=False):
    """Set up rule-based generation and, optionally, a transformer pipeline.

    Args:
        model_name (str): Checkpoint handed to the ``text2text-generation``
            pipeline.
        use_transformers (bool): When True, try to load the pipeline. If
            loading fails the flag is reset to False and only the rule-based
            generator is used.
    """
    self.use_transformers = use_transformers
    self.stop_words = set(stopwords.words('english'))
    if torch.cuda.is_available():
        self.device = 'cuda'
    else:
        self.device = 'cpu'

    # The rule-based system is always available as a fallback.
    self._init_rule_based_system()

    if not use_transformers:
        return

    try:
        logger.info("Loading transformer model...")
        # The pipeline takes a CUDA device index, or -1 for the CPU.
        pipeline_device = 0 if self.device == 'cuda' else -1
        self.qg_model = pipeline(
            "text2text-generation",
            model=model_name,
            device=pipeline_device,
        )
        logger.info("Transformer model loaded successfully.")
    except Exception as e:
        logger.error(f"Error loading transformer model: {str(e)}")
        self.use_transformers = False
        logger.info("Falling back to rule-based generation.")
|
| 67 |
+
|
| 68 |
+
def _init_rule_based_system(self):
|
| 69 |
+
"""Initialize the rule-based question generation system."""
|
| 70 |
+
self.wh_words = ['what', 'when', 'where', 'who', 'whom', 'whose', 'which', 'why', 'how']
|
| 71 |
+
self.aux_verbs = ['is', 'are', 'was', 'were', 'do', 'does', 'did', 'have', 'has', 'had',
|
| 72 |
+
'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must']
|
| 73 |
+
self.common_nouns = {'time', 'year', 'people', 'way', 'day', 'man', 'thing', 'woman', 'life', 'child',
|
| 74 |
+
'world', 'school', 'state', 'family', 'student', 'group', 'country', 'problem'}
|
| 75 |
+
|
| 76 |
+
def _is_good_sentence(self, sentence):
|
| 77 |
+
"""Check if a sentence is suitable for question generation."""
|
| 78 |
+
try:
|
| 79 |
+
if not sentence or not isinstance(sentence, str):
|
| 80 |
+
return False
|
| 81 |
+
|
| 82 |
+
# Basic length checks
|
| 83 |
+
words = word_tokenize(sentence)
|
| 84 |
+
if len(words) < 4: # At least 4 words
|
| 85 |
+
return False
|
| 86 |
+
|
| 87 |
+
# Check for question mark
|
| 88 |
+
if '?' in sentence:
|
| 89 |
+
return False
|
| 90 |
+
|
| 91 |
+
# Check for proper sentence ending
|
| 92 |
+
if not sentence.strip().endswith(('.', '!', ';', ':')):
|
| 93 |
+
return False
|
| 94 |
+
|
| 95 |
+
# Check for at least one noun and one verb
|
| 96 |
+
pos_tags = pos_tag(words)
|
| 97 |
+
has_noun = any(tag.startswith('NN') for word, tag in pos_tags)
|
| 98 |
+
has_verb = any(tag.startswith('VB') for word, tag in pos_tags)
|
| 99 |
+
|
| 100 |
+
return has_noun and has_verb
|
| 101 |
+
|
| 102 |
+
except Exception as e:
|
| 103 |
+
logger.error(f"Error in _is_good_sentence: {str(e)}")
|
| 104 |
+
return False
|
| 105 |
+
|
| 106 |
+
def _generate_question_what_is(self, words, pos_tags):
|
| 107 |
+
"""Generate 'What is...?' questions."""
|
| 108 |
+
for i, (word, tag) in enumerate(pos_tags):
|
| 109 |
+
if tag.startswith('NN'):
|
| 110 |
+
return f"What is {word}?"
|
| 111 |
+
return ""
|
| 112 |
+
|
| 113 |
+
def _generate_question_verb_subject(self, words, pos_tags):
|
| 114 |
+
"""Generate questions by inverting subject and verb."""
|
| 115 |
+
for i, (word, tag) in enumerate(pos_tags):
|
| 116 |
+
if tag.startswith('VB') and i > 0:
|
| 117 |
+
subject = ' '.join(words[:i])
|
| 118 |
+
verb = word
|
| 119 |
+
rest = ' '.join(words[i+1:])
|
| 120 |
+
return f"{verb.capitalize()} {subject} {rest}?"
|
| 121 |
+
return ""
|
| 122 |
+
|
| 123 |
+
def _generate_question_wh_word(self, words, pos_tags):
|
| 124 |
+
"""Generate questions using WH-words."""
|
| 125 |
+
for i, (word, tag) in enumerate(pos_tags):
|
| 126 |
+
if tag.startswith('VB') and i > 0:
|
| 127 |
+
wh_word = "What"
|
| 128 |
+
if i > 0 and pos_tags[i-1][1].startswith('NNP'):
|
| 129 |
+
wh_word = "Who"
|
| 130 |
+
return f"{wh_word} {word} {' '.join(words[:i])}?"
|
| 131 |
+
return ""
|
| 132 |
+
|
| 133 |
+
def _generate_question_from_statement(self, sentence):
|
| 134 |
+
"""Generate a question from a statement using multiple strategies."""
|
| 135 |
+
try:
|
| 136 |
+
if not sentence or not isinstance(sentence, str):
|
| 137 |
+
return ""
|
| 138 |
+
|
| 139 |
+
# Clean the sentence
|
| 140 |
+
sentence = sentence.strip()
|
| 141 |
+
if sentence.endswith('.'):
|
| 142 |
+
sentence = sentence[:-1].strip()
|
| 143 |
+
|
| 144 |
+
words = word_tokenize(sentence)
|
| 145 |
+
if len(words) < 4: # Too short for a good question
|
| 146 |
+
return ""
|
| 147 |
+
|
| 148 |
+
pos_tags = pos_tag(words)
|
| 149 |
+
|
| 150 |
+
# Try different question generation strategies
|
| 151 |
+
strategies = [
|
| 152 |
+
self._generate_question_what_is,
|
| 153 |
+
self._generate_question_verb_subject,
|
| 154 |
+
self._generate_question_wh_word
|
| 155 |
+
]
|
| 156 |
+
|
| 157 |
+
for strategy in strategies:
|
| 158 |
+
question = strategy(words, pos_tags)
|
| 159 |
+
if question:
|
| 160 |
+
return question
|
| 161 |
+
|
| 162 |
+
# Fallback: ask about the whole sentence
|
| 163 |
+
return f"Can you explain: {sentence[:100]}...?"
|
| 164 |
+
|
| 165 |
+
except Exception as e:
|
| 166 |
+
logger.error(f"Error in _generate_question_from_statement: {str(e)}")
|
| 167 |
+
return ""
|
| 168 |
+
|
| 169 |
+
def generate_question_from_sentence(self, sentence):
    """Generate a question from a given sentence.

    The transformer pipeline is used when it is enabled and loaded. If it
    fails or produces only blank text, the rule-based generator is used, and
    finally a generic "main point" prompt.

    Args:
        sentence (str): Candidate source sentence.

    Returns:
        str: The generated question, or "" when the sentence is rejected by
        ``_is_good_sentence`` or an error occurs (the error is logged).
    """
    if not self._is_good_sentence(sentence):
        return ""

    try:
        # Use transformer model if available
        if self.use_transformers and hasattr(self, 'qg_model'):
            try:
                # Prepare input for e2e model
                input_text = f"generate questions: {sentence}"
                outputs = self.qg_model(input_text)
                if outputs:
                    generated_text = outputs[0]['generated_text']
                    # The model might generate multiple questions separated
                    # by <sep>; take the first one that is not blank so empty
                    # output falls through to the rule-based path.
                    for candidate in generated_text.split('<sep>'):
                        candidate = candidate.strip()
                        if candidate:
                            return candidate
            except Exception as e:
                logger.error(f"Transformer generation failed: {e}")
                # Fallback to rule-based

        # Rule-based generation
        question = self._generate_question_from_statement(sentence)
        if question:
            return question

        # Fallback to simple question generation
        words = word_tokenize(sentence)
        if len(words) < 4:
            return ""

        # Try to make a simple question
        return f"What is the main point about: {sentence[:100]}...?"

    except Exception as e:
        logger.error(f"Error generating question: {str(e)}")
        return ""
|
| 207 |
+
|
| 208 |
+
def _score_sentence(self, sentence):
|
| 209 |
+
"""Score a sentence based on its quality for question generation."""
|
| 210 |
+
try:
|
| 211 |
+
if not self._is_good_sentence(sentence):
|
| 212 |
+
return 0
|
| 213 |
+
|
| 214 |
+
words = word_tokenize(sentence)
|
| 215 |
+
pos_tags = pos_tag(words)
|
| 216 |
+
|
| 217 |
+
# Start with base score
|
| 218 |
+
score = 1.0
|
| 219 |
+
|
| 220 |
+
# Check for content words
|
| 221 |
+
has_noun = any(tag.startswith('NN') for _, tag in pos_tags)
|
| 222 |
+
has_verb = any(tag.startswith('VB') for _, tag in pos_tags)
|
| 223 |
+
has_adj = any(tag.startswith('JJ') for _, tag in pos_tags)
|
| 224 |
+
|
| 225 |
+
# Increase score based on content
|
| 226 |
+
if has_noun and has_verb:
|
| 227 |
+
score += 2.0
|
| 228 |
+
elif has_noun or has_verb:
|
| 229 |
+
score += 1.0
|
| 230 |
+
|
| 231 |
+
if has_adj:
|
| 232 |
+
score += 0.5
|
| 233 |
+
|
| 234 |
+
# Adjust for sentence length
|
| 235 |
+
word_count = len(words)
|
| 236 |
+
if 8 <= word_count <= 25: # Ideal length
|
| 237 |
+
score += 1.0
|
| 238 |
+
|
| 239 |
+
# Bonus for proper nouns or numbers
|
| 240 |
+
if any(tag in {'NNP', 'NNPS', 'CD'} for _, tag in pos_tags):
|
| 241 |
+
score += 1.0
|
| 242 |
+
|
| 243 |
+
return max(0.5, score) # Ensure minimum score
|
| 244 |
+
|
| 245 |
+
except Exception as e:
|
| 246 |
+
logger.error(f"Error in _score_sentence: {str(e)}")
|
| 247 |
+
return 0.5
|
| 248 |
+
|
| 249 |
+
def generate_questions(self, text, num_questions=5):
    """Split ``text`` into sentences and generate questions from them.

    Returns an empty list for empty/blank text or when generation raises;
    otherwise whatever ``generate_multiple_questions`` returns for the
    sentence list and ``num_questions``.
    """
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        logger.warning("Empty text provided for question generation")
        return []

    try:
        # Sentence-split, then delegate to the list-based generator
        return self.generate_multiple_questions(sent_tokenize(text), num_questions)
    except Exception as e:
        logger.error(f"Error in generate_questions: {str(e)}")
        return []
|
| 263 |
+
|
| 264 |
+
def generate_multiple_questions(self, inputs, max_questions=5):
    """
    Generate up to ``max_questions`` questions from a list of inputs.

    Args:
        inputs: List of dicts {'context': str, 'answer': str} or list of strings
        max_questions: Maximum number of questions to generate

    Returns:
        List of dicts with keys 'question', 'context', 'score' and 'type'
        (plus 'correct_answer' when the input supplied an answer). If fewer
        than ``max_questions`` could be generated, the list is padded with
        generic "Explain the significance of: ..." entries built from
        randomly chosen inputs. Returns [] for empty inputs or a
        non-positive ``max_questions``.
    """
    import random  # function-scope import, hoisted out of the padding loop

    if not inputs or max_questions <= 0:
        logger.warning("No inputs provided or invalid max_questions")
        return []

    questions = []
    used_contexts = set()
    seen_questions = set()  # O(1) duplicate check instead of rebuilding a list

    logger.info(f"Generating up to {max_questions} questions from {len(inputs)} inputs")

    for item in inputs:
        try:
            if len(questions) >= max_questions:
                break

            # Handle different input types
            if isinstance(item, dict):
                context = item.get('context', '')
                answer = item.get('answer')
            else:
                context = str(item)
                answer = None

            if not context or not context.strip():
                continue

            context = context.strip()

            # Skip if we've already used this context
            if context in used_contexts:
                continue

            question_text = ""

            # Use transformer model if available
            if self.use_transformers and hasattr(self, 'qg_model'):
                try:
                    if answer:
                        input_text = f"answer: {answer} context: {context}"
                    else:
                        input_text = f"generate questions: {context}"

                    outputs = self.qg_model(input_text)
                    if outputs:
                        generated_text = outputs[0]['generated_text']
                        # Keep only the first non-blank <sep>-separated
                        # question, consistent with
                        # generate_question_from_sentence.
                        question_text = next(
                            (part.strip()
                             for part in generated_text.split('<sep>')
                             if part.strip()),
                            "",
                        )
                except Exception as e:
                    logger.error(f"Transformer generation failed: {e}")

            # Fallback to rule-based if transformer failed or not available
            if not question_text:
                question_text = self.generate_question_from_sentence(context)

            if question_text and question_text not in seen_questions:
                q_data = {
                    'question': question_text,
                    'context': context,
                    'score': 1.0,
                    'type': 'short_answer'
                }
                # If we have a known answer, use it for options later
                if answer:
                    q_data['correct_answer'] = answer

                questions.append(q_data)
                seen_questions.add(question_text)
                used_contexts.add(context)

        except Exception as e:
            logger.error(f"Error processing input: {str(e)}")
            continue

    # If we still don't have enough questions, create simple ones
    if len(questions) < max_questions:
        logger.info(f"Creating simple questions to reach {max_questions} total")
        default_context = "General knowledge about the topic"
        for _ in range(len(questions), max_questions):
            # Pick a random input to generate a question from
            item = random.choice(inputs)
            raw_context = item.get('context') if isinstance(item, dict) else item

            # A missing, None or blank context would crash .split() below or
            # yield an empty prompt, so fall back to the generic context.
            fallback_context = "" if raw_context is None else str(raw_context).strip()
            if not fallback_context:
                fallback_context = default_context

            # Create a more specific fallback question
            words = fallback_context.split()
            topic_snippet = " ".join(words[:5]) + "..." if len(words) > 5 else fallback_context

            questions.append({
                'question': f"Explain the significance of: {topic_snippet}",
                'context': fallback_context,
                'score': 0.5,
                'type': 'short_answer'
            })

    logger.info(f"Successfully generated {len(questions)} questions")
    return questions[:max_questions]
|
| 372 |
+
|
| 373 |
+
# Example usage
|
| 374 |
+
if __name__ == "__main__":
    # Manual smoke test: run the rule-based generator on a short passage
    # and print each question with a preview of its context.
    generator = QuestionGenerator(use_transformers=False)

    sample_text = """
    Machine learning is a branch of artificial intelligence that focuses on building systems
    that learn from data. These systems can improve their performance over time without being
    explicitly programmed. There are three main types of machine learning: supervised learning,
    unsupervised learning, and reinforcement learning.
    """

    print("\nGenerating questions...")
    generated = generator.generate_questions(sample_text, 3)

    print("\nGenerated Questions:")
    for number, entry in enumerate(generated, start=1):
        question_text = entry.get('question', 'No question generated')
        context_preview = entry.get('context', 'No context')[:100]
        print(f"{number}. {question_text}")
        print(f" Context: {context_preview}...")
        print()
|
question_generator_old.py
ADDED
|
@@ -0,0 +1,600 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import random
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import nltk
|
| 6 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 7 |
+
from nltk.corpus import stopwords
|
| 8 |
+
from nltk.probability import FreqDist
|
| 9 |
+
from nltk.tag import pos_tag
|
| 10 |
+
from collections import defaultdict
|
| 11 |
+
import torch
|
| 12 |
+
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
# Simple NLTK data setup
|
| 16 |
+
def setup_nltk():
    """Check that the NLTK resources used by this module work, downloading them if not.

    Returns:
        bool: True when the tokenizers, POS tagger and stopword list are
        usable, or when every missing package was downloaded successfully;
        False when any download fails.
    """
    try:
        # Exercising the components raises LookupError when data is missing
        sent_tokenize("Test")
        word_tokenize("Test")
        pos_tag(["test"])
        stopwords.words('english')
        return True
    except LookupError:
        packages = ('punkt', 'stopwords', 'averaged_perceptron_tagger',
                    'wordnet', 'omw-1.4')
        try:
            import nltk
            # nltk.download() signals failure through its boolean return
            # value, so collect every result instead of assuming success.
            # All packages are attempted even if an early one fails.
            results = [nltk.download(package, quiet=True) for package in packages]
        except Exception:
            return False
        return all(results)

# Initialize NLTK
if not setup_nltk():
    print("Warning: Could not initialize NLTK. Some features may not work properly.")
|
| 39 |
+
|
| 40 |
+
# Set up NLTK data path
|
| 41 |
+
def setup_nltk():
    """Ensure the NLTK data lives in ``./nltk_data`` and the NLTK components work.

    Creates ``nltk_data`` under the current working directory, registers it
    on ``nltk.data.path``, downloads any missing resource into it, then
    exercises the tokenizers, POS tagger and stopword list.

    Returns:
        bool: True when every component ran without error, False otherwise
        (after printing manual download instructions).
    """
    # (resource path for nltk.data.find, downloader package id,
    #  subject of the "already available" message, label for download messages)
    required = (
        ('tokenizers/punkt', 'punkt', "punkt tokenizer is", "punkt tokenizer"),
        ('corpora/stopwords', 'stopwords', "Stopwords are", "stopwords"),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger',
         "POS tagger is", "POS tagger"),
        ('corpora/wordnet', 'wordnet', "WordNet is", "WordNet"),
        ('corpora/omw-1.4', 'omw-1.4', "OMW-1.4 is", "OMW-1.4"),
    )

    try:
        # Set NLTK data path to a local directory
        nltk_data = os.path.join(os.getcwd(), 'nltk_data')
        os.makedirs(nltk_data, exist_ok=True)
        if nltk_data not in nltk.data.path:  # avoid duplicates on repeated calls
            nltk.data.path.append(nltk_data)

        # Download required NLTK data
        print("\n=== Downloading NLTK Data ===")
        for resource, package, available_subject, label in required:
            try:
                nltk.data.find(resource)
                print(f"✓ {available_subject} already available")
            except LookupError:
                print(f"Downloading {label}...")
                # nltk.download() returns False on failure rather than
                # raising, so only report success when it really succeeded.
                if nltk.download(package, download_dir=nltk_data):
                    print(f"✓ Downloaded {label}")
                else:
                    print(f"⚠ Failed to download {label}")

        # Test NLTK components; a missing resource raises here and is
        # reported by the handler below.
        print("\n=== Testing NLTK Components ===")
        sent_tokenize("This is a test.")
        word_tokenize("This is a test.")
        pos_tag(["test", "this", "is", "a", "sentence"])
        stopwords.words('english')

        print("\n=== NLTK Setup Completed Successfully ===\n")
        return True

    except Exception as e:
        print(f"\n⚠ Error during NLTK setup: {str(e)}")
        print("\nPlease try running these commands manually in a Python shell:")
        print("import nltk")
        for _resource, package, _subject, _label in required:
            print(f"nltk.download('{package}')")
        print()
        return False
|
| 116 |
+
|
| 117 |
+
# Initialize NLTK
|
| 118 |
+
if not setup_nltk():
    print("Failed to initialize NLTK. Some features may not work properly.")
    print("Trying to continue with limited functionality...\n")


def download_nltk_data():
    """Download any missing NLTK package into ``./nltk_data`` and verify it.

    NOTE(review): only the loop body of this function survived in the file
    (it referenced ``package``, ``path`` and ``nltk_data`` without defining
    them), while ``download_nltk_data()`` was still being called below. The
    signature, data directory and package table are reconstructed from
    ``setup_nltk`` - confirm against the original history.
    """
    nltk_data = os.path.join(os.getcwd(), 'nltk_data')
    packages = (
        ('punkt', 'tokenizers/punkt'),
        ('stopwords', 'corpora/stopwords'),
        ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger'),
        ('wordnet', 'corpora/wordnet'),
        ('omw-1.4', 'corpora/omw-1.4'),
    )

    for package, path in packages:
        try:
            nltk.data.find(path)
            print(f"✓ {package} is already downloaded")
        except LookupError:
            print(f"Downloading {package}...")
            try:
                nltk.download(package, download_dir=nltk_data, quiet=False)
                # Verify download
                try:
                    nltk.data.find(path)
                    print(f"✓ Successfully downloaded {package}")
                except LookupError:
                    print(f"⚠ Warning: {package} download verification failed")
            except Exception as e:
                print(f"⚠ Error downloading {package}: {str(e)}")
                if package == 'averaged_perceptron_tagger':
                    print("⚠ This is a critical package. The application may not work properly.")

    print("\n=== NLTK Data Setup Complete ===\n")


# Initialize NLTK data
download_nltk_data()

# Initialize NLTK components
try:
    # Initialize tokenizers
    sent_tokenize("Initializing...")
    word_tokenize("Initializing...")

    # Initialize POS tagger
    from nltk.tag import pos_tag
    pos_tag(["test"])

    # Initialize stopwords
    stopwords.words('english')

    print("✓ NLTK components initialized successfully")
except Exception as e:
    print(f"⚠ Error initializing NLTK components: {str(e)}")
|
| 159 |
+
|
| 160 |
+
class QuestionGenerator:
|
| 161 |
+
def __init__(self, model_name="deepset/roberta-base-squad2", use_transformers=True):
    """
    Initialize the question generator with improved context understanding.

    Args:
        model_name (str): Name of the pre-trained model to use. Not read by
            this constructor: the pipeline below always loads
            "valhalla/t5-base-qa-qg-hl". Kept so existing callers still work.
        use_transformers (bool): Whether to use transformer models for better quality.
            If loading the model fails, the instance falls back to
            rule-based generation.
    """
    print("Initializing question generator with enhanced context understanding...")
    self.use_transformers = use_transformers
    self.stop_words = set(stopwords.words('english'))
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # _generate_with_transformers reads these attributes; define them up
    # front so that reading them never raises AttributeError.
    self.model = None
    self.tokenizer = None

    # The rule-based tables are needed in both modes: _extract_key_phrases
    # reads self.important_pos_tags and is reached from generate_questions
    # even when the transformer pipeline is active.
    self._init_rule_based_system()

    if use_transformers:
        try:
            print("Loading question generation model...")
            self.qg_model = pipeline("text2text-generation",
                                     model="valhalla/t5-base-qa-qg-hl",
                                     device=0 if self.device == 'cuda' else -1)
            print("Question generation model loaded successfully!")
        except Exception as e:
            print(f"Error loading transformer model: {str(e)}")
            print("Falling back to rule-based generation.")
            self.use_transformers = False

    if not self.use_transformers:
        print("Using rule-based question generation.")

    print("Question generator initialized successfully!")
|
| 191 |
+
|
| 192 |
+
def _init_rule_based_system(self):
|
| 193 |
+
"""Initialize the rule-based question generation system."""
|
| 194 |
+
self.wh_words = ['what', 'when', 'where', 'who', 'whom', 'whose', 'which', 'why', 'how']
|
| 195 |
+
self.aux_verbs = ['is', 'are', 'was', 'were', 'do', 'does', 'did', 'have', 'has', 'had', 'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must']
|
| 196 |
+
self.important_pos_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VBG', 'VBN', 'JJ', 'JJR', 'JJS'}
|
| 197 |
+
|
| 198 |
+
def _extract_key_phrases(self, text):
    """Extract multi-word key phrases from text based on POS tagging.

    A phrase is a run of at least two consecutive tokens whose tag is in
    ``self.important_pos_tags``; tokens are lower-cased and joined with
    single spaces.

    Args:
        text: Text to analyse.

    Returns:
        list: Unique phrases in order of first appearance.
    """
    # The tag set is created by _init_rule_based_system; make sure it exists
    # even if the instance was set up without it.
    if not hasattr(self, 'important_pos_tags'):
        self._init_rule_based_system()

    tagged = nltk.pos_tag(word_tokenize(text))

    key_phrases = []
    current_phrase = []

    def flush():
        # Only consider phrases with at least 2 words
        if len(current_phrase) > 1:
            key_phrases.append(' '.join(current_phrase))
        current_phrase.clear()

    for word, tag in tagged:
        if tag in self.important_pos_tags:
            current_phrase.append(word.lower())
        else:
            flush()
    # A phrase that runs to the very end of the text has no following token
    # to terminate it, so flush once more after the loop.
    flush()

    # Remove duplicates while keeping a deterministic order; callers slice
    # this list, so set ordering would make results vary between runs.
    return list(dict.fromkeys(key_phrases))
|
| 215 |
+
|
| 216 |
+
def generate_question_from_sentence(self, sentence):
    """Build a "What ...?" question from a sentence using POS tags.

    Walks the verbs left to right. For the first verb that has a noun
    somewhere before it, the question is "What <verb> <tokens from that
    noun up to the verb>?". Without such a pair, asks "What is <first
    noun>?", and failing that a generic question quoting the first 50
    characters of the sentence.
    """
    tagged = nltk.pos_tag(word_tokenize(sentence))
    tags = [tag for _, tag in tagged]

    # Find the main verb and the nearest noun before it
    for verb_idx, verb_tag in enumerate(tags):
        if not verb_tag.startswith('VB'):
            continue
        noun_idx = next(
            (k for k in range(verb_idx - 1, -1, -1) if tags[k].startswith('NN')),
            None,
        )
        if noun_idx is not None:
            subject = ' '.join(token for token, _ in tagged[noun_idx:verb_idx])
            return f"What {tagged[verb_idx][0]} {subject}?"

    # Fallback: ask about the first noun
    first_noun = next((token for token, tag in tagged if tag.startswith('NN')), None)
    if first_noun is not None:
        return f"What is {first_noun}?"

    # Final fallback
    return f"What is the main idea of: {sentence[:50]}...?"
|
| 239 |
+
|
| 240 |
+
def _analyze_text_structure(self, text):
    """Summarise a text for question generation.

    Returns a dict with the sentence list ('sentences'), the key phrases
    ('key_phrases'), up to ten most frequent alphanumeric non-stopword
    tokens ('top_terms') and the phrase co-occurrence map ('concept_map').
    """
    sentence_list = sent_tokenize(text)
    phrases = self._extract_key_phrases(text)

    # Frequency of lower-cased content words (alphanumeric, not stopwords)
    content_words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalnum() and lowered not in self.stop_words:
            content_words.append(lowered)
    frequencies = FreqDist(content_words)

    top_terms = [term for term, _count in frequencies.most_common(10)]
    concept_map = self._build_concept_map(sentence_list, phrases)

    return {
        'sentences': sentence_list,
        'key_phrases': phrases,
        'top_terms': top_terms,
        'concept_map': concept_map,
    }
|
| 256 |
+
|
| 257 |
+
def _build_concept_map(self, sentences, key_phrases):
|
| 258 |
+
"""Build a simple concept map showing relationships between key phrases."""
|
| 259 |
+
concept_map = defaultdict(list)
|
| 260 |
+
|
| 261 |
+
for phrase in key_phrases:
|
| 262 |
+
for sentence in sentences:
|
| 263 |
+
if phrase in sentence.lower():
|
| 264 |
+
# Find other key phrases in the same sentence
|
| 265 |
+
related = [p for p in key_phrases if p != phrase and p in sentence.lower()]
|
| 266 |
+
concept_map[phrase].extend(related)
|
| 267 |
+
|
| 268 |
+
# Remove duplicates
|
| 269 |
+
return {k: list(set(v)) for k, v in concept_map.items()}
|
| 270 |
+
try:
|
| 271 |
+
print("Attempting to load T5 model... This may take a few minutes on first run.")
|
| 272 |
+
|
| 273 |
+
from transformers import T5ForConditionalGeneration, T5Tokenizer
|
| 274 |
+
|
| 275 |
+
# Use smaller, faster model for web deployment
|
| 276 |
+
model_name = "t5-base"
|
| 277 |
+
|
| 278 |
+
print(f"Loading {model_name} model...")
|
| 279 |
+
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
|
| 280 |
+
self.model = T5ForConditionalGeneration.from_pretrained(model_name)
|
| 281 |
+
|
| 282 |
+
# Use CPU for more reliable deployment (avoid CUDA issues)
|
| 283 |
+
self.device = torch.device("cpu")
|
| 284 |
+
self.model.to(self.device)
|
| 285 |
+
self.model.eval() # Set to evaluation mode
|
| 286 |
+
|
| 287 |
+
self.use_transformers = True
|
| 288 |
+
print(f"T5 model loaded successfully on {self.device}")
|
| 289 |
+
|
| 290 |
+
except ImportError as e:
|
| 291 |
+
print(f"Transformers library not installed: {e}")
|
| 292 |
+
print("Install with: pip install transformers torch")
|
| 293 |
+
self.use_transformers = False
|
| 294 |
+
except Exception as e:
|
| 295 |
+
print(f"Failed to load T5 model: {e}")
|
| 296 |
+
print("Falling back to rule-based generation.")
|
| 297 |
+
self.use_transformers = False
|
| 298 |
+
|
| 299 |
+
def generate_questions(self, text, num_questions=5, context_window=3):
    """
    Generate meaningful questions from the given text with better context understanding.

    Questions come from up to three sources, in order, until
    ``num_questions`` is reached: the transformer pipeline (one question
    per window of ``context_window`` sentences), the rule-based generator
    (one per sentence of at least five words), and generic "Explain the
    concept of ..." prompts built from key phrases.

    Args:
        text (str): Input text to generate questions from
        num_questions (int): Number of questions to generate
        context_window (int): Number of sentences to consider as context
            (values below 1 are treated as 1)

    Returns:
        list: Dicts with 'question', 'context' and 'type' keys; empty for
        blank/None text or a non-positive ``num_questions``.
    """
    if not text or not text.strip() or num_questions <= 0:
        return []

    # Analyze the text structure first
    analysis = self._analyze_text_structure(text)
    sentences = analysis['sentences']

    if not sentences:
        return []

    questions = []
    # range() rejects a zero step, so clamp the window size
    window = max(1, context_window)

    # Generate questions using different strategies
    if self.use_transformers and hasattr(self, 'qg_model'):
        # Use transformer-based generation for better quality
        for i in range(0, len(sentences), window):
            context = ' '.join(sentences[i:i + window])
            try:
                # Generate questions for this context window
                generated = self.qg_model(context, max_length=128, num_return_sequences=1)
                if generated and len(generated) > 0:
                    question = generated[0]['generated_text'].strip()
                    if not question:
                        continue  # nothing usable from the model
                    if question[-1] != '?':
                        question += '?'
                    questions.append({
                        'question': question,
                        'context': context,
                        'type': 'comprehension'
                    })
                    if len(questions) >= num_questions:
                        break
            except Exception as e:
                print(f"Error in transformer-based generation: {str(e)}")
                continue

    # Fallback to rule-based generation if needed
    if len(questions) < num_questions:
        for i, sentence in enumerate(sentences):
            if len(sentence.split()) < 5:  # Skip very short sentences
                continue

            # The rule-based method is generate_question_from_sentence;
            # there is no underscore-prefixed variant on this class.
            question = self.generate_question_from_sentence(sentence)
            if not question:
                continue

            # Get context (previous and next sentences)
            start = max(0, i - 1)
            end = min(len(sentences), i + 2)
            context = ' '.join(sentences[start:end])

            questions.append({
                'question': question,
                'context': context,
                'type': 'factual'
            })

            if len(questions) >= num_questions:
                break

    # Ensure we have enough questions
    if len(questions) < num_questions:
        # Generate some conceptual questions based on key phrases
        for phrase in analysis['key_phrases'][:num_questions - len(questions)]:
            questions.append({
                'question': f"Explain the concept of {phrase} in detail.",
                'context': f"The concept of {phrase} is important in this context.",
                'type': 'conceptual'
            })

    return questions[:num_questions]
|
| 380 |
+
|
| 381 |
+
def _generate_with_transformers(self, sentence, max_length):
|
| 382 |
+
"""Generate question using T5 model."""
|
| 383 |
+
if not self.use_transformers or self.model is None:
|
| 384 |
+
return self._generate_with_rules(sentence)
|
| 385 |
+
|
| 386 |
+
try:
|
| 387 |
+
# Prepare input for T5 model
|
| 388 |
+
input_text = f"generate question: {sentence[:300]}" # Limit input length
|
| 389 |
+
|
| 390 |
+
# Tokenize input with error handling
|
| 391 |
+
inputs = self.tokenizer.encode(
|
| 392 |
+
input_text,
|
| 393 |
+
return_tensors="pt",
|
| 394 |
+
max_length=256, # Reduced for faster processing
|
| 395 |
+
truncation=True,
|
| 396 |
+
padding=True
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
if self.device:
|
| 400 |
+
inputs = inputs.to(self.device)
|
| 401 |
+
|
| 402 |
+
# Generate question with optimized parameters
|
| 403 |
+
with torch.no_grad():
|
| 404 |
+
outputs = self.model.generate(
|
| 405 |
+
inputs,
|
| 406 |
+
max_length=min(max_length, 64), # Increased output length
|
| 407 |
+
num_beams=4, # Increased beams for better quality
|
| 408 |
+
early_stopping=True,
|
| 409 |
+
do_sample=False, # Deterministic for consistency
|
| 410 |
+
pad_token_id=self.tokenizer.eos_token_id
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
# Decode and clean question
|
| 414 |
+
question = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 415 |
+
cleaned_question = self.clean_question(question)
|
| 416 |
+
|
| 417 |
+
# Validate the generated question
|
| 418 |
+
if len(cleaned_question) < 10 or not cleaned_question.endswith('?'):
|
| 419 |
+
print("Generated question quality low, using rule-based fallback")
|
| 420 |
+
return self._generate_with_rules(sentence)
|
| 421 |
+
|
| 422 |
+
return cleaned_question
|
| 423 |
+
|
| 424 |
+
except Exception as e:
|
| 425 |
+
print(f"Transformer generation failed: {e}")
|
| 426 |
+
print("Falling back to rule-based generation")
|
| 427 |
+
return self._generate_with_rules(sentence)
|
| 428 |
+
|
| 429 |
+
def _generate_with_rules(self, sentence):
|
| 430 |
+
"""Generate question using rule-based approach."""
|
| 431 |
+
sentence = sentence.strip()
|
| 432 |
+
words = sentence.split()
|
| 433 |
+
|
| 434 |
+
# Enhanced question templates based on sentence patterns
|
| 435 |
+
question_templates = [
|
| 436 |
+
# What questions - most common
|
| 437 |
+
(lambda s: any(word in s.lower() for word in ['is', 'are', 'means', 'refers', 'definition', 'concept']),
|
| 438 |
+
lambda s: f"What {self._extract_predicate(s)}?"),
|
| 439 |
+
|
| 440 |
+
# Define/Explain questions
|
| 441 |
+
(lambda s: any(word in s.lower() for word in ['definition', 'meaning', 'concept', 'term']),
|
| 442 |
+
lambda s: f"Define {self._extract_main_subject(s)}."),
|
| 443 |
+
|
| 444 |
+
# How questions
|
| 445 |
+
(lambda s: any(word in s.lower() for word in ['process', 'method', 'way', 'procedure', 'algorithm']),
|
| 446 |
+
lambda s: f"How {self._extract_predicate(s)}?"),
|
| 447 |
+
|
| 448 |
+
# Why questions
|
| 449 |
+
(lambda s: any(word in s.lower() for word in ['because', 'reason', 'cause', 'purpose', 'important']),
|
| 450 |
+
lambda s: f"Why {self._extract_predicate(s)}?"),
|
| 451 |
+
|
| 452 |
+
# When questions
|
| 453 |
+
(lambda s: any(word in s.lower() for word in ['year', 'century', 'time', 'date', 'period', 'era']),
|
| 454 |
+
lambda s: f"When {self._extract_predicate(s)}?"),
|
| 455 |
+
|
| 456 |
+
# Where questions
|
| 457 |
+
(lambda s: any(word in s.lower() for word in ['place', 'location', 'country', 'city', 'region']),
|
| 458 |
+
lambda s: f"Where {self._extract_predicate(s)}?"),
|
| 459 |
+
|
| 460 |
+
# Who questions
|
| 461 |
+
(lambda s: any(word in s.lower() for word in ['person', 'people', 'scientist', 'author', 'researcher']),
|
| 462 |
+
lambda s: f"Who {self._extract_predicate(s)}?"),
|
| 463 |
+
|
| 464 |
+
# How questions
|
| 465 |
+
(lambda s: any(word in s.lower() for word in ['method', 'process', 'way', 'how']),
|
| 466 |
+
lambda s: f"How {self._extract_predicate(s)}?"),
|
| 467 |
+
|
| 468 |
+
# Why questions
|
| 469 |
+
(lambda s: any(word in s.lower() for word in ['reason', 'because', 'cause', 'why']),
|
| 470 |
+
lambda s: f"Why {self._extract_predicate(s)}?"),
|
| 471 |
+
|
| 472 |
+
# Default question
|
| 473 |
+
(lambda s: True,
|
| 474 |
+
lambda s: f"What can you tell me about {self._extract_main_subject(s)}?")
|
| 475 |
+
]
|
| 476 |
+
|
| 477 |
+
# Apply first matching template
|
| 478 |
+
for condition, template in question_templates:
|
| 479 |
+
if condition(sentence):
|
| 480 |
+
try:
|
| 481 |
+
question = template(sentence)
|
| 482 |
+
return self.clean_question(question)
|
| 483 |
+
except:
|
| 484 |
+
continue
|
| 485 |
+
|
| 486 |
+
# Fallback
|
| 487 |
+
return f"What is the main point about {words[0] if words else 'this topic'}?"
|
| 488 |
+
|
| 489 |
+
def _extract_main_subject(self, sentence):
|
| 490 |
+
"""Extract the main subject from a sentence."""
|
| 491 |
+
words = sentence.split()
|
| 492 |
+
# Look for capitalized words (likely proper nouns)
|
| 493 |
+
subjects = [word.strip('.,!?;:') for word in words if word[0].isupper() and len(word) > 2]
|
| 494 |
+
if subjects:
|
| 495 |
+
return subjects[0]
|
| 496 |
+
# Fallback to first few words
|
| 497 |
+
return ' '.join(words[:3]) if len(words) >= 3 else sentence
|
| 498 |
+
|
| 499 |
+
def _extract_predicate(self, sentence):
|
| 500 |
+
"""Extract predicate for question formation."""
|
| 501 |
+
sentence = sentence.lower()
|
| 502 |
+
# Remove common sentence starters
|
| 503 |
+
sentence = re.sub(r'^(the|this|that|these|those|a|an)\s+', '', sentence)
|
| 504 |
+
|
| 505 |
+
# Find verb patterns
|
| 506 |
+
if ' is ' in sentence:
|
| 507 |
+
parts = sentence.split(' is ', 1)
|
| 508 |
+
if len(parts) > 1:
|
| 509 |
+
return f"is {parts[1].strip('.,!?;:')}"
|
| 510 |
+
|
| 511 |
+
if ' are ' in sentence:
|
| 512 |
+
parts = sentence.split(' are ', 1)
|
| 513 |
+
if len(parts) > 1:
|
| 514 |
+
return f"are {parts[1].strip('.,!?;:')}"
|
| 515 |
+
|
| 516 |
+
# Default fallback
|
| 517 |
+
words = sentence.split()
|
| 518 |
+
if len(words) > 3:
|
| 519 |
+
return ' '.join(words[1:]).strip('.,!?;:')
|
| 520 |
+
return sentence.strip('.,!?;:')
|
| 521 |
+
|
| 522 |
+
def clean_question(self, question):
    """Normalise spacing, punctuation and capitalisation of a question.

    Args:
        question (str): Raw generated question.

    Returns:
        str: The question with runs of whitespace collapsed to one space,
        a trailing ``?`` guaranteed, and its first character upper-cased.
    """
    tidy = re.sub(r'\s+', ' ', question.strip())

    if not tidy.endswith('?'):
        tidy = tidy + '?'

    # tidy always ends with '?' here, so it is never empty.
    return tidy[0].upper() + tidy[1:]
|
| 544 |
+
|
| 545 |
+
def generate_multiple_questions(self, sentences, max_questions=5):
    """Build questions for the leading entries of a scored sentence list.

    Args:
        sentences (list): Sequence of ``(score, sentence)`` pairs; only the
            first ``max_questions`` entries are considered.
        max_questions (int): Maximum number of entries to process.

    Returns:
        list: One dict per accepted question with the keys ``question``,
        ``context`` (the source sentence), ``score`` and ``question_id``
        (the 1-based position of the entry in ``sentences``).  Entries
        whose question fails ``is_valid_question`` or whose generation
        raises are left out, so ids may have gaps.
    """
    results = []

    for position, (score, sentence) in enumerate(sentences[:max_questions], start=1):
        try:
            candidate = self.generate_question_from_sentence(sentence)

            # Skip low-quality questions.
            if not self.is_valid_question(candidate):
                continue

            results.append({
                'question': candidate,
                'context': sentence,
                'score': score,
                'question_id': position
            })
        except Exception as e:
            # Best effort: report the failure and move on to the next entry.
            print(f"Error generating question from sentence: {sentence[:50]}... Error: {e}")
            continue

    return results
|
| 575 |
+
|
| 576 |
+
def is_valid_question(self, question):
    """Check whether a generated question passes basic sanity rules.

    A question is accepted when its length is between 10 and 200
    characters (inclusive) and it either ends with ``?`` or contains at
    least one interrogative/auxiliary word as a whole word.

    Args:
        question (str): Generated question.

    Returns:
        bool: True if the question is valid.
    """
    # Reject text that is too short or too long to be a useful question.
    if len(question) < 10 or len(question) > 200:
        return False

    if question.endswith('?'):
        return True

    question_words = {
        'what', 'who', 'when', 'where', 'why', 'how', 'which',
        'is', 'are', 'do', 'does', 'did', 'can', 'could', 'would', 'should',
    }

    # Compare whole words only: a plain substring test would accept almost
    # any text, because e.g. "is" occurs inside "this" and "do" inside "done".
    tokens = re.findall(r"[a-z']+", question.lower())
    return any(token in question_words for token in tokens)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nltk>=3.8
|
| 2 |
+
transformers>=4.30.0
|
| 3 |
+
torch>=2.0.0
|
| 4 |
+
flask>=3.0.0
|
| 5 |
+
rake-nltk>=1.0.6
|
| 6 |
+
scikit-learn>=1.3.0
|
| 7 |
+
numpy>=1.24.0
|
| 8 |
+
pandas>=1.5.0
|
| 9 |
+
sentence-transformers>=2.2.0
|
| 10 |
+
sentencepiece>=0.1.99
|
| 11 |
+
python-docx>=0.8.11
|
| 12 |
+
openai>=1.0.0
|
| 13 |
+
PyPDF2>=3.0.0
|
sample_text.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Machine Learning and Artificial Intelligence
|
| 2 |
+
|
| 3 |
+
Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention. The term machine learning was coined in 1959 by Arthur Samuel, an American IBMer and pioneer in the field of computer gaming and artificial intelligence.
|
| 4 |
+
|
| 5 |
+
There are three main types of machine learning algorithms: supervised learning, unsupervised learning, and reinforcement learning. Supervised learning uses labeled training data to learn a mapping function from input variables to output variables. Common examples include classification and regression problems. Unsupervised learning finds hidden patterns in data without labeled examples, such as clustering and association rule learning. Reinforcement learning involves an agent learning to make decisions by taking actions in an environment to maximize cumulative reward.
|
| 6 |
+
|
| 7 |
+
Deep learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns in data. These deep neural networks can automatically learn hierarchical representations of data, making them particularly effective for tasks like image recognition, natural language processing, and speech recognition. Popular deep learning frameworks include TensorFlow, PyTorch, and Keras.
|
| 8 |
+
|
| 9 |
+
Python has become the most popular programming language for machine learning due to its simplicity, readability, and extensive ecosystem of libraries. Key Python libraries for machine learning include NumPy for numerical computing, Pandas for data manipulation, Scikit-learn for traditional machine learning algorithms, and Matplotlib for data visualization. The combination of these tools makes Python an ideal choice for both beginners and experts in the field of artificial intelligence.
|
setup_nltk.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import nltk
|
| 3 |
+
|
| 4 |
+
def download_nltk_data():
    """Download the NLTK resources used by the project into ``./nltk_data``.

    The directory is created under the current working directory and added
    to ``nltk.data.path``.  Each package is downloaded in turn; progress and
    failures are printed.  A failure for one package does not stop the
    remaining downloads.
    """
    # Keep NLTK data in a local directory next to the project.
    nltk_data = os.path.join(os.getcwd(), 'nltk_data')
    os.makedirs(nltk_data, exist_ok=True)
    if nltk_data not in nltk.data.path:
        nltk.data.path.append(nltk_data)

    print(f"NLTK data will be downloaded to: {nltk_data}")

    # Each package is listed once.  'punkt_tab' is the tokenizer data that
    # recent NLTK releases load; 'punkt' is kept for older releases.
    packages = [
        'punkt',
        'punkt_tab',
        'stopwords',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'wordnet',
        'omw-1.4',
        'maxent_ne_chunker',
        'words',
    ]

    print("\n=== Downloading NLTK Data ===")

    failed = []
    for package in packages:
        print(f"Downloading {package}...")
        try:
            succeeded = nltk.download(package, download_dir=nltk_data)
        except Exception as e:
            print(f"⚠ Error downloading {package}: {str(e)}")
            failed.append(package)
            continue

        # nltk.download reports most failures through its return value
        # rather than by raising, so the result has to be checked.
        if succeeded:
            print(f"✓ {package} downloaded successfully")
        else:
            print(f"⚠ Error downloading {package}: download did not complete")
            failed.append(package)

    print("\n=== NLTK Setup Complete ===")
    print(f"NLTK data location: {nltk_data}")
    if failed:
        print(f"Packages that could not be downloaded: {', '.join(failed)}")
    else:
        print("You can now run your application.")


if __name__ == "__main__":
    download_nltk_data()
|
simple_nltk_test.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
|
| 3 |
+
print("Testing NLTK installation...")

# Smoke test 1: word and sentence tokenizers.
try:
    from nltk.tokenize import word_tokenize, sent_tokenize
    sample = "This is a test sentence. NLTK should be able to tokenize this."
    print("\nTokenization test:")
    print(f"Word tokens: {word_tokenize(sample)}")
    print(f"Sentences: {sent_tokenize(sample)}")
except Exception as err:
    print(f"Tokenization error: {err}")

# Smoke test 2: part-of-speech tagger.  This reuses word_tokenize imported
# by the tokenization check above.
try:
    from nltk import pos_tag
    words_to_tag = word_tokenize("This is a test")
    print("\nPOS tagging test:")
    print(f"POS tags: {pos_tag(words_to_tag)}")
except Exception as err:
    print(f"POS tagging error: {err}")

# Smoke test 3: stopword corpus.
try:
    from nltk.corpus import stopwords
    print("\nStopwords test:")
    print(f"English stopwords (first 5): {stopwords.words('english')[:5]}")
except Exception as err:
    print(f"Stopwords error: {err}")

print("\nNLTK test complete.")
|
simple_question_generator.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import random
|
| 3 |
+
|
| 4 |
+
class SimpleQuestionGenerator:
    """Rule-based question generator built on a few regex sentence patterns."""

    # (pattern, template) pairs tried in order; the first match wins.  Each
    # pattern captures the subject (group 1) and the remainder (group 2).
    _PATTERNS = [
        (re.compile(r'^(.*?) is (.*?)[.,;]?$', re.IGNORECASE), 'What is {}?'),
        (re.compile(r'^(.*?) are (.*?)[.,;]?$', re.IGNORECASE), 'What are {}?'),
        (re.compile(r'^(.*?) was (.*?)[.,;]?$', re.IGNORECASE), 'What was {}?'),
        (re.compile(r'^(.*?) were (.*?)[.,;]?$', re.IGNORECASE), 'What were {}?'),
        (re.compile(r'^(.*?) can be (.*?)[.,;]?$', re.IGNORECASE), 'How can {} be {}?'),
        (re.compile(r'^(.*?) has (.*?)[.,;]?$', re.IGNORECASE), 'What has {}?'),
        (re.compile(r'^(.*?) have (.*?)[.,;]?$', re.IGNORECASE), 'What have {}?'),
    ]

    def __init__(self):
        """Set up the word lists exposed as instance attributes."""
        self.question_words = [
            'What', 'Why', 'How', 'When', 'Where', 'Who', 'Which', 'Describe', 'Explain'
        ]
        self.auxiliary_verbs = ['is', 'are', 'was', 'were', 'do', 'does', 'did', 'have', 'has', 'had']

    def generate_question(self, sentence):
        """Generate a simple question from a given sentence.

        Args:
            sentence (str): Declarative sentence.

        Returns:
            str: A question built from the first matching pattern, a generic
            "main point" question when nothing matches, or ``""`` when the
            sentence is empty after removing whitespace and periods.
        """
        if not sentence or not sentence.strip():
            return ""

        # Clean the sentence.
        sentence = sentence.strip().strip('.').strip()
        if not sentence:
            # Input such as "..." carries no content to ask about.
            return ""

        for pattern, template in self._PATTERNS:
            match = pattern.match(sentence)
            if match:
                # Pass every captured group: str.format ignores surplus
                # positional arguments, and the two-slot "can be" template
                # needs both (formatting it with one raised IndexError).
                return template.format(*match.groups()).capitalize()

        # Default question if no pattern matches.
        return f"What is the main point about: {sentence[:50]}...?"

    def generate_questions(self, text, num_questions=5):
        """Generate up to ``num_questions`` distinct questions from ``text``.

        Args:
            text (str): Text to split into sentences.
            num_questions (int): Maximum number of questions to return.

        Returns:
            list: Questions in sentence order, without duplicates.
        """
        # Simple sentence splitting on whitespace that follows . ! or ?
        sentences = re.split(r'(?<=[.!?])\s+', text)
        questions = []

        for sentence in sentences:
            if len(questions) >= num_questions:
                break

            question = self.generate_question(sentence)
            if question and question not in questions:
                questions.append(question)

        return questions
|
| 54 |
+
|
| 55 |
+
# Example usage
|
| 56 |
+
if __name__ == "__main__":
    # Demo: build a generator and run it over a short sample passage.
    generator = SimpleQuestionGenerator()

    sample_text = """
    Machine learning is a branch of artificial intelligence.
    It focuses on building systems that learn from data.
    These systems can improve their performance over time.
    There are three main types of machine learning: supervised, unsupervised, and reinforcement learning.
    Supervised learning uses labeled data to train models.
    Unsupervised learning finds patterns in unlabeled data.
    Reinforcement learning involves training an agent to make decisions through rewards.
    """

    print("Generating questions...\n")
    generated = generator.generate_questions(sample_text, 5)

    # Print the questions as a numbered list.
    for number, item in enumerate(generated, 1):
        print(f"{number}. {item}")
|
syllabus_processor.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import nltk
|
| 3 |
+
import os
|
| 4 |
+
from nltk.tokenize import word_tokenize, sent_tokenize
|
| 5 |
+
from nltk.corpus import stopwords
|
| 6 |
+
from nltk.tag import pos_tag
|
| 7 |
+
from nltk.stem import WordNetLemmatizer
|
| 8 |
+
from typing import List, Dict, Tuple, Optional
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from question_generator import QuestionGenerator
|
| 12 |
+
|
| 13 |
+
# Configure logging
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
class SyllabusProcessor:
    """Split a syllabus into units and topics and build questions per topic."""

    # Unit header such as "Unit 1.0 Introduction" or "Unit 1: Introduction".
    # Group 1 is everything after the unit number and optional separator.
    _UNIT_RE = re.compile(r'(?:Unit[\s\-]*\d+(?:\.\d+)?\s*[:\-]?\s*)(.+)', re.IGNORECASE)
    # Numbered topic such as "1.1 Topic Name".  The text part stops at the
    # next digit, so a match never runs into the following topic number.
    _TOPIC_RE = re.compile(r'\d+(?:\.\d+)+[\s\-]+[^0-9]+')
    # Bullet line such as "- Topic Name".
    _BULLET_RE = re.compile(r'(?:[-•*]\s*)(.+)')

    def __init__(self):
        """Initialize the SyllabusProcessor with necessary NLTK components.

        Raises:
            Exception: Any error raised during setup is logged and re-raised.
        """
        try:
            # Download required NLTK data.  'punkt_tab' is the tokenizer
            # data loaded by recent NLTK releases; 'punkt' serves older ones.
            for resource in (
                'punkt',
                'punkt_tab',
                'stopwords',
                'averaged_perceptron_tagger',
                'averaged_perceptron_tagger_eng',
                'wordnet',
                'omw-1.4',
            ):
                nltk.download(resource, quiet=True)

            # Initialize NLTK components
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()

            # Import and initialize the PerceptronTagger
            from nltk.tag import PerceptronTagger
            self.tagger = PerceptronTagger()

            # Initialize question generator
            self.question_generator = QuestionGenerator()

            logger.info("SyllabusProcessor initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing SyllabusProcessor: {str(e)}")
            raise

    @classmethod
    def _numbered_topics(cls, text: str) -> List[str]:
        """Return the cleaned "N.M Topic" entries found in ``text``, in order."""
        topics = []
        for match in cls._TOPIC_RE.finditer(text):
            # Trim surrounding whitespace and trailing dots only.  The topic's
            # own number must stay: removing "from the first N.M onwards"
            # would erase the whole entry, since it starts with that number.
            topic = match.group(0).strip().rstrip('.').rstrip()
            if len(topic) > 3:  # Avoid entries that are just a number
                topics.append(topic)
        return topics

    def parse_syllabus(self, syllabus_text: str) -> Dict[str, List[str]]:
        """
        Parse a syllabus text into topics and subtopics.

        Lines are handled one at a time.  A line starting with "Unit <n>"
        opens a new unit; numbered topics on the same line are attached to
        it and the unit name is the part of the line before the first such
        topic.  Other lines contribute numbered topics ("1.1 Name") or, if
        they have none, a bullet topic ("- Name").  Bullets reading
        "introduction" or "overview" are ignored.  Topics seen before any
        unit header go under "General Topics".

        Args:
            syllabus_text: Raw syllabus text with units and topics

        Returns:
            Dictionary mapping unit names to lists of topics
        """
        current_unit = "General Topics"
        units: Dict[str, List[str]] = {current_unit: []}

        for raw_line in syllabus_text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            unit_match = self._UNIT_RE.match(line)
            if unit_match:
                remainder = unit_match.group(1)
                first_topic = self._TOPIC_RE.search(remainder)
                if first_topic:
                    # Name the unit by its header only, not by the topics
                    # that share the line with it.
                    header_end = unit_match.start(1) + first_topic.start()
                    current_unit = line[:header_end].strip()
                else:
                    current_unit = line
                units[current_unit] = self._numbered_topics(remainder)
                continue

            if self._TOPIC_RE.search(line):
                # One or more numbered topics on this line.
                units[current_unit].extend(self._numbered_topics(line))
            else:
                # Check for bullet points
                bullet_match = self._BULLET_RE.match(line)
                if bullet_match:
                    topic = bullet_match.group(1).strip()
                    if topic and topic.lower() not in ['introduction', 'overview']:
                        units[current_unit].append(topic)

        return units

    def extract_key_terms(self, topic: str) -> List[str]:
        """
        Extract key terms from a topic for question generation.

        Args:
            topic: The topic text

        Returns:
            List of distinct lower-case noun tokens that are not stop words
            (order not guaranteed); an empty list if tagging fails.
        """
        try:
            # Use the instance tagger
            words = word_tokenize(topic.lower())
            pos_tags = self.tagger.tag(words)

            # Keep nouns and proper nouns; the set removes duplicates.
            key_terms = {
                word for word, tag in pos_tags
                if tag.startswith('NN') and word not in self.stop_words
            }

            return list(key_terms)

        except Exception as e:
            logger.error(f"Error extracting key terms: {str(e)}")
            return []

    def generate_topic_based_questions(self, syllabus_text: str, content_text: str,
                                       questions_per_topic: int = 3) -> Dict[str, List[Dict]]:
        """
        Generate questions based on syllabus topics.

        Args:
            syllabus_text: The syllabus text with units and topics
            content_text: The content text to generate questions from
            questions_per_topic: Number of questions to generate per topic

        Returns:
            Dictionary mapping "<unit> - <topic>" to lists of questions;
            topics that yield no questions are omitted.
        """
        # Parse the syllabus
        units = self.parse_syllabus(syllabus_text)

        # Process content into sentences; lower-case each one once so the
        # per-topic term matching does not repeat that work.
        sentences = sent_tokenize(content_text)
        lowered_sentences = [sentence.lower() for sentence in sentences]

        topic_questions = {}

        for unit, topics in units.items():
            for topic in topics:
                # Extract key terms from the topic
                key_terms = self.extract_key_terms(topic)

                # Find relevant sentences containing these terms
                relevant_sentences = [
                    sentence
                    for sentence, lowered in zip(sentences, lowered_sentences)
                    if any(term in lowered for term in key_terms)
                ]

                # If no relevant sentences found, use general content
                if not relevant_sentences:
                    relevant_sentences = sentences

                # NOTE(review): plain sentence strings are passed here --
                # confirm that QuestionGenerator.generate_multiple_questions
                # accepts them in this form.
                questions = self.question_generator.generate_multiple_questions(
                    relevant_sentences,
                    max_questions=min(questions_per_topic, len(relevant_sentences))
                )

                if questions:
                    topic_questions[f"{unit} - {topic}"] = questions

        return topic_questions
|
test_imports.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""Quick test to verify all modules can be imported and basic functionality works."""
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
def test_imports():
    """Import every project module and instantiate its main class.

    Returns:
        bool: True when all imports and constructors succeed; False after
        printing a diagnostic when an import or an initialization fails.
    """
    print("Testing module imports...")

    try:
        # Text processing
        print(" [OK] Importing text_processor...")
        from text_processor import TextProcessor
        TextProcessor()
        print(" [OK] TextProcessor initialized")

        # Keyword extraction
        print(" [OK] Importing keyword_extractor...")
        from keyword_extractor import KeywordExtractor
        KeywordExtractor()
        print(" [OK] KeywordExtractor initialized")

        # Question generation (rule-based mode)
        print(" [OK] Importing question_generator...")
        from question_generator import QuestionGenerator
        QuestionGenerator(use_transformers=False)
        print(" [OK] QuestionGenerator initialized")

        # Answer option generation
        print(" [OK] Importing option_generator...")
        from option_generator import OptionGenerator
        OptionGenerator()
        print(" [OK] OptionGenerator initialized")

        # Syllabus handling
        print(" [OK] Importing syllabus_processor...")
        from syllabus_processor import SyllabusProcessor
        SyllabusProcessor()
        print(" [OK] SyllabusProcessor initialized")

        # The remaining modules are only imported, not instantiated.
        print(" [OK] Importing exam_question_system...")
        from exam_question_system import ExamQuestionSystem
        print(" [OK] ExamQuestionSystem imported")

        print(" [OK] Importing app...")
        from app import app
        print(" [OK] Flask app imported")

        print("\n[SUCCESS] All imports successful!")
        return True

    except ImportError as import_error:
        print(f"\n[ERROR] Import error: {import_error}")
        return False
    except Exception as init_error:
        print(f"\n[WARNING] Error during initialization: {init_error}")
        print(" (This might be due to missing NLTK data)")
        return False
|
| 55 |
+
|
| 56 |
+
def test_basic_functionality():
    """Run text preprocessing and RAKE keyword extraction on a short sample.

    Returns:
        bool: True when both steps produce non-empty results; False after
        printing the error and a traceback when anything fails.
    """
    print("\nTesting basic functionality...")

    try:
        from text_processor import TextProcessor
        from keyword_extractor import KeywordExtractor

        processor = TextProcessor()
        extractor = KeywordExtractor()

        sample = "Python is a programming language. It is widely used for web development and data science."

        # Preprocessing must yield at least one sentence.
        print(" [OK] Testing text preprocessing...")
        processed = processor.preprocess_text(sample)
        assert 'sentences' in processed
        assert len(processed['sentences']) > 0
        print(f" [OK] Processed {len(processed['sentences'])} sentences")

        # Keyword extraction must yield at least one keyword.
        print(" [OK] Testing keyword extraction...")
        keywords = extractor.extract_keywords_rake(sample, max_keywords=5)
        assert len(keywords) > 0
        print(f" [OK] Extracted {len(keywords)} keywords")

        print("\n[SUCCESS] Basic functionality test passed!")
        return True

    except Exception as failure:
        print(f"\n[WARNING] Functionality test error: {failure}")
        import traceback
        traceback.print_exc()
        return False
|
| 88 |
+
|
| 89 |
+
if __name__ == "__main__":
    rule = "=" * 60

    print(rule)
    print("Project Import and Basic Functionality Test")
    print(rule)

    # The functionality check only runs when the imports succeeded.
    if not test_imports():
        verdict, exit_code = "[ERROR] PROJECT HAS IMPORT ERRORS", 1
    elif test_basic_functionality():
        verdict, exit_code = "[SUCCESS] PROJECT IS WORKING!", 0
    else:
        verdict, exit_code = "[WARNING] Imports work but functionality may have issues", 1

    print("\n" + rule)
    print(verdict)
    print(rule)
    sys.exit(exit_code)
|
test_local_generator.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from local_question_generator import LocalQuestionGenerator
|
| 2 |
+
|
| 3 |
+
def test_local_generator():
    """Generate three questions from a short passage and print them."""
    print("Testing Local Question Generator...")

    # Passage used as generator input.
    sample_text = """
    The water cycle describes how water evaporates from the Earth's surface,
    rises into the atmosphere, cools and condenses into rain or snow in clouds,
    and falls again to the surface as precipitation.
    """

    # Building the generator is announced first because it may be slow.
    print("Initializing model (this may take a minute)...")
    generator = LocalQuestionGenerator()

    print("\nGenerating questions...")
    generated = generator.generate_questions(sample_text, num_questions=3)

    # Numbered listing of the results.
    print("\nGenerated Questions:")
    for number, item in enumerate(generated, 1):
        print(f"{number}. {item}")


if __name__ == "__main__":
    test_local_generator()
|
test_nltk.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Set NLTK data path
|
| 5 |
+
def setup_nltk():
    """Create ``./nltk_data`` if needed, register it with NLTK, and return it.

    Returns:
        str: Path of the ``nltk_data`` directory under the current working
        directory, which has been appended to ``nltk.data.path``.
    """
    data_dir = os.path.join(os.getcwd(), 'nltk_data')
    os.makedirs(data_dir, exist_ok=True)
    nltk.data.path.append(data_dir)
    return data_dir
|
| 10 |
+
|
| 11 |
+
print("Testing NLTK installation...")
try:
    # Tokenizers
    print("Testing tokenizers...")
    from nltk.tokenize import word_tokenize, sent_tokenize

    sample = "This is a test sentence. NLTK is working!"
    print(f"Word tokens: {word_tokenize(sample)}")
    print(f"Sentences: {sent_tokenize(sample)}")

    # Part-of-speech tagger
    print("\nTesting POS tagging...")
    from nltk import pos_tag
    print(f"POS tags: {pos_tag(word_tokenize(sample))}")

    # Stopword corpus
    print("\nTesting stopwords...")
    from nltk.corpus import stopwords
    print(f"English stopwords: {list(stopwords.words('english'))[:5]}...")

    print("\n✅ NLTK is working correctly!")

except LookupError as missing:
    # A resource is missing: prepare the local data directory and print the
    # download commands the user should run.
    print(f"\n❌ NLTK data not found: {missing}")
    data_dir = setup_nltk()
    print(f"\nPlease run these commands in a Python shell to download NLTK data:")
    print(f"import nltk")
    for package in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
        print(f"nltk.download('{package}', download_dir=r'{data_dir}')")

except Exception as failure:
    print(f"\n❌ Error testing NLTK: {failure}")
|
test_question_generator.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from question_generator import QuestionGenerator
|
| 2 |
+
|
| 3 |
+
def test_question_generator():
    """Generate three questions from a machine-learning passage and print them."""
    print("Testing Question Generator...")

    # Rule-based mode is used for faster testing.
    generator = QuestionGenerator(use_transformers=False)

    # Passage about machine learning used as input.
    sample_text = """
    Machine learning is a branch of artificial intelligence that focuses on building systems
    that learn from data. These systems can improve their performance over time without being
    explicitly programmed. There are three main types of machine learning: supervised learning,
    unsupervised learning, and reinforcement learning. Supervised learning uses labeled data
    to train models, while unsupervised learning finds patterns in unlabeled data.
    Reinforcement learning involves training an agent to make decisions by rewarding desired
    behaviors and/or punishing undesired ones.
    """

    print("\nGenerating questions...")
    generated = generator.generate_questions(sample_text, num_questions=3)

    # Numbered listing of the results.
    print("\nGenerated Questions:")
    for number, item in enumerate(generated, 1):
        print(f"{number}. {item}")


if __name__ == "__main__":
    test_question_generator()
|
test_syllabus.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# Ensure the current directory is in the python path
|
| 6 |
+
sys.path.append(os.getcwd())
|
| 7 |
+
|
| 8 |
+
from syllabus_processor import SyllabusProcessor
|
| 9 |
+
|
| 10 |
+
# Configure logging
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
| 12 |
+
|
| 13 |
+
def test_syllabus_parsing():
    """Parse a two-unit sample syllabus and print the topics found per unit.

    Any exception raised while constructing the processor, parsing, or
    printing is reported on stdout together with its traceback instead of
    propagating, so the script always runs to completion.
    """
    import traceback

    syllabus_text = """Unit 1.0 Introduction (9 Lectures) Self-Learning (SL) 1.1 Data for Graphics. 1.2 Design principles 1.3 Value for visualization 1.4 Categorical 1.5 Time series 1.6 statistical data graphics I 1.7 statistical data graphics II 1.8 Introduction to Visualization Tools I 1.9 Introduction to Visualization Tools II
Unit 2.0 Graphics Pipeline and Aesthetics and Perception (10 Lectures) 2.1 Primitives: vertices edges and triangles 2.2 Model transforms 2.3 Translations 2.4 Rotations 2.5 scaling 2.6 View transform, Perspective transform, window transform 2.7 Graphical Perception Theory 2.8 Experimentation, and the Application 2.9 Graphical Integrity, Layering and Separation 2.10 Color and Information, Using Space"""

    print("Initializing SyllabusProcessor...")
    try:
        syllabus_processor = SyllabusProcessor()

        print("\nParsing syllabus...")
        parsed_units = syllabus_processor.parse_syllabus(syllabus_text)

        print("\nParsing Results:")
        # parse_syllabus is iterated as a mapping of unit -> topics.
        for unit_name, unit_topics in parsed_units.items():
            print(f"\nUnit: {unit_name}")
            print(f"Topics found: {len(unit_topics)}")
            for entry in unit_topics:
                print(f" - {entry}")

    except Exception as e:
        # Diagnostic script: show the failure in full rather than crash.
        print(f"Error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    test_syllabus_parsing()
|
text_processor.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
import re
|
| 3 |
+
import string
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 6 |
+
from nltk.stem import WordNetLemmatizer
|
| 7 |
+
|
| 8 |
+
class TextProcessor:
    """Clean raw text and split it into sentences and normalized words.

    Relies on NLTK for sentence/word tokenization, the English stop-word
    list and WordNet lemmatization. The NLTK resources it needs are looked
    up (and downloaded when missing) when an instance is created.
    """

    # (lookup path for nltk.data.find, package id for nltk.download)
    _REQUIRED_NLTK_DATA = (
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
        ('corpora/omw-1.4', 'omw-1.4'),
    )

    # Sentences with fewer tokens than this are dropped by tokenize_sentences.
    _MIN_SENTENCE_TOKENS = 5

    # Individual punctuation characters, for per-character membership tests.
    _PUNCTUATION = frozenset(string.punctuation)

    # Header/footer markers such as "Page 1 of 10" and "Unit 1" / "Unit 1.2".
    # The leading \b stops them from matching inside longer words
    # (e.g. the "unit 3" in "subunit 3").
    _PAGE_MARKER_RE = re.compile(r'\bPage\s+\d+\s+of\s+\d+', re.IGNORECASE)
    _UNIT_MARKER_RE = re.compile(r'\bUnit\s+\d+(\.\d+)?', re.IGNORECASE)

    def __init__(self):
        """Initialize the text processor with required NLTK data."""
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def download_nltk_data(self):
        """Download required NLTK data if not already present."""
        for path, name in self._REQUIRED_NLTK_DATA:
            try:
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)

    @classmethod
    def _is_punctuation(cls, token):
        """Return True when every character of ``token`` is ASCII punctuation."""
        return all(char in cls._PUNCTUATION for char in token)

    def clean_text(self, text):
        """
        Clean and preprocess the input text.

        Collapses whitespace, strips "Page N of M" and "Unit N" markers,
        removes standalone numbers, drops every character that is not a
        word character, whitespace, or one of ``. ? ! , -``, and collapses
        runs of periods into one.

        Args:
            text (str): Raw input text. ``None`` or an empty string yields
                an empty string.

        Returns:
            str: Cleaned text
        """
        if not text:
            return ''

        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())

        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1")
        text = self._PAGE_MARKER_RE.sub('', text)
        text = self._UNIT_MARKER_RE.sub('', text)

        # Remove standalone numbers (often page numbers or list markers)
        text = re.sub(r'\b\d+\b', '', text)

        # Remove special characters but keep sentence structure:
        # periods, question marks, exclamation points, commas, and hyphens stay.
        text = re.sub(r'[^\w\s\.\?\!,\-]', '', text)

        # Collapse repeated periods, then the gaps left by the removals above.
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_sentences(self, text):
        """
        Tokenize text into sentences.

        Args:
            text (str): Input text

        Returns:
            list: Sentences containing at least five tokens; shorter ones
                are filtered out.
        """
        return [
            sentence
            for sentence in sent_tokenize(text)
            if len(word_tokenize(sentence)) >= self._MIN_SENTENCE_TOKENS
        ]

    def tokenize_words(self, text):
        """
        Tokenize text into words and remove stopwords.

        The text is lower-cased before tokenizing. Tokens made up solely of
        punctuation and tokens in the English stop-word list are discarded;
        the remaining tokens are lemmatized.

        Args:
            text (str): Input text

        Returns:
            list: List of processed words
        """
        words = []
        for token in word_tokenize(text.lower()):
            # A per-character check is required: `token in string.punctuation`
            # is a substring test and lets tokens like "--" or "``" through.
            if self._is_punctuation(token):
                continue
            if token in self.stop_words:
                continue
            words.append(self.lemmatizer.lemmatize(token))
        return words

    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline.

        Args:
            text (str): Raw input text

        Returns:
            dict: Processed text components under the keys
                ``cleaned_text``, ``sentences``, ``words``, ``word_count``
                and ``sentence_count``.
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)

        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences),
        }
|
verify_generation.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from exam_question_system import ExamQuestionSystem
|
| 3 |
+
from option_generator import OptionGenerator
|
| 4 |
+
|
| 5 |
+
# Configure logging
|
| 6 |
+
logging.basicConfig(level=logging.INFO)
|
| 7 |
+
|
| 8 |
+
def verify_generation():
    """Generate questions from mock content and split them into MCQ/short/long.

    Requests 2 MCQ + 1 short + 1 long questions' worth of raw questions from
    ExamQuestionSystem, then distributes them with the same steps the
    original comments describe as app.py's logic: long answers first, then
    short answers, then MCQs built from whatever remains. Results are
    printed; nothing is returned.
    """
    # Mock content
    content = """
Data visualization is the graphical representation of information and data.
By using visual elements like charts, graphs, and maps, data visualization
tools provide an accessible way to see and understand trends, outliers, and
patterns in data. In the world of Big Data, data visualization tools and
technologies are essential to analyze massive amounts of information and
make data-driven decisions.

Design principles in data visualization include understanding the audience,
choosing the right chart type, and using color effectively. Good design
makes complex data more accessible, understandable, and usable.

Exploratory Data Analysis (EDA) is an approach to analyzing data sets to
summarize their main characteristics, often with visual methods. A statistical
model can be used or not, but primarily EDA is for seeing what the data can
tell us beyond the formal modeling or hypothesis testing task.
"""

    print("Initializing ExamQuestionSystem...")
    # Transformers are disabled to keep this check quick.
    system = ExamQuestionSystem(use_transformers=False)

    print("\nGenerating questions (Target: 2 MCQ, 1 Short, 1 Long)...")
    num_mcq, num_short, num_long = 2, 1, 1
    total_needed = num_mcq + num_short + num_long

    results = system.generate_exam_questions(
        input_text=content,
        max_questions=total_needed,
        include_mcq=False,
        syllabus_text=content,
    )

    pool = results.get('questions', [])
    print(f"\nTotal questions generated: {len(pool)}")

    buckets = {
        'mcq_questions': [],
        'short_questions': [],
        'long_questions': [],
    }

    # Questions whose context has more than 10 words qualify for long answers;
    # everything else is a short-answer candidate.
    long_pool = []
    short_pool = []
    for item in pool:
        if len(item.get('context', '').split()) > 10:
            long_pool.append(item)
        else:
            short_pool.append(item)

    # Too few long candidates: borrow from the front of the short ones.
    shortfall = num_long - len(long_pool)
    if shortfall > 0:
        long_pool.extend(short_pool[:shortfall])
        short_pool = short_pool[shortfall:]

    # Long-answer questions
    for _ in range(num_long):
        if not long_pool:
            break
        picked = long_pool.pop(0)
        picked['type'] = 'long_answer'
        buckets['long_questions'].append(picked)
        # Take it out of the shared pool (after tagging) so it is not reused.
        try:
            pool.remove(picked)
        except ValueError:
            pass

    # Short-answer questions: prefer short candidates, else fall back to the pool.
    for _ in range(num_short):
        if short_pool:
            picked = short_pool.pop(0)
            came_from_short_pool = True
        elif pool:
            picked = pool.pop(0)
            came_from_short_pool = False
        else:
            break
        picked['type'] = 'short_answer'
        buckets['short_questions'].append(picked)
        if came_from_short_pool:
            try:
                pool.remove(picked)
            except ValueError:
                pass

    # MCQs are built from whatever is still left in the pool.
    global_keywords = [entry[1] for entry in results.get('keywords', [])]
    print(f"\nGlobal Keywords (Cleaned): {global_keywords}")

    for _ in range(num_mcq):
        if not pool:
            break
        picked = pool.pop(0)
        try:
            mcq_data = system.option_generator.create_mcq_options(
                picked['question'],
                picked['context'],
                correct_answer=picked.get('correct_answer'),
                global_keywords=global_keywords,
            )
            if mcq_data and 'options' in mcq_data:
                picked.update(mcq_data)
                picked['type'] = 'mcq'
                buckets['mcq_questions'].append(picked)
        except Exception as e:
            # A failed option generation drops this question; it is not retried.
            print(f"Error generating options: {e}")

    # Print Results
    print("\n--- Generation Results ---")
    mcq_items = buckets['mcq_questions']
    print(f"MCQs: {len(mcq_items)}")
    for item in mcq_items:
        print(f" Q: {item['question']}")
        print(f" Options: {item.get('options')}")

    short_items = buckets['short_questions']
    print(f"\nShort Questions: {len(short_items)}")
    for item in short_items:
        print(f" Q: {item['question']}")

    long_items = buckets['long_questions']
    print(f"\nLong Questions: {len(long_items)}")
    for item in long_items:
        print(f" Q: {item['question']}")


if __name__ == "__main__":
    verify_generation()
|