# Web-page residue kept for reference: Spaces: Sleeping | File size: 7,126 Bytes | 300f197
import re
import nltk
import os
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime
from question_generator import QuestionGenerator
# Configure logging: set up the root logger at INFO level when this module is
# imported, and create the module-level logger used by SyllabusProcessor.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SyllabusProcessor:
    """Parse a syllabus into units and topics and generate questions per topic.

    The processor splits raw syllabus text into ``{unit: [topics]}``, extracts
    noun key terms from each topic with NLTK, selects the content sentences
    that mention those terms and hands them to ``QuestionGenerator`` (imported
    from ``question_generator``) via ``generate_multiple_questions``.
    """

    # NLTK data packages fetched on construction. 'punkt_tab' is included
    # alongside 'punkt' for the same reason the file already fetches both
    # tagger names: newer NLTK releases look up the "_tab" / "_eng" resources.
    _NLTK_RESOURCES = (
        'punkt',
        'punkt_tab',
        'stopwords',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'wordnet',
        'omw-1.4',
    )

    # "Unit 1.0 Introduction", "Unit 1: Introduction", "Unit-2" ...
    # The title group is optional so that a bare "Unit 1" is still a header.
    _UNIT_HEADER_RE = re.compile(
        r'(?:Unit[\s\-]*\d+(?:\.\d+)?\s*[:\-]?\s*)(.*)', re.IGNORECASE
    )

    # "1.1 Topic Name": a dotted number, a separator, then text running up to
    # the next dotted number or the end of the line. The lookahead (rather
    # than "[^0-9]+") lets topic names contain plain digits ("Python 3").
    _NUMBERED_TOPIC_RE = re.compile(
        r'\d+(?:\.\d+)+[\s\-]+.+?(?=\s+\d+(?:\.\d+)+[\s\-]|$)'
    )

    # "- Topic", "* Topic", "• Topic"
    _BULLET_RE = re.compile(r'(?:[-•*]\s*)(.+)')

    # Bullet entries that are section labels rather than topics.
    _SKIPPED_BULLETS = frozenset({'introduction', 'overview'})

    def __init__(self):
        """Initialize the SyllabusProcessor with necessary NLTK components.

        Downloads the required NLTK data packages, then builds the stop-word
        set, lemmatizer, POS tagger and question generator.

        Raises:
            Exception: Any error raised during setup is logged and re-raised.
        """
        try:
            # Download required NLTK data
            for resource in self._NLTK_RESOURCES:
                nltk.download(resource, quiet=True)

            # Initialize NLTK components
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()

            # Import and initialize the PerceptronTagger
            from nltk.tag import PerceptronTagger
            self.tagger = PerceptronTagger()

            # Initialize question generator
            self.question_generator = QuestionGenerator()
            logger.info("SyllabusProcessor initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing SyllabusProcessor: {str(e)}")
            raise

    @classmethod
    def _numbered_topics(cls, text: str) -> List[str]:
        """Return every "1.1 Topic Name" style entry found in *text*, in order."""
        candidates = (
            match.strip(' \t,;') for match in cls._NUMBERED_TOPIC_RE.findall(text)
        )
        return [topic for topic in candidates if topic]

    def parse_syllabus(self, syllabus_text: str) -> Dict[str, List[str]]:
        """
        Parse a syllabus text into topics and subtopics.

        Recognised line shapes:
          * unit headers ("Unit 1: Introduction", "Unit 2"), optionally
            followed by inline numbered topics on the same line;
          * numbered topics ("1.1 Topic Name"), one or several per line;
          * bullet topics ("- Topic", "* Topic", "• Topic"), except the
            labels "introduction" and "overview".
        Any other line is ignored. Numbered topics keep their number.

        Args:
            syllabus_text: Raw syllabus text with units and topics

        Returns:
            Dictionary mapping unit names to lists of topics. Topics seen
            before the first unit header are filed under "General Topics",
            which is always present (possibly empty).
        """
        current_unit = "General Topics"
        units: Dict[str, List[str]] = {current_unit: []}

        for raw_line in syllabus_text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            header = self._UNIT_HEADER_RE.match(line)
            if header:
                title = header.group(1)
                first_topic = self._NUMBERED_TOPIC_RE.search(title)
                if first_topic is None:
                    current_unit = header.group(0).strip()
                else:
                    # The unit name is the header text *before* the first
                    # inline topic, minus any dangling separator.
                    cut = header.start(1) + first_topic.start()
                    current_unit = line[:cut].rstrip(' \t-:')
                # setdefault: a repeated header must not discard topics
                # already collected for that unit.
                units.setdefault(current_unit, []).extend(
                    self._numbered_topics(title)
                )
                continue

            numbered = self._numbered_topics(line)
            if numbered:
                units[current_unit].extend(numbered)
                continue

            bullet = self._BULLET_RE.match(line)
            if bullet:
                topic = bullet.group(1).strip()
                if topic and topic.lower() not in self._SKIPPED_BULLETS:
                    units[current_unit].append(topic)

        return units

    def extract_key_terms(self, topic: str) -> List[str]:
        """
        Extract key terms from a topic for question generation.

        The topic is lower-cased, tokenized and POS-tagged with the instance
        tagger; tokens whose tag starts with "NN" and that are not stop words
        are kept.

        Args:
            topic: The topic text

        Returns:
            List of unique terms in order of first appearance, or an empty
            list if tokenizing or tagging fails (the error is logged).
        """
        try:
            tokens = word_tokenize(topic.lower())
            tagged = self.tagger.tag(tokens)
            nouns = (
                word for word, tag in tagged
                if tag.startswith('NN') and word not in self.stop_words
            )
            # dict.fromkeys de-duplicates while keeping a stable order.
            return list(dict.fromkeys(nouns))
        except Exception as e:
            logger.error(f"Error extracting key terms: {str(e)}")
            return []

    def generate_topic_based_questions(self, syllabus_text: str, content_text: str,
                                       questions_per_topic: int = 3) -> Dict[str, List[Dict]]:
        """
        Generate questions based on syllabus topics.

        For each parsed topic, the content sentences containing any of the
        topic's key terms (case-insensitive substring match) are passed to the
        question generator; if none match, all sentences are used instead.

        Args:
            syllabus_text: The syllabus text with units and topics
            content_text: The content text to generate questions from
            questions_per_topic: Number of questions to generate per topic

        Returns:
            Dictionary mapping "<unit> - <topic>" to lists of questions.
            Topics for which the generator returns nothing are omitted.
        """
        units = self.parse_syllabus(syllabus_text)

        sentences = sent_tokenize(content_text)
        if not sentences:
            # No content to draw questions from.
            return {}

        # Lower-case each sentence once instead of once per term per topic.
        lowered_sentences = [sentence.lower() for sentence in sentences]

        topic_questions = {}
        for unit, topics in units.items():
            for topic in topics:
                key_terms = self.extract_key_terms(topic)

                relevant_sentences = [
                    sentence
                    for sentence, lowered in zip(sentences, lowered_sentences)
                    if any(term in lowered for term in key_terms)
                ]
                # If no relevant sentences found, use general content
                if not relevant_sentences:
                    relevant_sentences = sentences

                questions = self.question_generator.generate_multiple_questions(
                    relevant_sentences,
                    max_questions=min(questions_per_topic, len(relevant_sentences))
                )
                if questions:
                    topic_questions[f"{unit} - {topic}"] = questions

        return topic_questions