import re import nltk import os from nltk.tokenize import word_tokenize, sent_tokenize from nltk.corpus import stopwords from nltk.tag import pos_tag from nltk.stem import WordNetLemmatizer from typing import List, Dict, Tuple, Optional import logging from datetime import datetime from question_generator import QuestionGenerator # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class SyllabusProcessor: def __init__(self): """Initialize the SyllabusProcessor with necessary NLTK components.""" try: # Download required NLTK data nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) nltk.download('averaged_perceptron_tagger_eng', quiet=True) nltk.download('wordnet', quiet=True) nltk.download('omw-1.4', quiet=True) # Initialize NLTK components self.stop_words = set(stopwords.words('english')) self.lemmatizer = WordNetLemmatizer() # Import and initialize the PerceptronTagger from nltk.tag import PerceptronTagger self.tagger = PerceptronTagger() # Initialize question generator self.question_generator = QuestionGenerator() logger.info("SyllabusProcessor initialized successfully") except Exception as e: logger.error(f"Error initializing SyllabusProcessor: {str(e)}") raise def parse_syllabus(self, syllabus_text: str) -> Dict[str, List[str]]: """ Parse a syllabus text into topics and subtopics. Args: syllabus_text: Raw syllabus text with units and topics Returns: Dictionary mapping unit names to lists of topics """ units = {} current_unit = "General Topics" units[current_unit] = [] # Split into lines and process each line for line in syllabus_text.split('\n'): line = line.strip() if not line: continue # Check for unit headers (e.g., "Unit 1.0 Introduction" or "Unit 1: Introduction") unit_match = re.match(r'(?:Unit[\s\-]*\d+(?:\.\d+)?\s*[:\-]?\s*)(.+)', line, re.IGNORECASE) if unit_match: current_unit = unit_match.group(0).strip() units[current_unit] = [] # Check if there are topics on the same line (e.g., "Unit 1 ... 1.1 Topic") remaining_text = unit_match.group(1) # Find all topic patterns like "1.1 Topic Name" inline_topics = re.findall(r'(\d+(?:\.\d+)+[\s\-]+[^0-9]+)', remaining_text) for t in inline_topics: # Clean up the topic text t = re.sub(r'\s*\d+\.\d+.*$', '', t).strip() # Remove next topic number if caught if t: units[current_unit].append(t.strip()) continue # Check for topic lines (e.g., "1.1 Topic Name" or "- Topic Name") # Handle multiple topics on one line topics_on_line = re.findall(r'(\d+(?:\.\d+)+[\s\-]+[^0-9]+)', line) if topics_on_line: for t in topics_on_line: t = t.strip() # Clean up trailing dots or next topic numbers t = re.sub(r'\s*\d+\.\d+.*$', '', t).strip() if t and len(t) > 3: # Avoid just numbers units[current_unit].append(t) else: # Check for bullet points topic_match = re.match(r'(?:[-•*]\s*)(.+)', line) if topic_match: topic = topic_match.group(1).strip() if topic and topic.lower() not in ['introduction', 'overview']: units[current_unit].append(topic) return units def extract_key_terms(self, topic: str) -> List[str]: """ Extract key terms from a topic for question generation. Args: topic: The topic text Returns: List of important terms from the topic """ try: # Use the instance tagger words = word_tokenize(topic.lower()) pos_tags = self.tagger.tag(words) # Extract nouns and proper nouns key_terms = [ word for word, tag in pos_tags if tag.startswith('NN') and word not in self.stop_words ] return list(set(key_terms)) # Remove duplicates except Exception as e: logger.error(f"Error extracting key terms: {str(e)}") return [] def generate_topic_based_questions(self, syllabus_text: str, content_text: str, questions_per_topic: int = 3) -> Dict[str, List[Dict]]: """ Generate questions based on syllabus topics. Args: syllabus_text: The syllabus text with units and topics content_text: The content text to generate questions from questions_per_topic: Number of questions to generate per topic Returns: Dictionary mapping topics to lists of questions """ # Parse the syllabus units = self.parse_syllabus(syllabus_text) # Process content into sentences sentences = sent_tokenize(content_text) topic_questions = {} for unit, topics in units.items(): for topic in topics: # Extract key terms from the topic key_terms = self.extract_key_terms(topic) # Find relevant sentences containing these terms relevant_sentences = [] for sentence in sentences: if any(term in sentence.lower() for term in key_terms): relevant_sentences.append(sentence) # If no relevant sentences found, use general content if not relevant_sentences: relevant_sentences = sentences # Generate questions from relevant sentences questions = self.question_generator.generate_multiple_questions( relevant_sentences, max_questions=min(questions_per_topic, len(relevant_sentences)) ) if questions: topic_questions[f"{unit} - {topic}"] = questions return topic_questions