# Spaces: Omnamdev02 / AutoExamGen (Sleeping) | File size: 7,126 Bytes | 300f197

import re
import nltk
import os
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime
from question_generator import QuestionGenerator

# Configure logging.
# NOTE: basicConfig() runs as a side effect of importing this module and sets
# the root logger to INFO (it does nothing if the root logger already has
# handlers configured).
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class SyllabusProcessor:
    def __init__(self):
        """Initialize the SyllabusProcessor with necessary NLTK components.

        Requests the NLTK resources used by this class, then builds the
        stop-word set, lemmatizer, POS tagger and question generator.

        Raises:
            Exception: Any error raised while loading NLTK data or while
                constructing the tagger / QuestionGenerator is logged and
                re-raised unchanged.
        """
        # Both the legacy and the current resource names are requested so the
        # class works across NLTK generations: NLTK 3.9+ loads 'punkt_tab'
        # (tokenizers) and 'averaged_perceptron_tagger_eng' (tagger), older
        # releases load 'punkt' and 'averaged_perceptron_tagger'.
        required_resources = (
            'punkt',
            'punkt_tab',
            'stopwords',
            'averaged_perceptron_tagger',
            'averaged_perceptron_tagger_eng',
            'wordnet',
            'omw-1.4',
        )

        try:
            # Download required NLTK data
            for resource in required_resources:
                # download() reports failure through its return value rather
                # than by raising, so surface it instead of ignoring it.
                if not nltk.download(resource, quiet=True):
                    logger.warning("Could not download NLTK resource '%s'", resource)

            # Initialize NLTK components
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()

            # Import and initialize the PerceptronTagger
            from nltk.tag import PerceptronTagger
            self.tagger = PerceptronTagger()

            # Initialize question generator
            self.question_generator = QuestionGenerator()

            logger.info("SyllabusProcessor initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing SyllabusProcessor: {str(e)}")
            raise

    def parse_syllabus(self, syllabus_text: str) -> Dict[str, List[str]]:
        """
        Parse a syllabus text into units and their topics.

        Recognised line formats (blank and unrecognised lines are ignored):
          * Unit headers, e.g. "Unit 1", "Unit 1.0 Introduction" or
            "Unit 1: Introduction" ("Unit" is matched case-insensitively).
            Numbered topics written on the same line as the header are
            split off and stored as topics of that unit.
          * Numbered topics, e.g. "1.1 Topic Name"; several may share a line.
          * Bullet topics, e.g. "- Topic", "• Topic" or "* Topic". Bullets
            whose text is just "introduction" or "overview" are skipped.

        Args:
            syllabus_text: Raw syllabus text with units and topics

        Returns:
            Dictionary mapping unit names to lists of topics. The key
            "General Topics" is always present and holds topics that appear
            before the first unit header. Numbered topics keep their number
            prefix (e.g. "1.1 Topic Name").
        """
        # Title group is optional so a bare "Unit 1" header is recognised too.
        unit_header_re = re.compile(
            r'Unit[\s\-]*\d+(?:\.\d+)?\s*[:\-]?\s*(.*)', re.IGNORECASE
        )
        # A topic is "<n.m[.k...]> <name>"; the name runs until the next
        # "<n.m> " marker or the end of the text. Matching lazily up to the
        # next marker (instead of stopping at any digit) keeps names such as
        # "Python 3 syntax" intact.
        numbered_topic_re = re.compile(
            r'\d+(?:\.\d+)+[\s\-]+.+?(?=[\s,;]+\d+(?:\.\d+)+[\s\-]|$)'
        )
        section_number_re = re.compile(r'\d+(?:\.\d+)+')
        bullet_re = re.compile(r'[-•*]\s*(.+)')
        trailing_separators = ' \t.,;:-'

        def numbered_topics(text):
            """Return the cleaned numbered topics found in *text*, in order."""
            cleaned = []
            for found in numbered_topic_re.finditer(text):
                # Only trailing separators are trimmed; the leading number is
                # part of the topic and must not be stripped.
                topic = found.group(0).rstrip(trailing_separators)
                # Avoid entries that are just a number with no name left.
                if not section_number_re.fullmatch(topic):
                    cleaned.append(topic)
            return cleaned

        current_unit = "General Topics"
        units = {current_unit: []}

        if not syllabus_text:
            return units

        # Split into lines and process each line
        for raw_line in syllabus_text.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # Check for unit headers
            header = unit_header_re.match(line)
            if header:
                title = header.group(1)
                first_inline = numbered_topic_re.search(title)
                if first_inline:
                    # Unit name is the header text before the first inline topic.
                    name_end = header.start(1) + first_inline.start()
                    current_unit = line[:name_end].rstrip(trailing_separators)
                else:
                    current_unit = line
                # setdefault: a repeated header must not wipe earlier topics.
                units.setdefault(current_unit, []).extend(numbered_topics(title))
                continue

            # Check for numbered topic lines (possibly several on one line)
            if numbered_topic_re.search(line):
                units[current_unit].extend(numbered_topics(line))
                continue

            # Check for bullet points
            bullet = bullet_re.match(line)
            if bullet:
                topic = bullet.group(1).strip()
                if topic and topic.lower() not in ('introduction', 'overview'):
                    units[current_unit].append(topic)

        return units
    
    def extract_key_terms(self, topic: str) -> List[str]:
        """

        Extract key terms from a topic for question generation.

        

        Args:

            topic: The topic text

            

        Returns:

            List of important terms from the topic

        """
        try:
            # Use the instance tagger
            words = word_tokenize(topic.lower())
            pos_tags = self.tagger.tag(words)
            
            # Extract nouns and proper nouns
            key_terms = [
                word for word, tag in pos_tags 
                if tag.startswith('NN') and word not in self.stop_words
            ]
            
            return list(set(key_terms))  # Remove duplicates
            
        except Exception as e:
            logger.error(f"Error extracting key terms: {str(e)}")
            return []

    def generate_topic_based_questions(self, syllabus_text: str, content_text: str, 

                                     questions_per_topic: int = 3) -> Dict[str, List[Dict]]:
        """

        Generate questions based on syllabus topics.

        

        Args:

            syllabus_text: The syllabus text with units and topics

            content_text: The content text to generate questions from

            questions_per_topic: Number of questions to generate per topic

            

        Returns:

            Dictionary mapping topics to lists of questions

        """
        # Parse the syllabus
        units = self.parse_syllabus(syllabus_text)
        
        # Process content into sentences
        sentences = sent_tokenize(content_text)
        
        topic_questions = {}
        
        for unit, topics in units.items():
            for topic in topics:
                # Extract key terms from the topic
                key_terms = self.extract_key_terms(topic)
                
                # Find relevant sentences containing these terms
                relevant_sentences = []
                for sentence in sentences:
                    if any(term in sentence.lower() for term in key_terms):
                        relevant_sentences.append(sentence)
                
                # If no relevant sentences found, use general content
                if not relevant_sentences:
                    relevant_sentences = sentences
                
                # Generate questions from relevant sentences
                questions = self.question_generator.generate_multiple_questions(
                    relevant_sentences, 
                    max_questions=min(questions_per_topic, len(relevant_sentences))
                )
                
                if questions:
                    topic_questions[f"{unit} - {topic}"] = questions
        
        return topic_questions