# AutoExamGen / syllabus_processor.py
# Source: uploaded by Omnamdev02 ("Add files via upload", commit 300f197, unverified)
import re
import nltk
import os
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime
from question_generator import QuestionGenerator
# Configure module-level logging: INFO and above go to the root handler;
# this module logs through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SyllabusProcessor:
    """Parse a syllabus into units/topics and generate topic-based questions.

    Combines regex-based syllabus parsing with NLTK POS tagging (to pick key
    terms out of each topic) and a project-local QuestionGenerator.
    """

    def __init__(self):
        """Initialize the SyllabusProcessor with necessary NLTK components.

        Downloads the required NLTK corpora/models (no-op if already cached),
        builds the stopword set, lemmatizer and POS tagger, and creates the
        question generator.

        Raises:
            Exception: re-raised after logging if any NLTK download or
                component initialization fails.
        """
        try:
            # Download required NLTK data (quiet=True suppresses progress output).
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('averaged_perceptron_tagger', quiet=True)
            # Newer NLTK releases ship the English tagger under this name;
            # downloading both keeps old and new NLTK versions working.
            nltk.download('averaged_perceptron_tagger_eng', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('omw-1.4', quiet=True)

            # Initialize NLTK components.
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()

            # Instantiate the tagger once here so extract_key_terms avoids
            # the per-call model load that nltk.pos_tag would incur.
            from nltk.tag import PerceptronTagger
            self.tagger = PerceptronTagger()

            # Initialize question generator.
            self.question_generator = QuestionGenerator()
            logger.info("SyllabusProcessor initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing SyllabusProcessor: {str(e)}")
            raise

    def parse_syllabus(self, syllabus_text: str) -> Dict[str, List[str]]:
        """
        Parse a syllabus text into topics and subtopics.

        Args:
            syllabus_text: Raw syllabus text with units and topics.

        Returns:
            Dictionary mapping unit names to lists of topic titles. Numeric
            topic prefixes such as "1.1" are stripped from each title. Topics
            found before any unit header go under "General Topics".
        """
        units: Dict[str, List[str]] = {}
        current_unit = "General Topics"
        units[current_unit] = []

        # Split into lines and process each line.
        for line in syllabus_text.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Unit headers, e.g. "Unit 1.0 Introduction" or "Unit 1: Introduction".
            unit_match = re.match(r'(?:Unit[\s\-]*\d+(?:\.\d+)?\s*[:\-]?\s*)(.+)', line, re.IGNORECASE)
            if unit_match:
                # group(0) keeps the "Unit N" prefix as part of the unit name.
                current_unit = unit_match.group(0).strip()
                units[current_unit] = []
                # Topics may follow on the same line, e.g. "Unit 1 ... 1.1 Topic".
                remaining_text = unit_match.group(1)
                inline_topics = re.findall(r'(\d+(?:\.\d+)+[\s\-]+[^0-9]+)', remaining_text)
                for t in inline_topics:
                    # BUGFIX: strip only the LEADING topic number. The previous
                    # pattern (r'\s*\d+\.\d+.*$') matched at position 0 of
                    # "1.1 Topic Name" and deleted the entire topic text, so
                    # every numbered topic was silently dropped.
                    t = re.sub(r'^\d+(?:\.\d+)+[\s\-]+', '', t).strip()
                    if t:
                        units[current_unit].append(t)
                continue

            # Numbered topic lines, e.g. "1.1 Topic Name"; several numbered
            # topics may share one line, so findall rather than match.
            topics_on_line = re.findall(r'(\d+(?:\.\d+)+[\s\-]+[^0-9]+)', line)
            if topics_on_line:
                for t in topics_on_line:
                    # Same BUGFIX as above: drop only the leading "N.N " prefix.
                    t = re.sub(r'^\d+(?:\.\d+)+[\s\-]+', '', t.strip()).strip()
                    if t and len(t) > 3:  # Skip residue too short to be a real title.
                        units[current_unit].append(t)
            else:
                # Bullet-point topics, e.g. "- Topic Name" / "• Topic Name".
                topic_match = re.match(r'(?:[-•*]\s*)(.+)', line)
                if topic_match:
                    topic = topic_match.group(1).strip()
                    # Generic section labels are not useful standalone topics.
                    if topic and topic.lower() not in ['introduction', 'overview']:
                        units[current_unit].append(topic)
        return units

    def extract_key_terms(self, topic: str) -> List[str]:
        """
        Extract key terms from a topic for question generation.

        Args:
            topic: The topic text.

        Returns:
            Sorted list of unique lowercase nouns from the topic (sorted so
            the output is deterministic across runs; the original list(set())
            order varied with hash seeding). Empty list on tagging failure.
        """
        try:
            # Lowercase first so stopword filtering and later substring
            # matching against lowercased sentences line up.
            words = word_tokenize(topic.lower())
            pos_tags = self.tagger.tag(words)
            # Keep nouns and proper nouns (tags NN, NNS, NNP, NNPS) that are
            # not stopwords.
            key_terms = {
                word for word, tag in pos_tags
                if tag.startswith('NN') and word not in self.stop_words
            }
            return sorted(key_terms)
        except Exception as e:
            logger.error(f"Error extracting key terms: {str(e)}")
            return []

    def generate_topic_based_questions(self, syllabus_text: str, content_text: str,
                                       questions_per_topic: int = 3) -> Dict[str, List[Dict]]:
        """
        Generate questions based on syllabus topics.

        Args:
            syllabus_text: The syllabus text with units and topics.
            content_text: The content text to generate questions from.
            questions_per_topic: Number of questions to generate per topic.

        Returns:
            Dictionary mapping "unit - topic" labels to lists of questions
            (question dicts as produced by QuestionGenerator).
        """
        # Parse the syllabus into unit -> topics.
        units = self.parse_syllabus(syllabus_text)
        # Process content into sentences.
        sentences = sent_tokenize(content_text)
        # Lowercase each sentence once up front instead of once per topic
        # (the lowering is loop-invariant with respect to topics).
        lowered = [(sentence, sentence.lower()) for sentence in sentences]

        topic_questions: Dict[str, List[Dict]] = {}
        for unit, topics in units.items():
            for topic in topics:
                # Extract key terms from the topic.
                key_terms = self.extract_key_terms(topic)
                # Keep sentences mentioning any key term (case-insensitive).
                relevant_sentences = [
                    original for original, low in lowered
                    if any(term in low for term in key_terms)
                ]
                # If no relevant sentences found (including when key_terms is
                # empty), fall back to the full content.
                if not relevant_sentences:
                    relevant_sentences = sentences
                # Generate questions from the selected sentences.
                questions = self.question_generator.generate_multiple_questions(
                    relevant_sentences,
                    max_questions=min(questions_per_topic, len(relevant_sentences))
                )
                if questions:
                    topic_questions[f"{unit} - {topic}"] = questions
        return topic_questions