# Source: 8674-Project/src/curriculum_optimizer.py
# Last change by ckharche (commit 5360228, verified): "added option to choose tracks"
"""
Curriculum Optimizer - PRODUCTION VERSION
All redundant code removed, all critical issues fixed
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer, util
import networkx as nx
import numpy as np
from typing import Dict, List, Set, Optional
from dataclasses import dataclass
import re
from datetime import datetime
@dataclass
class StudentProfile:
    """Inputs describing one student, consumed by the plan generators.

    Field order is part of the public constructor signature; the last two
    fields are optional and fall back to the defaults shown.
    """

    # Course IDs the student has already finished (e.g. "CS1800").
    completed_courses: List[str]
    # Numeric time budget; the optimizer maps it to a per-semester course count.
    time_commitment: int
    # Free-form difficulty label; the optimizer lower-cases it before mapping.
    preferred_difficulty: str
    # Free-text career goal, used for track detection and semantic scoring.
    career_goals: str
    # Free-text interest keywords, combined with the career goal.
    interests: List[str]
    current_gpa: float = 3.5
    learning_style: str = "Visual"
class HybridOptimizer:
    """Build a four-year course plan from a prerequisite graph.

    Courses are ranked with a rule-based score (requirements, track boosts,
    level/year penalties) blended with embedding similarity between the
    student's goals and each course description. An optional LLM pass can
    nominate pick-list courses, which then receive a ranking boost.

    Typical use: ``load_models()`` -> ``load_data(graph)`` ->
    ``generate_enhanced_rule_plan(student)`` or ``generate_llm_plan(student)``.
    """

    # Courses in the same group are treated as interchangeable when checking
    # prerequisites and when deciding whether a course is already done.
    EQUIVALENCY_GROUPS = [
        {"MATH1341", "MATH1241", "MATH1231"},
        {"MATH1342", "MATH1242"},
        {"PHYS1151", "PHYS1161", "PHYS1145"},
        {"PHYS1155", "PHYS1165", "PHYS1147"},
    ]

    # Mutually exclusive sequences: a plan may use at most one sequence per
    # track type.
    # NOTE(review): "MATH156"/"MATH256" look like truncated course numbers -
    # confirm against the catalogue before relying on the calculus check.
    COURSE_TRACKS = {
        "physics": {
            "engineering": ["PHYS1151", "PHYS1155"],
            "science": ["PHYS1161", "PHYS1165"],
            "life_sciences": ["PHYS1145", "PHYS1147"],
        },
        "calculus": {
            "standard": ["MATH1341", "MATH1342"],
            "computational": ["MATH156", "MATH256"],
        },
    }

    CONCENTRATION_REQUIREMENTS = {
        "ai_ml": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"],
                "sequence": True,
            },
            "core": {
                "required": ["CS3000", "CS3500"],
                "pick_1_from": ["CS3200", "CS3650", "CS5700"],
            },
            "concentration_specific": {
                "required": ["CS4100", "DS4400"],
                "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
                "pick_1_systems": ["CS4730", "CS4700"],
            },
            "math": {
                "required": ["MATH1341", "MATH1342"],
                "pick_1_from": ["MATH2331", "MATH3081"],
            },
        },
        "systems": {
            "foundations": {"required": ["CS1800", "CS2500", "CS2510", "CS2800"]},
            "core": {"required": ["CS3000", "CS3500", "CS3650"], "pick_1_from": ["CS5700", "CS3200"]},
            "concentration_specific": {"required": ["CS4700"], "pick_2_from": ["CS4730"], "pick_1_from": ["CS4400", "CS4500", "CS4520"]},
            "math": {"required": ["MATH1341", "MATH1342"]},
        },
        "security": {
            "foundations": {"required": ["CS1800", "CS2500", "CS2510", "CS2800"]},
            "core": {"required": ["CS3000", "CS3650", "CY2550"], "pick_1_from": ["CS5700", "CS3500"]},
            "concentration_specific": {"required": ["CY3740"], "pick_2_from": ["CY4740", "CY4760", "CY4770"], "pick_1_from": ["CS4700", "CS4730"]},
            "math": {"required": ["MATH1342"], "pick_1_from": ["MATH3527", "MATH3081"]},
        },
    }

    # Requirements used when the track is "general" (no concentration).
    GENERAL_REQUIREMENTS = {
        "foundations": {"required": ["CS1800", "CS2500", "CS2510", "CS2800"]},
        "core": {"required": ["CS3000", "CS3500", "CS3650"]},
        "math": {"required": ["MATH1341", "MATH1342"], "pick_1_from": ["MATH2331", "MATH3081"]},
    }

    # Per-track, per-year courses that are always ranked ahead of everything
    # else that is schedulable in that year. Year 1 has no entry on purpose.
    TRACK_YEAR_PRIORITIES = {
        "general": {
            2: ["CS3000", "CS3500", "CS3650", "MATH2331", "MATH3081", "CS3200"],
            3: ["CS4700", "CS4400", "CS4500", "CS4100"],
            4: ["CS5700", "CS4730", "CS4530", "CS4550", "CS4410"],
        },
        "ai_ml": {
            2: ["CS3000", "CS3500", "DS2500", "DS3000", "DS3500", "MATH2331", "MATH3081", "CS3650"],
            3: ["CS4100", "DS4400", "CS4120", "DS4420", "DS4440", "CS4180"],
            4: ["CS4730", "CS4700", "CS5700", "DS4300", "CS4400", "CS4500"],
        },
        "security": {
            2: ["CS3000", "CS3650", "CY2550", "MATH2331", "MATH3081", "CS3500"],
            3: ["CY3740", "CS4700", "CS5700", "CS4730"],
            4: ["CY4740", "CY4760", "CS4400"],  # CY4770 is missing from graph
        },
        "systems": {
            2: ["CS3000", "CS3500", "CS3650", "MATH2331", "CS3200"],
            3: ["CS4700", "CS5700", "CS4730", "CS4500", "CS4400"],
            4: ["CS4520", "CS4410"],
        },
        "game_dev": {
            2: ["CS3000", "CS3500", "CS3540", "MATH2331", "MATH3081", "CS3650"],
            3: ["CS4520", "CS4300", "CS4100", "CS4700"],
            4: ["CS4550", "CS4410", "CS4180"],
        },
    }

    # track -> (critical-path course IDs, score boost). "general" gets none.
    TRACK_CRITICAL_BOOSTS = {
        "ai_ml": (frozenset({"DS2500", "DS3000", "DS3500"}), 7000.0),
        "security": (frozenset({"CY2550", "CY3740"}), 7000.0),
        "systems": (frozenset({"CS3650"}), 7000.0),
        "game_dev": (frozenset({"CS3540"}), 8000.0),  # Game Programming
    }

    # 5000+ level courses that undergraduates may still be scheduled into.
    UNDERGRAD_ACCESSIBLE_GRAD = frozenset({"CS5700", "CY5700", "DS5110", "CS5010"})

    # Ranking bonus for courses nominated by the LLM. Large enough to reorder
    # otherwise comparable courses, small enough not to override the
    # required / pick-list / year-level terms.
    LLM_SUGGESTION_BOOST = 1000.0

    # Whole-word match so that e.g. "Collaborative" is not mistaken for a lab.
    _NON_LECTURE_NAME = re.compile(
        r"\b(?:labs?|laborator(?:y|ies)|recitations?|seminars?|practicums?|practica)\b"
    )

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = "meta-llama/Llama-3.1-8B-Instruct"
        self.embedding_model_name = 'BAAI/bge-large-en-v1.5'
        self.llm = None
        self.tokenizer = None
        self.embedding_model = None
        self.curriculum_graph = None
        self.courses = {}
        self.current_student = None
        # Populated by load_data(); initialised here so every method can rely
        # on the attributes existing.
        self.valid_courses = []
        self.course_embeddings = None

    def load_models(self):
        """Load the sentence-embedding model onto ``self.device``."""
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer(self.embedding_model_name, device=self.device)

    def load_llm(self):
        """Load the 4-bit quantised LLM once; a no-op without CUDA."""
        if self.device.type != 'cuda' or self.llm is not None:
            return
        print("Loading LLM for intelligent planning...")
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.llm = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quant_config,
            device_map="auto"
        )

    @classmethod
    def _is_lecture_course(cls, name) -> bool:
        """Return True when *name* is non-blank and is not a lab/recitation/seminar/practicum."""
        if not name or not name.strip():
            return False
        return cls._NON_LECTURE_NAME.search(name.lower()) is None

    def load_data(self, graph: "nx.DiGraph"):
        """Ingest the curriculum graph and pre-compute course embeddings.

        Nodes are course IDs with attribute dicts (``name``, ``description``,
        ...); edges are read elsewhere via ``predecessors`` as prerequisites.
        Sets ``curriculum_graph``, ``courses``, ``valid_courses`` and
        ``course_embeddings``.
        """
        self.curriculum_graph = graph
        self.courses = dict(graph.nodes(data=True))

        # Embeddings are required below; load the model lazily instead of
        # failing with an AttributeError on None.
        if self.embedding_model is None:
            self.load_models()

        concentration_courses = set()
        for track_reqs in self.CONCENTRATION_REQUIREMENTS.values():
            for reqs in track_reqs.values():
                if not isinstance(reqs, dict):
                    continue
                for courses in reqs.values():
                    if isinstance(courses, list):
                        concentration_courses.update(courses)

        self.valid_courses = []
        course_texts = []
        for cid, data in self.courses.items():
            name = data.get('name', '')
            if not self._is_lecture_course(name):
                continue
            if self._get_level(cid) >= 5000 and cid not in self.UNDERGRAD_ACCESSIBLE_GRAD:
                continue
            self.valid_courses.append(cid)
            course_texts.append(f"{name} {data.get('description', '')}")

        missing_required = concentration_courses - set(self.valid_courses)
        if missing_required:
            print(f"\n⚠️ WARNING: {len(missing_required)} required courses missing from graph: {sorted(missing_required)}\n")

        print(f"Computing embeddings for {len(self.valid_courses)} courses...")
        if course_texts:
            self.course_embeddings = self.embedding_model.encode(course_texts, convert_to_tensor=True, show_progress_bar=True)
        else:
            # Nothing to embed; semantic scoring degrades to "no signal".
            self.course_embeddings = None
        print(f"\nTotal valid courses: {len(self.valid_courses)}")

    def _get_level(self, course_id: str) -> int:
        """Return the first run of digits in *course_id* as an int (9999 if none)."""
        match = re.search(r'\d+', course_id)
        return int(match.group()) if match else 9999

    def _get_completed_with_equivalents(self, completed: Set[str]) -> Set[str]:
        """Return *completed* plus every course equivalent to one of its members."""
        expanded_completed = set(completed)
        for group in self.EQUIVALENCY_GROUPS:
            if not group.isdisjoint(completed):
                expanded_completed.update(group)
        return expanded_completed

    def _can_take_course(self, course_id: str, completed: Set[str]) -> bool:
        """Return True when all graph predecessors of *course_id* are satisfied.

        Courses absent from the graph (or any course when no graph is loaded)
        are treated as having no prerequisites.
        """
        graph = self.curriculum_graph
        if graph is None or course_id not in graph:
            return True
        prereqs = set(graph.predecessors(course_id))
        return prereqs.issubset(self._get_completed_with_equivalents(completed))

    def _validate_sequence(self, selected: List[str], candidate: str) -> bool:
        """Return False if *candidate* belongs to a sequence that conflicts with *selected*.

        A conflict means *selected* already contains a course from a different
        sequence of the same track type in ``COURSE_TRACKS``.
        """
        for tracks in self.COURSE_TRACKS.values():
            for track_name, sequence in tracks.items():
                if candidate not in sequence:
                    continue
                for other_track, other_seq in tracks.items():
                    if other_track != track_name and any(c in selected for c in other_seq):
                        return False
        return True

    def _score_course(self, course_id: str, semantic_scores: Dict[str, float], required_set: Set[str], picklist_set: Set[str], year: int, track: str) -> float:
        """Return the ranking score of *course_id* for the given year and track.

        Higher is better. Unknown or unnamed courses score -10000. The score
        sums: weighted semantic similarity, a non-technical-name penalty,
        subject weighting, track critical-path boosts, required / pick-list
        boosts, a small bonus for unlocking other courses, and level/year
        penalties.
        """
        course_data = self.courses.get(course_id)
        if not course_data or not course_data.get('name', '').strip():
            return -10000.0

        subject = course_data.get('subject', '')
        level = self._get_level(course_id)
        name = course_data.get('name', '').lower()

        # Semantic similarity matters most in year 4, when choices are electives.
        semantic_weight = 15.0 if year == 4 else 5.0
        score = semantic_scores.get(course_id, 0.0) * semantic_weight

        non_technical_keywords = ['society', 'ethics', 'law', 'policy', 'mobile', 'game', 'visualiz', 'web']
        if any(keyword in name for keyword in non_technical_keywords):
            # Exception: 'game' and 'mobile' courses are on-topic for game_dev.
            game_dev_exempt = track == "game_dev" and any(k in name for k in ['game', 'mobile'])
            if not game_dev_exempt:
                score -= 10000.0

        # Subject-aware scoring
        if subject in ["CS", "DS"]:
            score += 300.0
        elif subject == "CY":
            # Intro CY courses are discouraged; upper-level CY electives are fine.
            score += -500.0 if level < 3000 else 300.0
        elif subject == "MATH":
            score += 100.0
        else:
            score -= 1000.0

        # Track-aware critical path boosts
        critical_courses, boost = self.TRACK_CRITICAL_BOOSTS.get(track, (frozenset(), 0.0))
        if course_id in critical_courses:
            score += boost

        if course_id in required_set:
            score += 10000.0
        if course_id in picklist_set:
            score += 5000.0

        # Unlocking factor: favour courses that are prerequisites of others.
        graph = self.curriculum_graph
        if graph is not None and course_id in graph:
            score += min(graph.out_degree(course_id), 5) * 2.0

        # Level preference: lower-numbered courses first, all else equal.
        score -= (level / 100.0)

        # Year-specific penalties for courses below the expected level.
        if year == 4 and level < 4000:
            score -= 3000.0
        elif year == 3 and level < 3000:
            score -= 2000.0
        return score

    def _resolve_track(self, student: "StudentProfile", track_override: Optional[str]) -> str:
        """Return the user-selected track if given, else the auto-identified one."""
        if track_override:
            track = track_override
            print(f"--- Using user-selected track: {track} ---")
        else:
            track = self._identify_track(student)
            print(f"--- Auto-identified track: {track} ---")
        return track or "general"

    def generate_simple_plan(self, student: "StudentProfile", track_override: Optional[str] = None) -> Dict:
        """Alias for :meth:`generate_enhanced_rule_plan`."""
        print("--- Generating Enhanced Rule-Based Plan ---")
        self.current_student = student
        return self.generate_enhanced_rule_plan(student, track_override)

    def generate_enhanced_rule_plan(self, student: "StudentProfile", track_override: Optional[str] = None) -> Dict:
        """Build, validate and finalize a plan using rules and embeddings only.

        Returns ``{"pathway": {...}}`` as produced by :meth:`_finalize_plan`.
        """
        self.current_student = student
        track = self._resolve_track(student, track_override)

        plan = self._build_structured_plan(student, track, None)
        validation = self.validate_plan(plan, student)
        if validation["errors"]:
            plan = self._fix_plan_errors(plan, validation, student, track)
            validation = self.validate_plan(plan, student)

        difficulty_level = self._map_difficulty(student.preferred_difficulty)
        courses_per_semester = self._calculate_course_load(student.time_commitment)
        track_name = track.replace("_", " ").title()
        explanation = f"Personalized {track_name} track ({difficulty_level} difficulty, {courses_per_semester} courses/semester)"
        return self._finalize_plan(plan, explanation, validation)

    def generate_llm_plan(self, student: "StudentProfile", track_override: Optional[str] = None) -> Dict:
        """Build a plan whose pick-list choices are steered by LLM suggestions.

        Falls back to :meth:`generate_enhanced_rule_plan` when no LLM can be
        loaded (``load_llm`` only loads on CUDA).
        """
        print("--- Generating AI-Optimized Plan ---")
        self.current_student = student
        self.load_llm()
        if not self.llm:
            return self.generate_enhanced_rule_plan(student, track_override)

        # Same resolution as the rule-based path, so an explicit "general"
        # selection is honoured here too.
        track = self._resolve_track(student, track_override)

        llm_suggestions = self._get_llm_course_suggestions(student, track)
        plan = self._build_structured_plan(student, track, llm_suggestions)
        validation = self.validate_plan(plan, student)
        if validation["errors"]:
            plan = self._fix_plan_errors(plan, validation, student, track, llm_suggestions)
            validation = self.validate_plan(plan, student)

        track_name = track.replace("_", " ").title()
        explanation = self._generate_explanation(student, plan, track, f"AI-optimized {track_name}")
        return self._finalize_plan(plan, explanation, validation)

    def _build_structured_plan(self, student: "StudentProfile", track: str, llm_suggestions: Optional[List[str]] = None) -> Dict:
        """Greedily fill eight semesters with the best-ranked schedulable courses.

        Args:
            student: profile supplying completed courses, time commitment and
                the text used for semantic scoring.
            track: "general", "game_dev" or a key of
                ``CONCENTRATION_REQUIREMENTS``; unknown names use the ai_ml
                requirements and the "general" priorities.
            llm_suggestions: optional course IDs that receive
                ``LLM_SUGGESTION_BOOST`` when ranked.

        Returns:
            ``{"year_N": {"fall": [...], "spring": [...]}}`` containing only
            semesters that received at least one course.
        """
        completed = set(student.completed_courses)
        plan: Dict = {}

        if track == "general":
            print("--- Using General CS requirements ---")
            requirements = self.GENERAL_REQUIREMENTS
        elif track == "game_dev":
            print("--- Using Game Dev (AI/ML base) requirements ---")
            # ai_ml is the base; scoring and priorities handle the rest.
            requirements = self.CONCENTRATION_REQUIREMENTS["ai_ml"]
        else:
            requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])

        courses_per_semester = self._calculate_course_load(student.time_commitment)

        required_set: Set[str] = set()
        picklist_set: Set[str] = set()
        for reqs in requirements.values():
            required_set.update(reqs.get("required", []))
            for key, courses in reqs.items():
                if key.startswith("pick_"):
                    picklist_set.update(courses)

        semantic_scores = self._compute_semantic_scores(student)
        suggested = set(llm_suggestions or [])
        track_priorities = self.TRACK_YEAR_PRIORITIES.get(track, self.TRACK_YEAR_PRIORITIES["general"])

        for sem_num in range(1, 9):
            year = ((sem_num - 1) // 2) + 1

            def rank(course_id: str, year: int = year) -> float:
                score = self._score_course(course_id, semantic_scores, required_set, picklist_set, year, track)
                if course_id in suggested:
                    score += self.LLM_SUGGESTION_BOOST
                return score

            # A course equivalent to something already done must not be
            # scheduled again, so filter against the expanded set.
            satisfied = self._get_completed_with_equivalents(completed)
            schedulable = [
                c for c in self._get_available_courses(completed, year, sem_num, track)
                if c not in satisfied and self._can_take_course(c, completed)
            ]

            year_priorities = track_priorities.get(year)
            if year_priorities:
                schedulable_lookup = set(schedulable)
                priority_lookup = set(year_priorities)
                priority_courses = [c for c in year_priorities if c in schedulable_lookup]
                other_courses = [c for c in schedulable if c not in priority_lookup]
                # Priority courses always precede the rest; each group is
                # ordered by score.
                scored_courses = (
                    sorted(priority_courses, key=rank, reverse=True)
                    + sorted(other_courses, key=rank, reverse=True)
                )
            else:
                # Year 1: normal scoring
                scored_courses = sorted(schedulable, key=rank, reverse=True)

            selected: List[str] = []
            for course in scored_courses:
                if len(selected) >= courses_per_semester:
                    break
                # Check against everything taken so far, not just this
                # semester, so sequences are never mixed across terms.
                if self._validate_sequence([*completed, *selected], course):
                    selected.append(course)

            if selected:
                sem_type = 'fall' if (sem_num % 2) == 1 else 'spring'
                plan.setdefault(f"year_{year}", {})[sem_type] = selected
                completed.update(selected)
        return plan

    def _get_available_courses(self, completed: Set[str], year: int, sem_num: Optional[int] = None, track: str = "ai_ml") -> List[str]:
        """Return course IDs eligible to be considered in *year*.

        Year 1 uses a hard-coded foundation list. Years 2-4 keep CS/DS/CY/MATH
        courses within the level band for that year. ``sem_num`` and ``track``
        are accepted for interface compatibility and are not used.
        """
        valid = set(self.valid_courses)

        if year == 1:
            if len(completed) < 2:
                return [c for c in ["CS1800", "CS2500", "MATH1341", "ENGW1111"] if c in valid]
            follow_ups = [
                ("CS2800", "CS1800"),
                ("CS2510", "CS2500"),
                ("MATH1342", "MATH1341"),
                ("DS2000", None),
                ("DS2500", "DS2000"),
            ]
            return [
                course for course, prereq in follow_ups
                if course in valid and course not in completed
                and (prereq is None or prereq in completed)
            ]

        allowed_subjects = {"CS", "DS", "CY", "MATH"}
        available = []
        for cid in self.valid_courses:
            if cid in completed:
                continue
            if self.courses.get(cid, {}).get('subject') not in allowed_subjects:
                continue
            course_level = self._get_level(cid)
            if year == 2 and not (2000 <= course_level <= 3999):
                continue  # Year 2: only 2000-3999
            if year == 3 and course_level < 3000:
                continue  # Year 3: 3000+ only
            if year == 4 and course_level < 4000:
                continue  # Year 4: 4000+ only (including CS5700)
            available.append(cid)
        return available

    def _fix_plan_errors(self, plan: Dict, validation: Dict, student: "StudentProfile", track: Optional[str] = None, llm_suggestions: Optional[List[str]] = None) -> Dict:
        """Rebuild the plan when validation reported mixed sequences.

        ``track`` and ``llm_suggestions`` keep the rebuild consistent with the
        plan being repaired; when ``track`` is omitted it is auto-identified.
        Other kinds of errors leave *plan* untouched.
        """
        if not any("Mixed" in error for error in validation["errors"]):
            return plan
        rebuild_track = track or self._identify_track(student)
        return self._build_structured_plan(student, rebuild_track, llm_suggestions)

    def _get_llm_course_suggestions(self, student: "StudentProfile", track: str) -> List[str]:
        """Ask the LLM to pick up to five pick-list courses for *student*.

        Only IDs that are pick-list options of *track* are returned, without
        duplicates. Returns ``[]`` when the track has no pick lists, and the
        first five options (sorted) if generation fails.
        """
        requirements = self.CONCENTRATION_REQUIREMENTS.get(track, {})
        # Sorted so the prompt and the fallback are reproducible across runs.
        all_options = sorted({
            course
            for reqs in requirements.values()
            for key, courses in reqs.items()
            if key.startswith("pick_")
            for course in courses
        })
        if not all_options:
            return []

        course_options_text = [
            f"{cid}: {self.courses[cid].get('name', cid)} - {self.courses[cid].get('description', '')[:100].strip()}"
            for cid in [c for c in all_options if c in self.courses][:15]
        ]
        prompt = f"""Expert curriculum advisor ranking courses for student.
Student Profile:
- Career Goal: {student.career_goals}
- Interests: {', '.join(student.interests)}
- Difficulty: {student.preferred_difficulty}
Available Courses:
{chr(10).join(course_options_text)}
Return ONLY top 5 course IDs, one per line."""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(self.device)
            with torch.no_grad():
                outputs = self.llm.generate(
                    **inputs,
                    max_new_tokens=100,
                    temperature=0.2,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )
            # Decode only the newly generated tokens, not the echoed prompt.
            response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
        except Exception as e:
            # Best-effort: any model failure falls back to a deterministic pick.
            print(f"LLM suggestion failed: {e}")
            return all_options[:5]

        option_lookup = set(all_options)
        suggested_courses: List[str] = []
        for cid in re.findall(r'([A-Z]{2,4}\d{4})', response):
            # Drop hallucinated IDs and repeats.
            if cid in option_lookup and cid not in suggested_courses:
                suggested_courses.append(cid)
        return suggested_courses[:5]

    def _map_difficulty(self, preferred_difficulty: str) -> str:
        """Map easy/moderate/challenging (any case) to easy/medium/hard; default medium."""
        key = (preferred_difficulty or "").lower()
        return {"easy": "easy", "moderate": "medium", "challenging": "hard"}.get(key, "medium")

    def _calculate_course_load(self, time_commitment: int) -> int:
        """Return courses per semester: 3 up to 20, 4 up to 40, otherwise 5."""
        if time_commitment <= 20:
            return 3
        if time_commitment <= 40:
            return 4
        return 5

    @staticmethod
    def _mentions_any(text: str, keywords: List[str]) -> bool:
        """Return True if *text* contains any keyword.

        Keywords of one or two characters must match as whole words, so "ai"
        does not fire on "training" nor "ml" on "html"; longer keywords match
        as substrings.
        """
        for keyword in keywords:
            if len(keyword) <= 2:
                if re.search(rf"\b{re.escape(keyword)}\b", text):
                    return True
            elif keyword in text:
                return True
        return False

    def _identify_track(self, student: "StudentProfile") -> str:
        """Infer "ai_ml", "systems" or "security" from the student's goals and interests.

        Uses embedding similarity when an embedding model is loaded, otherwise
        keyword matching. Defaults to "ai_ml".
        """
        if getattr(self, 'embedding_model', None) is None:
            combined = f"{student.career_goals.lower()} {' '.join(student.interests).lower()}"
            keyword_rules = [
                ("ai_ml", ['ai', 'ml', 'machine learning', 'data']),
                ("systems", ['systems', 'distributed', 'backend']),
                ("security", ['security', 'cyber']),
            ]
            for track_name, keywords in keyword_rules:
                if self._mentions_any(combined, keywords):
                    return track_name
            return "ai_ml"

        profile_text = f"{student.career_goals} {' '.join(student.interests)}"
        profile_emb = self.embedding_model.encode(profile_text, convert_to_tensor=True)
        track_descriptions = {
            "ai_ml": "artificial intelligence machine learning deep learning neural networks data science",
            "systems": "operating systems distributed systems networks compilers databases performance backend",
            "security": "cybersecurity cryptography network security ethical hacking vulnerabilities"
        }
        best_track, best_score = "ai_ml", -1.0
        for track, description in track_descriptions.items():
            track_emb = self.embedding_model.encode(description, convert_to_tensor=True)
            score = float(util.cos_sim(profile_emb, track_emb))
            if score > best_score:
                best_score, best_track = score, track
        return best_track

    def _compute_semantic_scores(self, student: "StudentProfile") -> Dict[str, float]:
        """Return cosine similarity between the student's profile and each valid course.

        Returns an empty dict when no embedding model or course embeddings are
        available; scoring then treats every course as similarity 0.
        """
        model = getattr(self, 'embedding_model', None)
        embeddings = getattr(self, 'course_embeddings', None)
        valid_courses = getattr(self, 'valid_courses', None)
        if model is None or embeddings is None or not valid_courses:
            return {}
        query_text = f"{student.career_goals} {' '.join(student.interests)}"
        query_emb = model.encode(query_text, convert_to_tensor=True)
        similarities = util.cos_sim(query_emb, embeddings)[0]
        return {cid: float(similarities[idx]) for idx, cid in enumerate(valid_courses)}

    def _generate_explanation(self, student: "StudentProfile", plan: Dict, track: str, plan_type: str) -> str:
        """Return the one-sentence rationale shown with an LLM-assisted plan."""
        return f"{plan_type.title()} plan for the {track} track, tailored to your goal of becoming a {student.career_goals}."

    def validate_plan(self, plan: Dict, student: "StudentProfile" = None) -> Dict[str, List[str]]:
        """Check a plan for mixed sequences and unmet prerequisites.

        Accepts both raw plans and finalized pathways: non-dict year entries
        and non-list semester entries (such as ``"co-op"``) are ignored.

        Returns:
            ``{"errors": [...], "warnings": [...], "info": [...]}``.
        """
        issues = {"errors": [], "warnings": [], "info": []}
        all_courses = {
            course
            for year in plan.values() if isinstance(year, dict)
            for sem in year.values() if isinstance(sem, list)
            for course in sem
        }

        # Check for mixed tracks
        for track_type, tracks in self.COURSE_TRACKS.items():
            tracks_used = sorted(
                name for name, courses in tracks.items()
                if any(c in all_courses for c in courses)
            )
            if len(tracks_used) > 1:
                issues["errors"].append(f"Mixed {track_type} tracks: {', '.join(tracks_used)}. Choose one sequence.")

        graph = self.curriculum_graph
        if graph is None:
            issues["warnings"].append("Curriculum graph not loaded; prerequisites were not checked.")
            return issues

        # Validate prerequisites semester by semester, in calendar order.
        completed_for_validation = set(student.completed_courses) if student else set()
        for year in range(1, 5):
            year_plan = plan.get(f"year_{year}", {})
            for sem in ["fall", "spring"]:
                sem_courses = year_plan.get(sem, [])
                # Courses taken in the same semester do not satisfy each other.
                effective = self._get_completed_with_equivalents(completed_for_validation)
                for course in sem_courses:
                    if course not in graph:
                        continue
                    # Report only prereqs that equivalents do not cover either.
                    missing = set(graph.predecessors(course)) - effective
                    if missing:
                        issues["errors"].append(f"{course} in Year {year} {sem} is missing prereqs: {', '.join(sorted(missing))}")
                completed_for_validation.update(sem_courses)
        return issues

    def _finalize_plan(self, plan: Dict, explanation: str, validation: Dict = None) -> Dict:
        """Wrap *plan* into the response structure returned to callers.

        Adds the explanation, validation results, per-year semester lists
        (summer is ``"co-op"`` in years 2 and 3, otherwise ``[]``), complexity
        statistics and metadata. Courses without a ``complexity`` attribute
        count as 50.

        Returns:
            ``{"pathway": structured_plan}``.
        """
        structured_plan = {
            "reasoning": explanation,
            "validation": validation or {"errors": [], "warnings": [], "info": []}
        }
        complexities = []
        for year in range(1, 5):
            year_key = f"year_{year}"
            year_plan = plan.get(year_key, {})
            structured_plan[year_key] = {
                "fall": year_plan.get("fall", []),
                "spring": year_plan.get("spring", []),
                "summer": "co-op" if year in [2, 3] else []
            }
            for sem in ["fall", "spring"]:
                courses = structured_plan[year_key][sem]
                if courses:
                    complexities.append(sum(self.courses.get(c, {}).get('complexity', 50) for c in courses))

        structured_plan["complexity_analysis"] = {
            "average_semester_complexity": float(np.mean(complexities)) if complexities else 0,
            "peak_semester_complexity": float(np.max(complexities)) if complexities else 0,
            "total_complexity": float(np.sum(complexities)) if complexities else 0,
            "balance_score (std_dev)": float(np.std(complexities)) if complexities else 0
        }
        structured_plan["metadata"] = {
            "generated": datetime.now().isoformat(),
            "valid": len(validation.get("errors", [])) == 0 if validation else True,
        }
        return {"pathway": structured_plan}
class CurriculumOptimizer(HybridOptimizer):
    """Backward-compatible facade exposing ``generate_plan`` on the hybrid optimizer."""

    def __init__(self):
        super().__init__()

    def generate_plan(self, student: StudentProfile, track_override: Optional[str] = None) -> Dict:
        """Delegate to the rule-based planner and return its result unchanged."""
        result = self.generate_enhanced_rule_plan(student, track_override)
        return result