8674-Project / src /curriculum_analyzer.py
ckharche's picture
added option to choose tracks
5360228 verified
#!/usr/bin/env python3
"""
FIXED Curriculum Analyzer - Production Version
Synchronized with optimizer logic:
1. Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW.
2. Removes IS, EECE, STAT, and other irrelevant subjects.
3. ADDS exception for undergrad-accessible 5000-level courses (CS5700).
4. REMOVES known-bad prerequisite edges left by the scraper (e.g., the spurious CS2500 -> CS2800 edge).
"""
import argparse
import os
import pickle
import re
from collections import defaultdict
from typing import Set, Dict

import networkx as nx
def get_course_level(cid):
    """Return the course number embedded in a course ID.

    The first run of decimal digits found in ``cid`` is converted to an int
    (e.g. ``"CS2500"`` -> ``2500``). IDs without any digits yield the
    sentinel ``9999``.
    """
    digits = re.search(r'\d+', cid)
    if digits is None:
        # No numeric part at all: report the sentinel level.
        return 9999
    return int(digits.group(0))
class CurriculumAnalyzer:
    """Clean, repair and annotate a scraped curriculum prerequisite graph.

    The analyzer loads a pickled graph plus a pickled mapping of per-course
    attributes, merges the two, and then offers a series of in-place passes:
    subject/level filtering, adding known-missing prerequisite edges,
    removing known-bad ones, scoring every node, validating coverage and
    finally saving the result with a short text report.

    The graph object is used through ``has_node``/``has_edge``/``in_degree``/
    ``out_degree`` etc. -- presumably a ``networkx.DiGraph`` whose edges point
    from prerequisite to dependent course; confirm against the scraper.
    """

    # Subjects the optimizer is programmed to understand.
    # ENGW/PHYS are needed only for the hardcoded Year 1.
    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}
    # 5000-level courses the optimizer explicitly allows.
    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}
    # Keywords that mark a course as a lab/seminar-style offering. They are
    # anchored at the START of a word so that "Lab", "Laboratory" or
    # "Co-op" match, while words that merely contain the letters
    # ("Scalable", "Collaborative", "Syllabus") do not.
    _SKIP_NAME_PATTERN = re.compile(
        r"\b(?:lab|recitation|seminar|practicum|co-op)", re.IGNORECASE
    )

    def __init__(self, graph_path, courses_path):
        """Load the raw graph and course data and merge them.

        Args:
            graph_path: Path to the pickled curriculum graph.
            courses_path: Path to a pickled mapping of
                ``course_id -> attribute dict``.

        Raises:
            SystemExit: With code 1 if either file cannot be loaded.
        """
        print("📚 Loading raw curriculum data...")
        try:
            # NOTE(security): pickle can execute arbitrary code while loading.
            # Only point this tool at files produced by your own scraper.
            with open(graph_path, 'rb') as f:
                self.graph = pickle.load(f)
            with open(courses_path, 'rb') as f:
                self.courses = pickle.load(f)
        except Exception as e:
            print(f"❌ ERROR: Could not load files. {e}")
            # ``exit()`` is a helper injected by the ``site`` module for
            # interactive use; raising SystemExit works everywhere.
            raise SystemExit(1)

        # Merge course data into graph nodes (courses absent from the graph
        # are ignored).
        for course_id, course_data in self.courses.items():
            if self.graph.has_node(course_id):
                self.graph.nodes[course_id].update(course_data)
        print(f"✅ Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")

    @classmethod
    def _is_lab_or_seminar(cls, name) -> bool:
        """Return True if a course name looks like a lab/recitation/seminar/etc.

        ``name`` may be ``None`` or empty (missing attribute); that counts as
        "not a lab".
        """
        return cls._SKIP_NAME_PATTERN.search(str(name or '')) is not None

    @staticmethod
    def _report_path_for(output_path) -> str:
        """Derive the text-report path that accompanies ``output_path``.

        Only the final extension is swapped for ``_report.txt``, so the
        result can never equal ``output_path`` itself (which would make the
        report overwrite the freshly written graph) and directory names
        containing ``.pkl`` are left alone.
        """
        base, _ext = os.path.splitext(output_path)
        return f"{base}_report.txt"

    def pre_filter_graph(self):
        """Drop every node the optimizer should not see.

        A node is removed when any of the following holds:
          * its ``subject`` attribute is not in ``KEEP_SUBJECTS``
            (nodes without a subject are therefore removed too);
          * its ``name`` marks it as a lab, recitation, seminar, practicum
            or co-op;
          * its course number is 5000 or above and it is not listed in
            ``UNDERGRAD_ACCESSIBLE_GRAD``.
        """
        print("\n🧹 Pre-filtering graph...")
        nodes_to_remove = set()
        for node, data in self.graph.nodes(data=True):
            is_irrelevant_subject = data.get('subject', '') not in self.KEEP_SUBJECTS
            is_lab_or_seminar = self._is_lab_or_seminar(data.get('name'))
            is_disallowed_grad = (
                get_course_level(node) >= 5000
                and node not in self.UNDERGRAD_ACCESSIBLE_GRAD
            )
            if is_irrelevant_subject or is_lab_or_seminar or is_disallowed_grad:
                nodes_to_remove.add(node)

        # Removal happens after the scan so the node view is never mutated
        # while it is being iterated.
        original_count = self.graph.number_of_nodes()
        self.graph.remove_nodes_from(nodes_to_remove)
        print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
        print(f" Original nodes: {original_count}")
        print(f" Remaining nodes: {self.graph.number_of_nodes()}")

    def fix_chains(self):
        """Add critical prerequisite edges that are missing from the graph.

        For each ``(prereq, course, description)`` entry an edge
        ``prereq -> course`` is added when both nodes exist and the edge is
        absent. Entries whose nodes are missing are skipped silently.
        """
        print("\n🔗 Validating and fixing critical prerequisite chains...")
        # A tuple (not a set) keeps the processing and log order stable
        # from run to run.
        critical_chains = (
            # Foundations
            ("CS1800", "CS2800", "Discrete → Logic"),
            ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
            # Core CS
            ("CS2510", "CS3500", "Fundies 2 → OOD"),
            ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
            ("CS2800", "CS3000", "Logic → Algorithms"),
            ("CS3000", "CS3650", "Algorithms -> Systems"),
            # Core AI/ML
            ("CS3000", "CS4100", "Algorithms → AI"),
            ("CS3500", "CS4100", "OOD → AI"),
            # Core DS Path
            ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
            ("DS2500", "DS3500", "Intermediate → Advanced"),
            ("DS3500", "DS4400", "Advanced → ML1"),
            ("CS3500", "DS4400", "OOD → ML1"),
            # Math
            ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        )
        added = 0
        for prereq, course, desc in critical_chains:
            if not (self.graph.has_node(prereq) and self.graph.has_node(course)):
                continue
            if self.graph.has_edge(prereq, course):
                continue
            self.graph.add_edge(prereq, course)
            print(f" 🔧 FIXED: Added {prereq} → {course} ({desc})")
            added += 1
        if added == 0:
            print(" ✅ All critical chains present")

    def remove_spurious_chains(self):
        """Remove known-incorrect prerequisite edges produced by the scraper."""
        print("\n🗑️ Removing spurious prerequisite chains...")
        # Tuple for a stable processing/log order. Edges whose endpoints were
        # already dropped by an earlier filtering pass simply won't be found.
        spurious_chains = (
            ("CS2500", "CS2800"),  # Fundies 1 is NOT a prereq for Logic
            ("MATH1365", "CS2800"),  # Not a real prereq
            ("EECE2160", "CS3000"),  # Irrelevant prereq
            ("EECE2560", "CS3500"),  # Irrelevant prereq
        )
        removed = 0
        for prereq, course in spurious_chains:
            if self.graph.has_edge(prereq, course):
                self.graph.remove_edge(prereq, course)
                print(f" ✅ REMOVED: {prereq} → {course}")
                removed += 1
        if removed == 0:
            print(" ✅ No spurious chains found")

    def calculate_and_add_complexity(self):
        """Store a ``complexity`` score on every node.

        Heuristic: ``10 * in_degree + 5 * out_degree``, i.e. weighted by the
        number of incoming edges (prerequisites) and outgoing edges (courses
        unlocked) in the graph as it currently stands.
        """
        print("\n🧮 Calculating complexity scores...")
        for node in self.graph.nodes():
            score = (self.graph.in_degree(node) * 10) + (self.graph.out_degree(node) * 5)
            self.graph.nodes[node]['complexity'] = score
        print("✅ Complexity scores calculated")

    def validate_critical_courses(self) -> Dict[str, Set[str]]:
        """Report which required courses are absent from the graph.

        Returns:
            Mapping of category name to the set of missing course IDs.
            Categories with nothing missing are omitted, so an empty dict
            means full coverage.
        """
        print("\n🎯 Validating critical course coverage...")
        # This list MUST match the optimizer's requirements
        required_courses = {
            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},
            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
        }
        present = set(self.graph.nodes())  # built once, reused per category
        missing = {}
        for category, courses in required_courses.items():
            missing_in_cat = courses - present
            if missing_in_cat:
                missing[category] = missing_in_cat
                print(f" ⚠️ {category}: Missing {missing_in_cat}")
            else:
                print(f" ✅ {category}: All courses present")
        return missing

    def save_enriched_graph(self, output_path):
        """Pickle the current graph and write a companion text report.

        Args:
            output_path: Destination of the pickled graph. The report is
                written next to it, with the extension replaced by
                ``_report.txt``.
        """
        print(f"\n💾 Saving cleaned graph to {output_path}...")
        with open(output_path, 'wb') as f:
            pickle.dump(self.graph, f)
        print("✅ Graph saved")

        # Save a summary report
        report_path = self._report_path_for(output_path)
        subject_counts = defaultdict(int)
        for _node, data in self.graph.nodes(data=True):
            subject_counts[data.get('subject', 'UNKNOWN')] += 1

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("Curriculum Graph Analysis Report\n")
            f.write("="*70 + "\n\n")
            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")
            f.write("Subject breakdown (Filtered):\n")
            for subject in sorted(subject_counts):
                f.write(f" {subject}: {subject_counts[subject]}\n")
        print(f"✅ Report saved to {report_path}")
def main(args):
    """Run the whole clean-up pipeline for the parsed CLI arguments.

    ``args`` must expose ``graph``, ``courses`` and ``output_graph``.
    """
    analyzer = CurriculumAnalyzer(args.graph, args.courses)

    # The passes mutate the graph in place and run strictly in this order.
    pipeline = (
        analyzer.pre_filter_graph,
        analyzer.fix_chains,
        analyzer.remove_spurious_chains,
        analyzer.calculate_and_add_complexity,
    )
    for run_pass in pipeline:
        run_pass()

    # A non-empty result means at least one required course is absent;
    # this only warns, the graph is saved regardless.
    if analyzer.validate_critical_courses():
        for warning_line in (
            "\n⚠️ WARNING: Some critical courses are missing!",
            " Consider re-scraping with additional terms or subjects.",
            " Missing courses will be excluded from planning.",
        ):
            print(warning_line)

    analyzer.save_enriched_graph(args.output_graph)
    print("\n✨ Analysis complete!")
if __name__ == "__main__":
    # Command-line entry point: two mandatory raw inputs, one optional output.
    cli = argparse.ArgumentParser(
        description="NEU Curriculum Analyzer - Cleans and validates data",
    )
    argument_specs = (
        ('--graph', {
            'required': True,
            'help': "Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)",
        }),
        ('--courses', {
            'required': True,
            'help': "Path to RAW courses data (e.g., neu_merged_courses_...pkl)",
        }),
        ('--output-graph', {
            'default': 'neu_graph_clean.pkl',
            'help': "Output path for the new, clean graph",
        }),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    main(cli.parse_args())