Spaces:

ckharche
/

8674-Project

Sleeping

App Files Files Community

8674-Project / src /curriculum_analyzer.py

ckharche

added option to choose tracks

5360228 verified 4 months ago

raw

history blame contribute delete

10.2 kB

	#!/usr/bin/env python3
	"""
	FIXED Curriculum Analyzer - Production Version

	Synchronized with optimizer logic:
	1. Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW.
	2. Removes IS, EECE, STAT, and other irrelevant subjects.
	3. ADDS exceptions for undergrad-accessible 5000-level courses (e.g., CS5700).
	4. FIXES bad prerequisite data (e.g., removes the spurious CS2500 -> CS2800 edge).
	"""
	import pickle
	import argparse
	import networkx as nx
	import re
	from typing import Set, Dict
	from collections import defaultdict

	def get_course_level(cid):
	    """Return the first run of digits found in a course ID as an integer.

	    IDs that contain no digits at all yield the sentinel value 9999.
	    """
	    digits = re.search(r'\d+', cid)
	    if digits is None:
	        return 9999
	    return int(digits.group(0))

	class CurriculumAnalyzer:
	    """Clean, repair and annotate a raw curriculum prerequisite graph.

	    The analyzer loads a pickled graph and a pickled course mapping, copies
	    the course data onto the matching graph nodes, and then offers a set of
	    in-place passes (filtering, edge repair, complexity scoring, coverage
	    validation) before saving the result.

	    The graph is driven through the networkx graph API (``has_node``,
	    ``has_edge``, ``in_degree``, ``out_degree``, ...). An edge ``(a, b)`` is
	    treated as "``a`` is a prerequisite of ``b``".
	    """

	    # Subjects the optimizer is programmed to understand.
	    # ENGW/PHYS are needed only for hardcoded Year 1.
	    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}

	    # 5000-level courses the optimizer explicitly allows.
	    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}

	    # Whole-word match for names that mark labs, recitations, seminars, etc.
	    # A plain substring test for "lab" would also hit words such as
	    # "Scalable" or "Collaborative" and wrongly drop those courses.
	    _SKIP_NAME_PATTERN = re.compile(
	        r"\b(?:labs?|laboratory|laboratories|recitations?|seminars?"
	        r"|practicum|co-op)\b"
	    )

	    def __init__(self, graph_path, courses_path):
	        """Load the raw graph and course data and merge them.

	        Args:
	            graph_path: Path to the pickled curriculum graph.
	            courses_path: Path to the pickled mapping of course ID to a dict
	                of course attributes.

	        Raises:
	            SystemExit: With exit code 1 if either file cannot be loaded.
	        """
	        print("📚 Loading raw curriculum data...")
	        try:
	            # NOTE: pickle.load can execute arbitrary code; only point this
	            # at files produced by a trusted scraper run.
	            with open(graph_path, 'rb') as f:
	                self.graph = pickle.load(f)
	            with open(courses_path, 'rb') as f:
	                self.courses = pickle.load(f)
	        except Exception as e:
	            print(f"❌ ERROR: Could not load files. {e}")
	            # Same exception type the exit() builtin raises, without relying
	            # on the site module having injected that builtin.
	            raise SystemExit(1) from e

	        # Merge course data into graph nodes
	        for course_id, course_data in self.courses.items():
	            if self.graph.has_node(course_id):
	                self.graph.nodes[course_id].update(course_data)

	        print(f"✅ Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")

	    @classmethod
	    def _is_lab_or_seminar(cls, name):
	        """Return True if a course name denotes a lab/recitation/seminar/etc.

	        Matching is case-insensitive and on whole words. A missing or empty
	        name is never treated as a lab.
	        """
	        if not name:
	            return False
	        return cls._SKIP_NAME_PATTERN.search(str(name).lower()) is not None

	    @staticmethod
	    def _report_path(output_path):
	        """Derive the text-report path that accompanies ``output_path``.

	        A trailing ``.pkl`` is swapped for ``_report.txt``; any other path
	        gets ``_report.txt`` appended, so the result can never equal
	        ``output_path`` and overwrite the graph that was just saved.
	        """
	        suffix = '.pkl'
	        path = str(output_path)
	        stem = path[:-len(suffix)] if path.endswith(suffix) else path
	        return stem + '_report.txt'

	    def pre_filter_graph(self):
	        """Remove nodes the optimizer cannot use.

	        A node is dropped when its ``subject`` is not in ``KEEP_SUBJECTS``,
	        when its ``name`` marks it as a lab/recitation/seminar/practicum/
	        co-op, or when its level is 5000 or above and it is not listed in
	        ``UNDERGRAD_ACCESSIBLE_GRAD``.
	        """
	        print("\n🧹 Pre-filtering graph...")

	        nodes_to_remove = set()
	        for node, data in self.graph.nodes(data=True):
	            is_irrelevant_subject = data.get('subject', '') not in self.KEEP_SUBJECTS
	            is_lab_or_seminar = self._is_lab_or_seminar(data.get('name'))

	            # Grad-level courses are removed unless explicitly allowed.
	            is_blocked_grad = (
	                get_course_level(node) >= 5000
	                and node not in self.UNDERGRAD_ACCESSIBLE_GRAD
	            )

	            if is_irrelevant_subject or is_lab_or_seminar or is_blocked_grad:
	                nodes_to_remove.add(node)

	        original_count = self.graph.number_of_nodes()
	        self.graph.remove_nodes_from(nodes_to_remove)

	        print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
	        print(f" Original nodes: {original_count}")
	        print(f" Remaining nodes: {self.graph.number_of_nodes()}")

	    def fix_chains(self):
	        """Add critical prerequisite edges that are missing from the graph.

	        An edge is only added when both of its endpoints exist as nodes.
	        """
	        print("\n🔗 Validating and fixing critical prerequisite chains...")

	        # A tuple (not a set) so the edges are processed and logged in the
	        # same order on every run.
	        critical_chains = (
	            # Foundations
	            ("CS1800", "CS2800", "Discrete → Logic"),
	            ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
	            # Core CS
	            ("CS2510", "CS3500", "Fundies 2 → OOD"),
	            ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
	            ("CS2800", "CS3000", "Logic → Algorithms"),
	            ("CS3000", "CS3650", "Algorithms -> Systems"),
	            # Core AI/ML
	            ("CS3000", "CS4100", "Algorithms → AI"),
	            ("CS3500", "CS4100", "OOD → AI"),
	            # Core DS Path
	            ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
	            ("DS2500", "DS3500", "Intermediate → Advanced"),
	            ("DS3500", "DS4400", "Advanced → ML1"),
	            ("CS3500", "DS4400", "OOD → ML1"),
	            # Math
	            ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
	        )

	        added = 0
	        for prereq, course, desc in critical_chains:
	            if not (self.graph.has_node(prereq) and self.graph.has_node(course)):
	                continue
	            if self.graph.has_edge(prereq, course):
	                continue
	            self.graph.add_edge(prereq, course)
	            print(f" 🔧 FIXED: Added {prereq} → {course} ({desc})")
	            added += 1

	        if added == 0:
	            print(" ✅ All critical chains present")

	    def remove_spurious_chains(self):
	        """Remove known-incorrect prerequisite edges left by the scraper.

	        Edges that are not present are skipped silently.
	        """
	        print("\n🗑️ Removing spurious prerequisite chains...")

	        # Based on inspect_graph output and catalog knowledge. A tuple keeps
	        # the log order stable between runs.
	        spurious_chains = (
	            ("CS2500", "CS2800"),  # Fundies 1 is NOT a prereq for Logic
	            ("MATH1365", "CS2800"),  # Not a real prereq
	            ("EECE2160", "CS3000"),  # Irrelevant prereq
	            ("EECE2560", "CS3500"),  # Irrelevant prereq
	        )

	        removed = 0
	        for prereq, course in spurious_chains:
	            if self.graph.has_edge(prereq, course):
	                self.graph.remove_edge(prereq, course)
	                print(f" ✅ REMOVED: {prereq} → {course}")
	                removed += 1

	        if removed == 0:
	            print(" ✅ No spurious chains found")

	    def calculate_and_add_complexity(self):
	        """Store a ``complexity`` score on every node of the graph.

	        The heuristic is ``10 * in_degree + 5 * out_degree``: prerequisites
	        weigh twice as much as the courses a node unlocks.
	        """
	        print("\n🧮 Calculating complexity scores...")

	        for node in self.graph.nodes():
	            in_degree = self.graph.in_degree(node)
	            out_degree = self.graph.out_degree(node)
	            # Write straight into the node's attribute dict instead of
	            # building a one-entry mapping per node.
	            self.graph.nodes[node]['complexity'] = (in_degree * 10) + (out_degree * 5)

	        print("✅ Complexity scores calculated")

	    def validate_critical_courses(self) -> Dict[str, Set[str]]:
	        """Check that every critical course exists in the graph.

	        Returns:
	            A mapping from category name to the set of course IDs missing
	            from the graph. Categories with nothing missing are omitted, so
	            an empty dict means full coverage.
	        """
	        print("\n🎯 Validating critical course coverage...")

	        # This list MUST match the optimizer's requirements
	        required_courses = {
	            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
	            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},
	            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
	            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
	            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
	            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
	        }

	        # Build the node set once rather than once per category.
	        present = set(self.graph.nodes())

	        missing = {}
	        for category, courses in required_courses.items():
	            missing_in_cat = courses - present
	            if missing_in_cat:
	                missing[category] = missing_in_cat
	                print(f" ⚠️ {category}: Missing {missing_in_cat}")
	            else:
	                print(f" ✅ {category}: All courses present")

	        return missing

	    def save_enriched_graph(self, output_path):
	        """Pickle the graph to ``output_path`` and write a text summary.

	        The summary goes to the path returned by ``_report_path`` and lists
	        the node count, the edge count and a per-subject node breakdown.

	        Args:
	            output_path: Destination path for the pickled graph.
	        """
	        print(f"\n💾 Saving cleaned graph to {output_path}...")
	        with open(output_path, 'wb') as f:
	            pickle.dump(self.graph, f)
	        print("✅ Graph saved")

	        subject_counts = defaultdict(int)
	        for node in self.graph.nodes():
	            subject_counts[self.graph.nodes[node].get('subject', 'UNKNOWN')] += 1

	        # Save a summary report
	        report_path = self._report_path(output_path)
	        with open(report_path, 'w', encoding='utf-8') as f:
	            f.write("Curriculum Graph Analysis Report\n")
	            f.write("="*70 + "\n\n")
	            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
	            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")

	            f.write("Subject breakdown (Filtered):\n")
	            for subject in sorted(subject_counts.keys()):
	                f.write(f" {subject}: {subject_counts[subject]}\n")

	        print(f"✅ Report saved to {report_path}")

	def main(args):
	    """Run the full clean-up pipeline and save the resulting graph.

	    ``args`` must expose ``graph``, ``courses`` and ``output_graph``.
	    The passes run in a fixed order: filter, add missing chains, remove
	    spurious chains, score complexity, validate coverage, save.
	    """
	    analyzer = CurriculumAnalyzer(args.graph, args.courses)

	    # Each pass mutates the analyzer's graph in place; order matters.
	    pipeline = (
	        analyzer.pre_filter_graph,
	        analyzer.fix_chains,
	        analyzer.remove_spurious_chains,
	        analyzer.calculate_and_add_complexity,
	    )
	    for run_pass in pipeline:
	        run_pass()

	    # Missing courses only produce a warning; the graph is saved regardless.
	    if analyzer.validate_critical_courses():
	        for warning_line in (
	            "\n⚠️ WARNING: Some critical courses are missing!",
	            " Consider re-scraping with additional terms or subjects.",
	            " Missing courses will be excluded from planning.",
	        ):
	            print(warning_line)

	    analyzer.save_enriched_graph(args.output_graph)
	    print("\n✨ Analysis complete!")

	if __name__ == "__main__":
	    # Command-line entry point: two required raw inputs, one optional output.
	    parser = argparse.ArgumentParser(
	        description="NEU Curriculum Analyzer - Cleans and validates data",
	    )
	    parser.add_argument(
	        '--graph',
	        required=True,
	        help="Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)",
	    )
	    parser.add_argument(
	        '--courses',
	        required=True,
	        help="Path to RAW courses data (e.g., neu_merged_courses_...pkl)",
	    )
	    parser.add_argument(
	        '--output-graph',
	        default='neu_graph_clean.pkl',
	        help="Output path for the new, clean graph",
	    )
	    args = parser.parse_args()
	    main(args)