#!/usr/bin/env python3
"""
FIXED Curriculum Analyzer - Production Version

Synchronized with optimizer logic:
1. Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW.
2. Removes IS, EECE, STAT, and other irrelevant subjects.
3. ADDS exception for undergrad-accessible 5000-level courses (CS5700).
4. FIXES bad prerequisite data (e.g., CS2500 -> CS2800).
"""

import argparse
import os
import pickle
import re
import sys
from collections import defaultdict
from typing import Dict, Set

import networkx as nx


def get_course_level(cid: str) -> int:
    """Return the first run of digits in a course ID as an integer.

    For example ``"CS2500"`` yields ``2500``. IDs that contain no digits
    yield ``9999``.
    """
    match = re.search(r'\d+', cid)
    return int(match.group(0)) if match else 9999


class CurriculumAnalyzer:
    """Loads a raw curriculum graph, cleans it, and saves the result.

    The pipeline (see ``main``) is: filter nodes, add missing critical
    prerequisite edges, drop known-bad edges, score complexity, validate
    that required courses survived, then save the graph and a text report.
    """

    # --- FIX 1: DEFINE LISTS THAT MATCH THE OPTIMIZER ---
    # Subjects the optimizer is programmed to understand.
    # ENGW/PHYS are needed only for hardcoded Year 1.
    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}

    # 5000-level courses the optimizer explicitly allows.
    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}

    # Whole-word match on the lower-cased course name. A plain substring test
    # for 'lab' also hits names such as "Scalable ..." or "Collaborative ...",
    # which wrongly removed real courses.
    _SKIP_NAME_PATTERN = re.compile(
        r"\b(?:lab(?:s|oratory|oratories)?|recitations?|seminars?"
        r"|practicums?|co-ops?)\b"
    )

    def __init__(self, graph_path, courses_path):
        """Load the pickled graph and course table, then merge them.

        Args:
            graph_path: Path to the pickled raw graph.
            courses_path: Path to the pickled mapping of course ID to a dict
                of course attributes.

        Exits the process with status 1 if either file cannot be loaded.
        """
        print("📚 Loading raw curriculum data...")
        try:
            # NOTE: pickle.load can execute arbitrary code embedded in the
            # file. Only point this tool at pickles you produced yourself.
            with open(graph_path, 'rb') as f:
                self.graph = pickle.load(f)
            with open(courses_path, 'rb') as f:
                self.courses = pickle.load(f)
        except Exception as e:
            # Deliberately broad: this is the CLI boundary, and unpickling can
            # fail with many exception types (OSError, UnpicklingError,
            # EOFError, ImportError, ...).
            print(f"❌ ERROR: Could not load files. {e}")
            # sys.exit rather than the bare exit() helper, which is injected
            # by the site module and is not always available.
            sys.exit(1)

        # Merge course data into graph nodes
        for course_id, course_data in self.courses.items():
            if self.graph.has_node(course_id):
                self.graph.nodes[course_id].update(course_data)

        print(
            f"✅ Loaded {self.graph.number_of_nodes()} courses, "
            f"{self.graph.number_of_edges()} edges"
        )

    def _should_remove(self, node, data) -> bool:
        """Return True if ``node`` must be dropped by ``pre_filter_graph``."""
        if data.get('subject', '') not in self.KEEP_SUBJECTS:
            return True

        # Guard against a missing or None name before lower-casing.
        name = (data.get('name') or '').lower()
        if self._SKIP_NAME_PATTERN.search(name):
            return True

        # Grad-level check: 5000+ is removed unless explicitly allowed.
        is_grad_level = get_course_level(node) >= 5000
        return is_grad_level and node not in self.UNDERGRAD_ACCESSIBLE_GRAD

    def pre_filter_graph(self):
        """
        --- FIX 2: IMPLEMENTS STRICT FILTERING ---
        Keeps only relevant subjects and removes labs/high-level courses.

        A node is removed when its subject is not in ``KEEP_SUBJECTS``, when
        its name contains a lab/recitation/seminar/practicum/co-op word, or
        when its level is 5000+ and it is not in
        ``UNDERGRAD_ACCESSIBLE_GRAD``.
        """
        print("\n🧹 Pre-filtering graph...")

        # Collect first, remove afterwards: the graph must not be mutated
        # while its node view is being iterated.
        nodes_to_remove = {
            node
            for node, data in self.graph.nodes(data=True)
            if self._should_remove(node, data)
        }

        original_count = self.graph.number_of_nodes()
        self.graph.remove_nodes_from(nodes_to_remove)

        print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
        print(f" Original nodes: {original_count}")
        print(f" Remaining nodes: {self.graph.number_of_nodes()}")

    def fix_chains(self):
        """Adds critical prerequisite chains that might be missing.

        An edge ``prereq -> course`` is added only when both nodes exist in
        the graph and the edge is not already present.
        """
        print("\n🔗 Validating and fixing critical prerequisite chains...")

        # A tuple (not a set) so the log output has a stable order.
        critical_chains = (
            # Foundations
            ("CS1800", "CS2800", "Discrete → Logic"),
            ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
            # Core CS
            ("CS2510", "CS3500", "Fundies 2 → OOD"),
            ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
            ("CS2800", "CS3000", "Logic → Algorithms"),
            ("CS3000", "CS3650", "Algorithms -> Systems"),
            # Core AI/ML
            ("CS3000", "CS4100", "Algorithms → AI"),
            ("CS3500", "CS4100", "OOD → AI"),
            # Core DS Path
            ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
            ("DS2500", "DS3500", "Intermediate → Advanced"),
            ("DS3500", "DS4400", "Advanced → ML1"),
            ("CS3500", "DS4400", "OOD → ML1"),
            # Math
            ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        )

        added = 0
        for prereq, course, desc in critical_chains:
            if not (self.graph.has_node(prereq) and self.graph.has_node(course)):
                continue
            if self.graph.has_edge(prereq, course):
                continue
            self.graph.add_edge(prereq, course)
            print(f" 🔧 FIXED: Added {prereq} → {course} ({desc})")
            added += 1

        if added == 0:
            print(" ✅ All critical chains present")

    def remove_spurious_chains(self):
        """
        --- FIX 3: REMOVE BAD DATA ---
        Removes known incorrect prerequisite edges from scraper.
        """
        print("\n🗑️ Removing spurious prerequisite chains...")

        # A tuple (not a set) so the log output has a stable order.
        spurious_chains = (
            ("CS2500", "CS2800"),    # Fundies 1 is NOT a prereq for Logic
            ("MATH1365", "CS2800"),  # Not a real prereq
            ("EECE2160", "CS3000"),  # Irrelevant prereq
            ("EECE2560", "CS3500"),  # Irrelevant prereq
        )

        removed = 0
        for prereq, course in spurious_chains:
            if self.graph.has_edge(prereq, course):
                self.graph.remove_edge(prereq, course)
                print(f" ✅ REMOVED: {prereq} → {course}")
                removed += 1

        if removed == 0:
            print(" ✅ No spurious chains found")

    def calculate_and_add_complexity(self):
        """Calculates and adds complexity score to each course.

        The score is ``in_degree * 10 + out_degree * 5`` and is stored on
        each node under the ``'complexity'`` attribute.
        """
        print("\n🧮 Calculating complexity scores...")

        # Heuristic: weighted by prerequisites (in) and courses unlocked (out).
        scores = {
            node: (self.graph.in_degree(node) * 10)
            + (self.graph.out_degree(node) * 5)
            for node in self.graph.nodes()
        }
        nx.set_node_attributes(self.graph, scores, 'complexity')

        print("✅ Complexity scores calculated")

    def validate_critical_courses(self) -> Dict[str, Set[str]]:
        """Check if all critical courses exist in the graph.

        Returns:
            A dict mapping category name to the set of required course IDs
            missing from the graph. Categories with nothing missing are
            omitted, so an empty dict means full coverage.
        """
        print("\n🎯 Validating critical course coverage...")

        # This list MUST match the optimizer's requirements
        required_courses = {
            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},
            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
        }

        present = set(self.graph.nodes())  # built once, reused per category
        missing = {}
        for category, courses in required_courses.items():
            missing_in_cat = courses - present
            if missing_in_cat:
                missing[category] = missing_in_cat
                print(f" ⚠️ {category}: Missing {missing_in_cat}")
            else:
                print(f" ✅ {category}: All courses present")

        return missing

    @staticmethod
    def _report_path_for(output_path: str) -> str:
        """Return the report path: ``output_path`` minus extension, plus ``_report.txt``.

        Always differs from ``output_path``, so the report can never
        overwrite the graph file, whatever extension the caller chose.
        """
        root, _ext = os.path.splitext(output_path)
        return root + '_report.txt'

    def save_enriched_graph(self, output_path):
        """Saves the final, clean, and enriched graph.

        Pickles the graph to ``output_path`` and writes a text summary (node
        count, edge count, per-subject counts) next to it.
        """
        print(f"\n💾 Saving cleaned graph to {output_path}...")
        with open(output_path, 'wb') as f:
            pickle.dump(self.graph, f)
        print("✅ Graph saved")

        # Save a summary report
        report_path = self._report_path_for(output_path)

        subject_counts = defaultdict(int)
        for node in self.graph.nodes():
            subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
            subject_counts[subject] += 1

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("Curriculum Graph Analysis Report\n")
            f.write("=" * 70 + "\n\n")
            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")

            f.write("Subject breakdown (Filtered):\n")
            for subject in sorted(subject_counts):
                f.write(f" {subject}: {subject_counts[subject]}\n")

        print(f"✅ Report saved to {report_path}")


def main(args):
    """Main execution flow.

    Args:
        args: Parsed CLI namespace with ``graph``, ``courses`` and
            ``output_graph`` attributes.
    """
    analyzer = CurriculumAnalyzer(args.graph, args.courses)

    analyzer.pre_filter_graph()
    analyzer.fix_chains()
    analyzer.remove_spurious_chains()
    analyzer.calculate_and_add_complexity()

    missing = analyzer.validate_critical_courses()
    if missing:
        print("\n⚠️ WARNING: Some critical courses are missing!")
        print(" Consider re-scraping with additional terms or subjects.")
        print(" Missing courses will be excluded from planning.")

    analyzer.save_enriched_graph(args.output_graph)

    print("\n✨ Analysis complete!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="NEU Curriculum Analyzer - Cleans and validates data"
    )
    parser.add_argument(
        '--graph',
        required=True,
        help="Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)",
    )
    parser.add_argument(
        '--courses',
        required=True,
        help="Path to RAW courses data (e.g., neu_merged_courses_...pkl)",
    )
    parser.add_argument(
        '--output-graph',
        default='neu_graph_clean.pkl',
        help="Output path for the new, clean graph",
    )

    args = parser.parse_args()
    main(args)