Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| FIXED Curriculum Analyzer - Production Version | |
| Synchronized with optimizer logic: | |
| 1. Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW. | |
| 2. Removes IS, EECE, STAT, and other irrelevant subjects. | |
| 3. ADDS exception for undergrad-accessible 5000-level courses (CS5700). | |
| 4. REMOVES known-bad prerequisite edges left by the scraper (e.g., the spurious CS2500 -> CS2800 edge). | |
| """ | |
| import pickle | |
| import argparse | |
| import networkx as nx | |
| import re | |
| from typing import Set, Dict | |
| from collections import defaultdict | |
def get_course_level(cid):
    """Return the first run of digits in ``cid`` as an integer.

    IDs that contain no digits at all yield the sentinel ``9999``.
    """
    digits = re.search(r'\d+', cid)
    if digits is None:
        return 9999
    return int(digits.group())
class CurriculumAnalyzer:
    """Clean, repair and annotate a pickled curriculum prerequisite graph.

    The loaded graph is used through a ``networkx``-style directed-graph API
    (``nodes``, ``has_edge``, ``add_edge``, ``in_degree`` ...).  Edges point
    from a prerequisite to the course that requires it.
    """

    # Subjects the optimizer is programmed to understand.
    # ENGW/PHYS are needed only for hardcoded Year 1.
    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}

    # 5000-level courses the optimizer explicitly allows.
    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}

    # Fragments that mark a non-lecture offering when found anywhere in the
    # lower-cased course name.
    _SKIP_NAME_FRAGMENTS = ('recitation', 'seminar', 'practicum', 'co-op')

    # "lab" must match as a whole word: a plain substring test would also hit
    # titles such as "Scalable ..." or "Matlab" and silently drop real courses.
    _LAB_WORD = re.compile(r'\blab(?:s|oratory|oratories)?\b')

    def __init__(self, graph_path, courses_path):
        """Load the raw graph and course table and merge them.

        Args:
            graph_path: Path to the pickled prerequisite graph.
            courses_path: Path to the pickled mapping of course ID to a dict
                of course attributes.

        Raises:
            SystemExit: With exit code 1 if either file cannot be loaded.
        """
        print("📚 Loading raw curriculum data...")
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at files produced by a trusted scraper run.
            with open(graph_path, 'rb') as f:
                self.graph = pickle.load(f)
            with open(courses_path, 'rb') as f:
                self.courses = pickle.load(f)
        except Exception as e:
            print(f"❌ ERROR: Could not load files. {e}")
            # ``exit`` is injected by the ``site`` module and may be absent;
            # raising SystemExit gives the same exit status unconditionally.
            raise SystemExit(1) from e

        # Merge course data into graph nodes
        for course_id, course_data in self.courses.items():
            if self.graph.has_node(course_id):
                self.graph.nodes[course_id].update(course_data)

        print(f"✅ Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")

    @classmethod
    def _is_lab_or_seminar(cls, name):
        """Return True if ``name`` denotes a lab, recitation, seminar, practicum or co-op."""
        lowered = name.lower()
        if cls._LAB_WORD.search(lowered):
            return True
        return any(fragment in lowered for fragment in cls._SKIP_NAME_FRAGMENTS)

    @staticmethod
    def _report_path_for(output_path):
        """Return the text-report path that accompanies ``output_path``.

        Only a trailing ``.pkl`` is stripped, so the result always differs
        from ``output_path`` and the report can never overwrite the graph.
        """
        suffix = '.pkl'
        if output_path.endswith(suffix):
            stem = output_path[:-len(suffix)]
        else:
            stem = output_path
        return stem + '_report.txt'

    def pre_filter_graph(self):
        """Drop every node the optimizer should not see.

        A node is removed when its ``subject`` attribute is not in
        ``KEEP_SUBJECTS``, when its name marks it as a lab/recitation/
        seminar/practicum/co-op, or when it is 5000-level or above and not
        listed in ``UNDERGRAD_ACCESSIBLE_GRAD``.
        """
        print("\n🧹 Pre-filtering graph...")
        nodes_to_remove = set()
        for node, data in self.graph.nodes(data=True):
            subject = data.get('subject', '')
            # A missing or None name is treated as an empty name.
            name = str(data.get('name') or '')

            is_irrelevant_subject = subject not in self.KEEP_SUBJECTS
            is_lab_or_seminar = self._is_lab_or_seminar(name)

            # Grad-level check
            is_grad_level = get_course_level(node) >= 5000
            is_allowed_grad = node in self.UNDERGRAD_ACCESSIBLE_GRAD

            if (is_irrelevant_subject
                    or is_lab_or_seminar
                    or (is_grad_level and not is_allowed_grad)):
                nodes_to_remove.add(node)

        original_count = self.graph.number_of_nodes()
        self.graph.remove_nodes_from(nodes_to_remove)
        print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
        print(f" Original nodes: {original_count}")
        print(f" Remaining nodes: {self.graph.number_of_nodes()}")

    def fix_chains(self):
        """Add critical prerequisite edges that are missing from the graph.

        An edge is added only when both endpoints are present in the graph.
        """
        print("\n🔗 Validating and fixing critical prerequisite chains...")
        # A tuple (not a set) keeps the log output in a stable order.
        critical_chains = (
            # Foundations
            ("CS1800", "CS2800", "Discrete → Logic"),
            ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
            # Core CS
            ("CS2510", "CS3500", "Fundies 2 → OOD"),
            ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
            ("CS2800", "CS3000", "Logic → Algorithms"),
            ("CS3000", "CS3650", "Algorithms -> Systems"),
            # Core AI/ML
            ("CS3000", "CS4100", "Algorithms → AI"),
            ("CS3500", "CS4100", "OOD → AI"),
            # Core DS Path
            ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
            ("DS2500", "DS3500", "Intermediate → Advanced"),
            ("DS3500", "DS4400", "Advanced → ML1"),
            ("CS3500", "DS4400", "OOD → ML1"),
            # Math
            ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        )
        added = 0
        for prereq, course, desc in critical_chains:
            if not (self.graph.has_node(prereq) and self.graph.has_node(course)):
                continue
            if self.graph.has_edge(prereq, course):
                continue
            self.graph.add_edge(prereq, course)
            print(f" 🔧 FIXED: Added {prereq} → {course} ({desc})")
            added += 1
        if added == 0:
            print(" ✅ All critical chains present")

    def remove_spurious_chains(self):
        """Remove known-incorrect prerequisite edges produced by the scraper."""
        print("\n🗑️ Removing spurious prerequisite chains...")
        # A tuple (not a set) keeps the log output in a stable order.
        spurious_chains = (
            ("CS2500", "CS2800"),    # Fundies 1 is NOT a prereq for Logic
            ("MATH1365", "CS2800"),  # Not a real prereq
            ("EECE2160", "CS3000"),  # Irrelevant prereq
            ("EECE2560", "CS3500"),  # Irrelevant prereq
        )
        removed = 0
        for prereq, course in spurious_chains:
            if self.graph.has_edge(prereq, course):
                self.graph.remove_edge(prereq, course)
                print(f" ✅ REMOVED: {prereq} → {course}")
                removed += 1
        if removed == 0:
            print(" ✅ No spurious chains found")

    def calculate_and_add_complexity(self):
        """Store a ``complexity`` attribute on every node.

        The score is ``10 * in_degree + 5 * out_degree``, computed on the
        graph as it stands when this method is called.
        """
        print("\n🧮 Calculating complexity scores...")
        for node in self.graph.nodes():
            in_degree = self.graph.in_degree(node)
            out_degree = self.graph.out_degree(node)
            # Complexity heuristic: weighted by prerequisites and courses unlocked
            self.graph.nodes[node]['complexity'] = (in_degree * 10) + (out_degree * 5)
        print("✅ Complexity scores calculated")

    def validate_critical_courses(self) -> Dict[str, Set[str]]:
        """Report which required courses are absent from the graph.

        Returns:
            A mapping of category name to the set of required course IDs
            missing from the graph.  Categories with nothing missing are
            omitted, so an empty dict means full coverage.
        """
        print("\n🎯 Validating critical course coverage...")
        # This list MUST match the optimizer's requirements
        required_courses = {
            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},
            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
        }
        # Snapshot the node set once instead of rebuilding it per category.
        present = set(self.graph.nodes())
        missing = {}
        for category, courses in required_courses.items():
            missing_in_cat = courses - present
            if missing_in_cat:
                missing[category] = missing_in_cat
                print(f" ⚠️ {category}: Missing {missing_in_cat}")
            else:
                print(f" ✅ {category}: All courses present")
        return missing

    def save_enriched_graph(self, output_path):
        """Pickle the graph to ``output_path`` and write a text summary.

        The summary goes to a sibling ``*_report.txt`` file and lists the
        node count, edge count and number of courses per subject.

        Args:
            output_path: Destination path (string) for the pickled graph.
        """
        print(f"\n💾 Saving cleaned graph to {output_path}...")
        with open(output_path, 'wb') as f:
            pickle.dump(self.graph, f)
        print("✅ Graph saved")

        # Save a summary report
        report_path = self._report_path_for(output_path)

        subject_counts = defaultdict(int)
        for node in self.graph.nodes():
            subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
            subject_counts[subject] += 1

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("Curriculum Graph Analysis Report\n")
            f.write("="*70 + "\n\n")
            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")
            f.write("Subject breakdown (Filtered):\n")
            for subject in sorted(subject_counts.keys()):
                f.write(f" {subject}: {subject_counts[subject]}\n")
        print(f"✅ Report saved to {report_path}")
def main(args):
    """Run the whole clean-up pipeline for the paths carried by ``args``.

    ``args`` must expose ``graph``, ``courses`` and ``output_graph``.
    """
    analyzer = CurriculumAnalyzer(args.graph, args.courses)

    # The stages run strictly in this order.
    pipeline = (
        analyzer.pre_filter_graph,
        analyzer.fix_chains,
        analyzer.remove_spurious_chains,
        analyzer.calculate_and_add_complexity,
    )
    for stage in pipeline:
        stage()

    # A non-empty result means at least one required course is absent.
    if analyzer.validate_critical_courses():
        warning_lines = (
            "\n⚠️ WARNING: Some critical courses are missing!",
            " Consider re-scraping with additional terms or subjects.",
            " Missing courses will be excluded from planning.",
        )
        for line in warning_lines:
            print(line)

    analyzer.save_enriched_graph(args.output_graph)
    print("\n✨ Analysis complete!")
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(
        description="NEU Curriculum Analyzer - Cleans and validates data",
    )
    option_specs = (
        ('--graph', {
            'required': True,
            'help': "Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)",
        }),
        ('--courses', {
            'required': True,
            'help': "Path to RAW courses data (e.g., neu_merged_courses_...pkl)",
        }),
        ('--output-graph', {
            'default': 'neu_graph_clean.pkl',
            'help': "Output path for the new, clean graph",
        }),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)