Spaces:
Sleeping
Sleeping
File size: 10,238 Bytes
5360228 a522797 5360228 a522797 9f3c8c1 5360228 a522797 5360228 a522797 5360228 9f3c8c1 a522797 5360228 9f3c8c1 a522797 9f3c8c1 a522797 5360228 9f3c8c1 5360228 a522797 5360228 a522797 9f3c8c1 5360228 9f3c8c1 5360228 9f3c8c1 5360228 9f3c8c1 5360228 9f3c8c1 a522797 9f3c8c1 5360228 9f3c8c1 a522797 5360228 9f3c8c1 5360228 9f3c8c1 a522797 9f3c8c1 5360228 9f3c8c1 a522797 9f3c8c1 a522797 9f3c8c1 5360228 9f3c8c1 5360228 9f3c8c1 5360228 9f3c8c1 5360228 9f3c8c1 a522797 9f3c8c1 a522797 9f3c8c1 5360228 9f3c8c1 a522797 9f3c8c1 a522797 9f3c8c1 a522797 9f3c8c1 a522797 9f3c8c1 5360228 a522797 9f3c8c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
#!/usr/bin/env python3
"""
FIXED Curriculum Analyzer - Production Version
Synchronized with optimizer logic:
1. Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW.
2. Removes IS, EECE, STAT, and other irrelevant subjects.
3. ADDS exception for undergrad-accessible 5000-level courses (CS5700).
4. FIXES bad prerequisite data (e.g., CS2500 -> CS2800).
"""
import pickle
import argparse
import networkx as nx
import re
from typing import Set, Dict
from collections import defaultdict
def get_course_level(cid):
    """Return the numeric portion of a course ID (e.g. 2500 for "CS2500").

    IDs with no digits fall back to 9999 so they are treated as
    highest-level and excluded by any level-based filter.
    """
    digits = re.search(r'\d+', cid)
    if digits is None:
        return 9999
    return int(digits.group(0))
class CurriculumAnalyzer:
    """Cleans and enriches a scraped curriculum graph so it matches the
    course optimizer's expectations.

    Pipeline (driven by main()): filter irrelevant subjects, add missing
    critical prerequisite edges, drop known-bad scraped edges, score
    course complexity, validate critical coverage, save graph + report.
    """

    # --- FIX 1: DEFINE LISTS THAT MATCH THE OPTIMIZER ---
    # Subjects the optimizer is programmed to understand.
    # ENGW/PHYS are needed only for hardcoded Year 1.
    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}

    # 5000-level courses the optimizer explicitly allows.
    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}

    def __init__(self, graph_path, courses_path):
        """Load the pickled graph and course-metadata dict, then merge
        each course's metadata onto its matching graph node.

        Exits the process with status 1 if either file cannot be loaded.
        """
        print("📂 Loading raw curriculum data...")
        try:
            with open(graph_path, 'rb') as f:
                self.graph = pickle.load(f)
            with open(courses_path, 'rb') as f:
                self.courses = pickle.load(f)
        except Exception as e:
            print(f"❌ ERROR: Could not load files. {e}")
            # exit() is only injected by the site module; raising
            # SystemExit directly is equivalent and always available.
            raise SystemExit(1)
        # Merge course data into graph nodes
        for course_id, course_data in self.courses.items():
            if self.graph.has_node(course_id):
                self.graph.nodes[course_id].update(course_data)
        print(f"✅ Loaded {self.graph.number_of_nodes()} courses, "
              f"{self.graph.number_of_edges()} edges")

    def pre_filter_graph(self):
        """
        --- FIX 2: IMPLEMENTS STRICT FILTERING ---
        Keeps only relevant subjects and removes labs/high-level courses.
        """
        print("\n🧹 Pre-filtering graph...")
        nodes_to_remove = set()
        for node, data in self.graph.nodes(data=True):
            subject = data.get('subject', '')
            name = data.get('name', '').lower()
            level = get_course_level(node)
            # Check for removal
            is_irrelevant_subject = subject not in self.KEEP_SUBJECTS
            is_lab_or_seminar = any(
                skip in name
                for skip in ['lab', 'recitation', 'seminar', 'practicum', 'co-op'])
            # Grad-level check: drop 5000+ unless explicitly whitelisted.
            is_grad_level = level >= 5000
            is_allowed_grad = node in self.UNDERGRAD_ACCESSIBLE_GRAD
            if (is_irrelevant_subject or
                    is_lab_or_seminar or
                    (is_grad_level and not is_allowed_grad)):
                nodes_to_remove.add(node)
        original_count = self.graph.number_of_nodes()
        self.graph.remove_nodes_from(nodes_to_remove)
        print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
        print(f"   Original nodes: {original_count}")
        print(f"   Remaining nodes: {self.graph.number_of_nodes()}")

    def fix_chains(self):
        """Adds critical prerequisite chains that might be missing."""
        print("\n🔗 Validating and fixing critical prerequisite chains...")
        critical_chains = {
            # Foundations
            ("CS1800", "CS2800", "Discrete → Logic"),
            ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
            # Core CS
            ("CS2510", "CS3500", "Fundies 2 → OOD"),
            ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
            ("CS2800", "CS3000", "Logic → Algorithms"),
            # --- THIS IS THE FIX ---
            ("CS3000", "CS3650", "Algorithms -> Systems"),
            # ---------------------
            # Core AI/ML
            ("CS3000", "CS4100", "Algorithms → AI"),
            ("CS3500", "CS4100", "OOD → AI"),
            # Core DS Path
            ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
            ("DS2500", "DS3500", "Intermediate → Advanced"),
            ("DS3500", "DS4400", "Advanced → ML1"),
            ("CS3500", "DS4400", "OOD → ML1"),
            # Math
            ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        }
        added = 0
        for prereq, course, desc in critical_chains:
            # Only add the edge when both endpoints survived filtering.
            if self.graph.has_node(prereq) and self.graph.has_node(course):
                if not self.graph.has_edge(prereq, course):
                    self.graph.add_edge(prereq, course)
                    print(f"   🔧 FIXED: Added {prereq} → {course} ({desc})")
                    added += 1
        if added == 0:
            print("   ✅ All critical chains present")

    def remove_spurious_chains(self):
        """
        --- FIX 3: REMOVE BAD DATA ---
        Removes known incorrect prerequisite edges from scraper.
        """
        print("\n🗑️ Removing spurious prerequisite chains...")
        # Based on inspect_graph output and catalog knowledge
        spurious_chains = {
            ("CS2500", "CS2800"),   # Fundies 1 is NOT a prereq for Logic
            ("MATH1365", "CS2800"), # Not a real prereq
            ("EECE2160", "CS3000"), # Irrelevant prereq
            ("EECE2560", "CS3500"), # Irrelevant prereq
        }
        removed = 0
        for prereq, course in spurious_chains:
            if self.graph.has_edge(prereq, course):
                self.graph.remove_edge(prereq, course)
                print(f"   ✅ REMOVED: {prereq} → {course}")
                removed += 1
        if removed == 0:
            print("   ✅ No spurious chains found")

    def calculate_and_add_complexity(self):
        """Calculates and adds a 'complexity' score to each course.

        Heuristic: 10 points per prerequisite (in-edge) plus 5 points
        per course it unlocks (out-edge).
        """
        print("\n🧮 Calculating complexity scores...")
        for node in self.graph.nodes():
            # Use degrees on the *cleaned* graph
            in_degree = self.graph.in_degree(node)
            out_degree = self.graph.out_degree(node)
            score = (in_degree * 10) + (out_degree * 5)
            # Direct node-attribute assignment: equivalent to a per-node
            # nx.set_node_attributes call, without the function overhead.
            self.graph.nodes[node]['complexity'] = score
        print("✅ Complexity scores calculated")

    def validate_critical_courses(self) -> Dict[str, Set[str]]:
        """Check if all critical courses exist in the graph.

        Returns:
            Mapping of category name -> set of missing course IDs;
            empty dict when every required course is present.
        """
        print("\n🎯 Validating critical course coverage...")
        # This list MUST match the optimizer's requirements
        required_courses = {
            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},  # Added CS5700
            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
        }
        missing = {}
        for category, courses in required_courses.items():
            missing_in_cat = courses - set(self.graph.nodes())
            if missing_in_cat:
                missing[category] = missing_in_cat
                print(f"   ⚠️ {category}: Missing {missing_in_cat}")
            else:
                print(f"   ✅ {category}: All courses present")
        return missing

    def save_enriched_graph(self, output_path):
        """Saves the final, clean, enriched graph and a text summary
        report (same path with '_report.txt' in place of '.pkl').
        """
        print(f"\n💾 Saving cleaned graph to {output_path}...")
        with open(output_path, 'wb') as f:
            pickle.dump(self.graph, f)
        print("✅ Graph saved")
        # Save a summary report
        report_path = output_path.replace('.pkl', '_report.txt')
        with open(report_path, 'w') as f:
            f.write("Curriculum Graph Analysis Report\n")
            f.write("=" * 70 + "\n\n")
            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")
            subject_counts = defaultdict(int)
            for node in self.graph.nodes():
                subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
                subject_counts[subject] += 1
            f.write("Subject breakdown (Filtered):\n")
            for subject in sorted(subject_counts.keys()):
                f.write(f"  {subject}: {subject_counts[subject]}\n")
        print(f"✅ Report saved to {report_path}")
def main(args):
    """Main execution flow: load, clean, repair, score, validate, save.

    Args:
        args: argparse.Namespace with attributes ``graph``, ``courses``,
            and ``output_graph`` (paths; see the CLI definition below).
    """
    analyzer = CurriculumAnalyzer(args.graph, args.courses)
    analyzer.pre_filter_graph()
    analyzer.fix_chains()
    analyzer.remove_spurious_chains()
    analyzer.calculate_and_add_complexity()
    missing = analyzer.validate_critical_courses()
    if missing:
        # Missing courses are a warning, not an error: the pipeline
        # still saves the graph so planning can proceed without them.
        print("\n⚠️ WARNING: Some critical courses are missing!")
        print("   Consider re-scraping with additional terms or subjects.")
        print("   Missing courses will be excluded from planning.")
    analyzer.save_enriched_graph(args.output_graph)
    print("\n✨ Analysis complete!")
if __name__ == "__main__":
    # CLI entry point: define the three path arguments and run the pipeline.
    cli = argparse.ArgumentParser(
        description="NEU Curriculum Analyzer - Cleans and validates data")
    cli.add_argument('--graph', required=True,
                     help="Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)")
    cli.add_argument('--courses', required=True,
                     help="Path to RAW courses data (e.g., neu_merged_courses_...pkl)")
    cli.add_argument('--output-graph', default='neu_graph_clean.pkl',
                     help="Output path for the new, clean graph")
    main(cli.parse_args())