File size: 10,238 Bytes
5360228
a522797
5360228
 
 
 
 
 
 
a522797
 
 
 
 
9f3c8c1
5360228
a522797
 
 
 
 
 
 
5360228
 
 
 
 
 
 
 
 
 
a522797
 
5360228
 
 
 
 
 
 
 
 
9f3c8c1
 
 
 
 
 
a522797
 
5360228
 
 
 
9f3c8c1
 
 
a522797
9f3c8c1
 
a522797
 
5360228
 
 
 
 
 
 
 
 
 
 
9f3c8c1
 
5360228
a522797
5360228
 
 
 
a522797
9f3c8c1
 
 
 
 
5360228
9f3c8c1
 
5360228
9f3c8c1
 
5360228
 
 
 
 
 
 
 
 
 
9f3c8c1
 
5360228
 
 
 
9f3c8c1
 
 
 
 
 
 
 
 
 
 
 
a522797
9f3c8c1
5360228
 
 
 
9f3c8c1
a522797
5360228
9f3c8c1
5360228
 
 
 
9f3c8c1
 
 
 
 
 
 
 
 
 
 
a522797
9f3c8c1
 
 
 
 
5360228
9f3c8c1
 
a522797
9f3c8c1
 
 
 
 
a522797
9f3c8c1
 
 
 
5360228
9f3c8c1
 
5360228
9f3c8c1
5360228
9f3c8c1
5360228
9f3c8c1
 
 
 
 
 
 
 
 
 
 
 
a522797
 
 
9f3c8c1
a522797
 
9f3c8c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5360228
9f3c8c1
 
 
 
a522797
 
 
 
 
9f3c8c1
 
a522797
 
9f3c8c1
a522797
9f3c8c1
 
 
 
 
 
 
a522797
 
9f3c8c1
5360228
 
 
a522797
9f3c8c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python3
"""

FIXED Curriculum Analyzer - Production Version



Synchronized with optimizer logic:

1.  Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW.

2.  Removes IS, EECE, STAT, and other irrelevant subjects.

3.  ADDS exceptions for undergrad-accessible 5000-level courses (CS5700, CY5700, DS5110, CS5010).

4.  FIXES bad prerequisite data (e.g., removes the spurious CS2500 -> CS2800 edge).

"""
import pickle
import argparse
import networkx as nx
import re
from typing import Set, Dict
from collections import defaultdict

def get_course_level(cid):
    """Return the first run of digits in a course ID as an integer.

    IDs that contain no digits at all yield the sentinel 9999, which callers
    comparing against a level threshold will see as a very high level.
    """
    digits = re.search(r'\d+', cid)
    if not digits:
        return 9999
    return int(digits.group(0))

class CurriculumAnalyzer:
    """Clean, repair, and enrich a raw curriculum prerequisite graph.

    The constructor unpickles a graph and a course mapping and merges the
    course data into the graph's node attributes. Each remaining public
    method is one pipeline step that mutates ``self.graph`` in place:
    filtering, adding missing prerequisite edges, removing known-bad edges,
    scoring complexity, validating coverage, and saving.

    Attributes:
        graph: Object unpickled from ``graph_path``. Presumably a
            ``networkx.DiGraph`` (the code calls ``in_degree`` /
            ``out_degree`` on it) -- confirm against the scraper output.
        courses: Mapping unpickled from ``courses_path``; iterated as
            ``course_id -> course_data`` where ``course_data`` is a mapping.
    """

    # --- FIX 1: DEFINE LISTS THAT MATCH THE OPTIMIZER ---

    # Subjects the optimizer is programmed to understand.
    # ENGW/PHYS are needed only for hardcoded Year 1.
    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}

    # 5000-level courses the optimizer explicitly allows.
    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}

    # Whole-word match for names that denote a non-lecture component.
    # A plain substring test for 'lab' also hits "Scalable" and
    # "Collaborative", which would silently drop real lecture courses.
    _SKIP_NAME_RE = re.compile(
        r'\b(?:labs?|laborator(?:y|ies)|recitations?|seminars?'
        r'|practicums?|co-ops?)\b'
    )

    def __init__(self, graph_path, courses_path):
        """Load the raw graph and course data, then merge them.

        Args:
            graph_path: Path to the pickled curriculum graph.
            courses_path: Path to the pickled ``course_id -> data`` mapping.

        Raises:
            SystemExit: With exit code 1 if either file cannot be loaded.
        """
        print("๐Ÿ“š Loading raw curriculum data...")
        try:
            # SECURITY NOTE: pickle.load can execute arbitrary code embedded
            # in the file. Only point this tool at pickles you produced.
            with open(graph_path, 'rb') as f:
                self.graph = pickle.load(f)
            with open(courses_path, 'rb') as f:
                self.courses = pickle.load(f)
        except Exception as e:
            print(f"โŒ ERROR: Could not load files. {e}")
            # Raise SystemExit directly: the `exit` builtin is injected by
            # the `site` module and is not guaranteed to exist.
            raise SystemExit(1) from e

        # Merge course data into graph nodes; courses that have no node in
        # the graph are ignored.
        for course_id, course_data in self.courses.items():
            if self.graph.has_node(course_id):
                self.graph.nodes[course_id].update(course_data)

        print(f"โœ… Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")

    @classmethod
    def _is_lab_or_seminar(cls, name):
        """Return True if ``name`` marks a lab, recitation, seminar, practicum, or co-op.

        ``None`` or an empty name is treated as "not a lab" instead of raising.
        """
        return bool(cls._SKIP_NAME_RE.search(str(name or '').lower()))

    @staticmethod
    def _report_path_for(output_path):
        """Return the text-report path that belongs to ``output_path``.

        A trailing ``.pkl`` is swapped for ``_report.txt``. Any other name
        gets ``_report.txt`` appended, so the report can never be written
        over the graph file itself.
        """
        path = str(output_path)
        suffix = '.pkl'
        stem = path[:-len(suffix)] if path.endswith(suffix) else path
        return stem + '_report.txt'

    def pre_filter_graph(self):
        """Remove every node the optimizer should not see.

        A node is removed when any of the following holds:

        * its ``subject`` attribute is not in ``KEEP_SUBJECTS``;
        * its ``name`` marks it as a lab / recitation / seminar /
          practicum / co-op;
        * its numeric level (via ``get_course_level``) is 5000 or above and
          it is not listed in ``UNDERGRAD_ACCESSIBLE_GRAD``. IDs without
          digits get level 9999 and are therefore removed as well.
        """
        print("\n๐Ÿงน Pre-filtering graph...")

        nodes_to_remove = set()
        for node, data in self.graph.nodes(data=True):
            is_irrelevant_subject = data.get('subject', '') not in self.KEEP_SUBJECTS
            is_lab_or_seminar = self._is_lab_or_seminar(data.get('name'))
            is_disallowed_grad = (
                get_course_level(node) >= 5000
                and node not in self.UNDERGRAD_ACCESSIBLE_GRAD
            )

            if is_irrelevant_subject or is_lab_or_seminar or is_disallowed_grad:
                nodes_to_remove.add(node)

        original_count = self.graph.number_of_nodes()
        self.graph.remove_nodes_from(nodes_to_remove)

        print(f"โœ… Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
        print(f"   Original nodes: {original_count}")
        print(f"   Remaining nodes: {self.graph.number_of_nodes()}")

    def fix_chains(self):
        """Add critical prerequisite edges that are missing from the graph.

        An edge is only added when both endpoints exist as nodes; nothing is
        created for courses that were filtered out or never scraped.
        """
        print("\n๐Ÿ”— Validating and fixing critical prerequisite chains...")

        # (prerequisite, course, description). A tuple rather than a set so
        # the log output has the same order on every run.
        critical_chains = (
            # Foundations
            ("CS1800", "CS2800", "Discrete โ†’ Logic"),
            ("CS2500", "CS2510", "Fundies 1 โ†’ Fundies 2"),
            # Core CS
            ("CS2510", "CS3500", "Fundies 2 โ†’ OOD"),
            ("CS2510", "CS3000", "Fundies 2 โ†’ Algorithms"),
            ("CS2800", "CS3000", "Logic โ†’ Algorithms"),
            # Explicit fix: Systems depends on Algorithms
            ("CS3000", "CS3650", "Algorithms -> Systems"),
            # Core AI/ML
            ("CS3000", "CS4100", "Algorithms โ†’ AI"),
            ("CS3500", "CS4100", "OOD โ†’ AI"),
            # Core DS Path
            ("DS2000", "DS2500", "Prog w/ Data โ†’ Intermediate"),
            ("DS2500", "DS3500", "Intermediate โ†’ Advanced"),
            ("DS3500", "DS4400", "Advanced โ†’ ML1"),
            ("CS3500", "DS4400", "OOD โ†’ ML1"),
            # Math
            ("MATH1341", "MATH1342", "Calc 1 โ†’ Calc 2"),
        )

        added = 0
        for prereq, course, desc in critical_chains:
            if not (self.graph.has_node(prereq) and self.graph.has_node(course)):
                continue
            if self.graph.has_edge(prereq, course):
                continue
            self.graph.add_edge(prereq, course)
            print(f"  ๐Ÿ”ง FIXED: Added {prereq} โ†’ {course} ({desc})")
            added += 1

        if added == 0:
            print("  โœ… All critical chains present")

    def remove_spurious_chains(self):
        """Remove prerequisite edges that are known to be wrong in the scraped data."""
        print("\n๐Ÿ—‘๏ธ Removing spurious prerequisite chains...")

        # Based on inspect_graph output and catalog knowledge.
        # A tuple rather than a set so the log order is stable.
        spurious_chains = (
            ("CS2500", "CS2800"),    # Fundies 1 is NOT a prereq for Logic
            ("MATH1365", "CS2800"),  # Not a real prereq
            ("EECE2160", "CS3000"),  # Irrelevant prereq
            ("EECE2560", "CS3500"),  # Irrelevant prereq
        )

        removed = 0
        for prereq, course in spurious_chains:
            if self.graph.has_edge(prereq, course):
                self.graph.remove_edge(prereq, course)
                print(f"  โœ… REMOVED: {prereq} โ†’ {course}")
                removed += 1

        if removed == 0:
            print("  โœ… No spurious chains found")

    def calculate_and_add_complexity(self):
        """Store a ``complexity`` attribute on every node.

        The score is ``in_degree * 10 + out_degree * 5``: prerequisites weigh
        twice as much as courses unlocked. Degrees are read from the graph
        in its current state, so run this after the cleaning steps.
        """
        print("\n๐Ÿงฎ Calculating complexity scores...")

        scores = {
            node: (self.graph.in_degree(node) * 10) + (self.graph.out_degree(node) * 5)
            for node in self.graph.nodes()
        }
        # One call for the whole graph instead of one call per node.
        nx.set_node_attributes(self.graph, scores, 'complexity')

        print("โœ… Complexity scores calculated")

    def validate_critical_courses(self) -> Dict[str, Set[str]]:
        """Report which required courses are absent from the graph.

        Returns:
            Mapping of category name to the set of missing course IDs.
            Categories with nothing missing are omitted, so an empty dict
            means full coverage.
        """
        print("\n๐ŸŽฏ Validating critical course coverage...")

        # This list MUST match the optimizer's requirements
        required_courses = {
            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},  # Added CS5700
            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
        }

        # Snapshot the node set once rather than rebuilding it per category.
        present = set(self.graph.nodes())

        missing = {}
        for category, courses in required_courses.items():
            missing_in_cat = courses - present
            if missing_in_cat:
                missing[category] = missing_in_cat
                print(f"  โš ๏ธ  {category}: Missing {missing_in_cat}")
            else:
                print(f"  โœ… {category}: All courses present")

        return missing

    def save_enriched_graph(self, output_path):
        """Pickle the cleaned graph and write a plain-text summary next to it.

        Args:
            output_path: Destination for the pickled graph. The report path
                is derived from it by ``_report_path_for``.
        """
        print(f"\n๐Ÿ’พ Saving cleaned graph to {output_path}...")
        with open(output_path, 'wb') as f:
            pickle.dump(self.graph, f)
        print("โœ… Graph saved")

        # Save a summary report. The path is derived so that it always
        # differs from output_path, even when that has no '.pkl' suffix.
        report_path = self._report_path_for(output_path)

        subject_counts = defaultdict(int)
        for node in self.graph.nodes():
            subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
            subject_counts[subject] += 1

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("Curriculum Graph Analysis Report\n")
            f.write("=" * 70 + "\n\n")
            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")

            f.write("Subject breakdown (Filtered):\n")
            for subject in sorted(subject_counts):
                f.write(f"  {subject}: {subject_counts[subject]}\n")

        print(f"โœ… Report saved to {report_path}")

def main(args):
    """Run the cleaning pipeline end to end and save the result.

    Args:
        args: Parsed command-line namespace; its ``graph``, ``courses`` and
            ``output_graph`` attributes are read.
    """
    pipeline = CurriculumAnalyzer(args.graph, args.courses)

    # Steps run in this exact order; complexity comes last among the
    # mutating steps because it is derived from the node degrees.
    for step in (
        pipeline.pre_filter_graph,
        pipeline.fix_chains,
        pipeline.remove_spurious_chains,
        pipeline.calculate_and_add_complexity,
    ):
        step()

    # A non-empty result means at least one required course is absent.
    if pipeline.validate_critical_courses():
        warning_lines = (
            "\nโš ๏ธ  WARNING: Some critical courses are missing!",
            "   Consider re-scraping with additional terms or subjects.",
            "   Missing courses will be excluded from planning.",
        )
        print("\n".join(warning_lines))

    pipeline.save_enriched_graph(args.output_graph)
    print("\nโœจ Analysis complete!")

if __name__ == "__main__":
    # Command-line entry point: two mandatory raw inputs, one optional output.
    arg_parser = argparse.ArgumentParser(description="NEU Curriculum Analyzer - Cleans and validates data")
    required_inputs = (
        ('--graph', "Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)"),
        ('--courses', "Path to RAW courses data (e.g., neu_merged_courses_...pkl)"),
    )
    for flag, help_text in required_inputs:
        arg_parser.add_argument(flag, required=True, help=help_text)
    arg_parser.add_argument(
        '--output-graph',
        default='neu_graph_clean.pkl',
        help="Output path for the new, clean graph",
    )
    args = arg_parser.parse_args()
    main(args)