ckharche commited on
Commit
9f3c8c1
·
verified ·
1 Parent(s): 722bc5e

Upload 6 files

Browse files
src/agentic_optimizer.py CHANGED
@@ -1,413 +1,442 @@
1
  """
2
- Agentic Curriculum Optimizer
3
- Runs 100% locally, no API costs
4
- """
5
 
 
 
 
 
 
6
  import json
7
- import sqlite3
 
8
  import networkx as nx
9
- import numpy as np
10
- from dataclasses import dataclass, asdict
11
- from typing import Dict, List, Tuple, Optional
12
  from datetime import datetime
13
- import pickle
14
  import torch
15
- from sentence_transformers import SentenceTransformer
16
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
17
- import schedule
18
- import time
19
-
20
- @dataclass
21
- class StudentProfile:
22
- student_id: str
23
- completed_courses: List[str]
24
- current_gpa: float
25
- interests: List[str]
26
- career_goals: str
27
- learning_style: str
28
- time_commitment: int
29
- preferred_difficulty: str
30
 
31
  @dataclass
32
- class PlanFeedback:
33
- student_id: str
34
- plan_id: str
35
- timestamp: datetime
36
- actual_gpa: float
37
- difficulty_rating: int # 1-5
38
- satisfaction: int # 1-5
39
- completed_courses: List[str]
40
- dropped_courses: List[str]
41
 
42
- class CurriculumAgent:
43
  """
44
- Autonomous agent that:
45
- 1. Monitors student progress
46
- 2. Adapts recommendations based on feedback
47
- 3. Proactively suggests adjustments
48
- 4. Learns from outcomes
49
  """
50
 
51
- def __init__(self, db_path="curriculum_agent.db"):
52
- self.db_path = db_path
53
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
54
-
55
- # Models (local, no API)
56
- self.embedder = SentenceTransformer('all-MiniLM-L6-v2') # Smaller for local
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  self.graph = None
58
  self.courses = {}
 
59
 
60
- # Initialize database for memory
61
- self._init_database()
62
-
63
- # Agent state
64
- self.active_plans = {}
65
- self.feedback_history = []
66
-
67
- def _init_database(self):
68
- """Create tables for agent memory"""
69
- conn = sqlite3.connect(self.db_path)
70
- c = conn.cursor()
71
-
72
- # Student profiles
73
- c.execute('''CREATE TABLE IF NOT EXISTS students
74
- (id TEXT PRIMARY KEY,
75
- profile TEXT,
76
- created_at TIMESTAMP)''')
77
-
78
- # Generated plans
79
- c.execute('''CREATE TABLE IF NOT EXISTS plans
80
- (id TEXT PRIMARY KEY,
81
- student_id TEXT,
82
- plan_data TEXT,
83
- created_at TIMESTAMP,
84
- performance_score REAL)''')
85
-
86
- # Feedback for learning
87
- c.execute('''CREATE TABLE IF NOT EXISTS feedback
88
- (id INTEGER PRIMARY KEY AUTOINCREMENT,
89
- plan_id TEXT,
90
- student_id TEXT,
91
- feedback_data TEXT,
92
- timestamp TIMESTAMP)''')
93
-
94
- # Agent learning patterns
95
- c.execute('''CREATE TABLE IF NOT EXISTS patterns
96
- (id INTEGER PRIMARY KEY AUTOINCREMENT,
97
- pattern_type TEXT,
98
- pattern_data TEXT,
99
- success_rate REAL,
100
- discovered_at TIMESTAMP)''')
101
-
102
- conn.commit()
103
- conn.close()
104
 
105
- def perceive(self) -> Dict:
106
- """
107
- PERCEPTION: Gather information about environment
108
- """
109
- perceptions = {
110
- "active_students": self._get_active_students(),
111
- "recent_feedback": self._get_recent_feedback(),
112
- "course_updates": self._check_course_updates(),
113
- "success_patterns": self._analyze_success_patterns()
114
- }
115
- return perceptions
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- def decide(self, perceptions: Dict) -> List[Dict]:
118
- """
119
- DECISION: Determine what actions to take
120
- """
121
- decisions = []
122
-
123
- # Decision 1: Which students need plan updates?
124
- for student_id in perceptions["active_students"]:
125
- if self._needs_plan_update(student_id, perceptions):
126
- decisions.append({
127
- "action": "update_plan",
128
- "student_id": student_id,
129
- "reason": "Poor performance feedback"
130
- })
131
-
132
- # Decision 2: Identify at-risk students
133
- at_risk = self._identify_at_risk_students(perceptions["recent_feedback"])
134
- for student_id in at_risk:
135
- decisions.append({
136
- "action": "intervention",
137
- "student_id": student_id,
138
- "reason": "Risk of dropping out"
139
- })
140
-
141
- # Decision 3: Optimize based on patterns
142
- if perceptions["success_patterns"]:
143
- decisions.append({
144
- "action": "update_algorithm",
145
- "patterns": perceptions["success_patterns"]
146
- })
147
-
148
- return decisions
149
 
150
- def act(self, decisions: List[Dict]) -> List[Dict]:
151
- """
152
- ACTION: Execute decisions
153
- """
154
- results = []
155
-
156
- for decision in decisions:
157
- if decision["action"] == "update_plan":
158
- new_plan = self._regenerate_plan(decision["student_id"])
159
- results.append({
160
- "action": "plan_updated",
161
- "student_id": decision["student_id"],
162
- "plan": new_plan
163
- })
164
-
165
- elif decision["action"] == "intervention":
166
- intervention = self._create_intervention(decision["student_id"])
167
- results.append({
168
- "action": "intervention_created",
169
- "student_id": decision["student_id"],
170
- "intervention": intervention
171
- })
172
 
173
- elif decision["action"] == "update_algorithm":
174
- self._update_planning_algorithm(decision["patterns"])
175
- results.append({
176
- "action": "algorithm_updated",
177
- "patterns_applied": len(decision["patterns"])
178
- })
179
-
180
- return results
 
 
 
 
 
 
 
 
 
181
 
182
- def learn(self, results: List[Dict]):
183
- """
184
- LEARNING: Update knowledge based on outcomes
185
- """
186
- conn = sqlite3.connect(self.db_path)
187
- c = conn.cursor()
188
-
189
- for result in results:
190
- if result["action"] == "plan_updated":
191
- # Track plan performance
192
- self._track_plan_performance(result["student_id"], result["plan"])
193
 
194
- elif result["action"] == "intervention_created":
195
- # Monitor intervention effectiveness
196
- self._monitor_intervention(result["student_id"], result["intervention"])
197
-
198
- # Discover new patterns
199
- patterns = self._discover_patterns()
200
- for pattern in patterns:
201
- c.execute("INSERT INTO patterns (pattern_type, pattern_data, success_rate, discovered_at) VALUES (?, ?, ?, ?)",
202
- (pattern["type"], json.dumps(pattern["data"]), pattern["success_rate"], datetime.now()))
203
-
204
- conn.commit()
205
- conn.close()
206
-
207
- def run_autonomous_cycle(self):
208
- """
209
- Main agent loop - runs continuously
210
- """
211
- while True:
212
- print(f"\n[{datetime.now()}] Agent Cycle Starting...")
213
-
214
- # 1. PERCEIVE
215
- perceptions = self.perceive()
216
- print(f"Perceptions: {len(perceptions['active_students'])} active students")
217
-
218
- # 2. DECIDE
219
- decisions = self.decide(perceptions)
220
- print(f"Decisions: {len(decisions)} actions to take")
221
-
222
- # 3. ACT
223
- results = self.act(decisions)
224
- print(f"Results: {len(results)} actions completed")
225
-
226
- # 4. LEARN
227
- self.learn(results)
228
- print("Learning cycle complete")
229
-
230
- # Wait before next cycle (in production, this could be daily)
231
- time.sleep(60) # Run every minute for demo
232
-
233
- # --- Helper Methods ---
234
-
235
- def _get_active_students(self) -> List[str]:
236
- """Get list of active students"""
237
- conn = sqlite3.connect(self.db_path)
238
- c = conn.cursor()
239
- c.execute("SELECT id FROM students")
240
- students = [row[0] for row in c.fetchall()]
241
- conn.close()
242
- return students
243
-
244
- def _get_recent_feedback(self) -> List[Dict]:
245
- """Get recent feedback"""
246
- conn = sqlite3.connect(self.db_path)
247
- c = conn.cursor()
248
- c.execute("SELECT feedback_data FROM feedback ORDER BY timestamp DESC LIMIT 10")
249
- feedback = [json.loads(row[0]) for row in c.fetchall()]
250
- conn.close()
251
- return feedback
252
-
253
- def _check_course_updates(self) -> Dict:
254
- """Check for course changes (mock for demo)"""
255
- return {"updated_courses": [], "new_prerequisites": {}}
256
-
257
- def _analyze_success_patterns(self) -> List[Dict]:
258
- """Identify successful patterns"""
259
- conn = sqlite3.connect(self.db_path)
260
- c = conn.cursor()
261
- c.execute("SELECT pattern_data, success_rate FROM patterns WHERE success_rate > 0.7")
262
- patterns = [{"data": json.loads(row[0]), "success_rate": row[1]} for row in c.fetchall()]
263
- conn.close()
264
- return patterns
265
-
266
- def _needs_plan_update(self, student_id: str, perceptions: Dict) -> bool:
267
- """Determine if student needs plan update"""
268
- # Check if recent feedback shows issues
269
- for feedback in perceptions["recent_feedback"]:
270
- if feedback.get("student_id") == student_id:
271
- if feedback.get("satisfaction", 5) < 3:
272
- return True
273
- return False
274
-
275
- def _identify_at_risk_students(self, feedback: List[Dict]) -> List[str]:
276
- """Identify students at risk"""
277
- at_risk = []
278
- for fb in feedback:
279
- if fb.get("difficulty_rating", 0) > 4 or fb.get("dropped_courses", []):
280
- at_risk.append(fb.get("student_id"))
281
- return at_risk
282
-
283
- def _regenerate_plan(self, student_id: str) -> Dict:
284
- """Generate new plan for student"""
285
- # This would use your existing optimizer
286
- return {"plan": "new_optimized_plan", "adjustments": ["reduced_difficulty"]}
287
 
288
- def _create_intervention(self, student_id: str) -> Dict:
289
- """Create intervention plan"""
290
- return {
291
- "type": "academic_support",
292
- "recommendations": ["tutoring", "reduced_courseload", "advisor_meeting"]
 
 
 
293
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
- def _update_planning_algorithm(self, patterns: List[Dict]):
296
- """Update planning based on learned patterns"""
297
- # This would adjust your optimizer's weights/rules
298
- print(f"Updating algorithm with {len(patterns)} patterns")
299
-
300
- def _track_plan_performance(self, student_id: str, plan: Dict):
301
- """Track how well plans perform"""
302
- conn = sqlite3.connect(self.db_path)
303
- c = conn.cursor()
304
- c.execute("UPDATE plans SET performance_score = ? WHERE student_id = ?",
305
- (0.0, student_id)) # Would calculate actual score
306
- conn.commit()
307
- conn.close()
308
-
309
- def _monitor_intervention(self, student_id: str, intervention: Dict):
310
- """Monitor intervention effectiveness"""
311
- print(f"Monitoring intervention for {student_id}")
312
-
313
- def _discover_patterns(self) -> List[Dict]:
314
- """Discover new patterns from data"""
315
- # Example: Find that students who take CS2500 before CS2510 do better
316
- patterns = []
317
-
318
- # Analyze database for patterns
319
- conn = sqlite3.connect(self.db_path)
320
- c = conn.cursor()
321
-
322
- # Example pattern discovery
323
- c.execute("""
324
- SELECT COUNT(*) FROM feedback
325
- WHERE feedback_data LIKE '%CS2500%CS2510%'
326
- AND json_extract(feedback_data, '$.satisfaction') > 4
327
- """)
328
-
329
- result = c.fetchone()
330
- if result and result[0] > 5: # If pattern appears frequently
331
- patterns.append({
332
- "type": "course_sequence",
333
- "data": {"sequence": ["CS2500", "CS2510"]},
334
- "success_rate": 0.85
335
- })
336
-
337
- conn.close()
338
- return patterns
339
 
 
 
340
 
341
- class LocalAgentRunner:
342
- """
343
- Manages the agent without external dependencies
344
- """
345
-
346
- def __init__(self, curriculum_data_path: str):
347
- self.agent = CurriculumAgent()
 
 
 
 
348
 
349
- # Load curriculum data
350
- with open(curriculum_data_path, 'rb') as f:
351
- graph = pickle.load(f)
352
- self.agent.graph = graph
353
- self.agent.courses = dict(graph.nodes(data=True))
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
- def add_student(self, profile: StudentProfile) -> str:
356
- """Add a student to track"""
357
- conn = sqlite3.connect(self.agent.db_path)
358
- c = conn.cursor()
359
 
360
- student_id = f"STU_{datetime.now().timestamp()}"
361
- c.execute("INSERT INTO students (id, profile, created_at) VALUES (?, ?, ?)",
362
- (student_id, json.dumps(asdict(profile)), datetime.now()))
 
 
363
 
364
- conn.commit()
365
- conn.close()
366
 
367
- return student_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
- def submit_feedback(self, feedback: PlanFeedback):
370
- """Submit feedback for learning"""
371
- conn = sqlite3.connect(self.agent.db_path)
372
- c = conn.cursor()
373
 
374
- c.execute("INSERT INTO feedback (plan_id, student_id, feedback_data, timestamp) VALUES (?, ?, ?, ?)",
375
- (feedback.plan_id, feedback.student_id, json.dumps(asdict(feedback)), feedback.timestamp))
 
 
 
 
 
 
 
376
 
377
- conn.commit()
378
- conn.close()
 
 
 
 
 
 
 
 
 
379
 
380
- def start_agent(self):
381
- """Start the autonomous agent"""
382
- print("Starting Curriculum Agent...")
383
- print("Agent will monitor students and adapt plans automatically")
384
- print("Press Ctrl+C to stop")
385
 
386
- try:
387
- self.agent.run_autonomous_cycle()
388
- except KeyboardInterrupt:
389
- print("\nAgent stopped")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
 
392
- # Example usage
393
- if __name__ == "__main__":
394
- # Initialize agent
395
- runner = LocalAgentRunner("neu_graph_analyzed_clean.pkl")
 
 
 
396
 
397
- # Add a test student
398
- student = StudentProfile(
399
- student_id="test_001",
400
- completed_courses=["CS1800", "CS2500"],
401
- current_gpa=3.5,
402
- interests=["AI", "Machine Learning"],
403
- career_goals="ML Engineer",
404
- learning_style="Visual",
405
- time_commitment=40,
406
- preferred_difficulty="moderate"
407
- )
408
 
409
- student_id = runner.add_student(student)
410
- print(f"Added student: {student_id}")
 
 
411
 
412
- # Start autonomous agent
413
- runner.start_agent()
 
 
 
 
 
 
 
1
  """
2
+ Agentic Curriculum Optimizer - Autonomous Graph Validator & Fixer
3
+ Detects missing courses, suggests replacements, and directly patches the graph.
 
4
 
5
+ Usage:
6
+ python agentic_optimizer.py --graph neu_graph_clean6.pkl --validate
7
+ python agentic_optimizer.py --graph neu_graph_clean6.pkl --fix --output neu_graph_fixed.pkl
8
+ """
9
+ import pickle
10
  import json
11
+ import re
12
+ import argparse
13
  import networkx as nx
14
+ from typing import Dict, Set, List, Tuple, Optional
 
 
15
  from datetime import datetime
16
+ from dataclasses import dataclass, asdict
17
  import torch
 
18
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
@dataclass
class CourseChange:
    """Detected change in course catalog.

    Attributes:
        old_code: Course code referenced by CONCENTRATION_REQUIREMENTS.
        new_code: Replacement code once one is found (None until then).
        status: Lifecycle of the change — "missing", "renamed", "moved",
            "deprecated", or "manual_add" (will be patched into the graph).
        replacement_suggestion: Free-form suggested replacement, if any.
        confidence: Confidence in the suggested replacement, 0.0–1.0.
        evidence: Human-readable justification for the suggestion.
    """
    old_code: str
    # FIXED: fields that default to None are Optional, not plain str.
    new_code: Optional[str] = None
    status: str = "missing"  # missing, renamed, moved, deprecated
    replacement_suggestion: Optional[str] = None
    confidence: float = 0.0
    evidence: str = ""
 
29
 
30
class AgenticOptimizer:
    """
    Autonomous agent that validates requirements AND fixes graph automatically
    """

    # Degree requirements, kept in sync with curriculum_optimizer.py.
    CONCENTRATION_REQUIREMENTS = {
        "ai_ml": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"],
            },
            "core": {
                "required": ["CS3000", "CS3500"],
                "pick_1_from": ["CS3200", "CS3650", "CS5700"]  # FIXED: CS3700 → CS5700
            },
            "concentration_specific": {
                "required": ["CS4100", "DS4400"],
                "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
                "pick_1_systems": ["CS4730", "CS4700"]  # REMOVED: CS4750 (doesn't exist)
            },
            "math": {
                "required": ["MATH1341", "MATH1342"],
                "pick_1_from": ["MATH2331", "MATH3081"]  # REMOVED: STAT3150
            }
        },
        "systems": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
            },
            "core": {
                "required": ["CS3000", "CS3500", "CS3650"],
                "pick_1_from": ["CS5700", "CS3200"]  # FIXED: CS3700 → CS5700
            },
            "concentration_specific": {
                "required": ["CS4700"],
                "pick_2_from": ["CS4730"],  # REMOVED: CS4750, CS4770
                "pick_1_from": ["CS4400", "CS4500", "CS4520"]
            },
            "math": {
                "required": ["MATH1341", "MATH1342"]
            }
        },
        "security": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
            },
            "core": {
                "required": ["CS3000", "CS3650", "CY2550"],
                "pick_1_from": ["CS5700", "CS3500"]  # FIXED: CS3700 → CS5700
            },
            "concentration_specific": {
                "required": ["CY3740"],
                "pick_2_from": ["CY4740", "CY4760", "CY4770"],  # CY4770 (moved from CS)
                "pick_1_from": ["CS4700", "CS4730"]
            },
            "math": {
                "required": ["MATH1342"],
                "pick_1_from": ["MATH3527", "MATH3081"]
            }
        }
    }

    # Hand-maintained records for courses the scraper never returns;
    # fix_graph() patches these straight into the graph.
    MANUAL_COURSES = {
        "CS5700": {
            "name": "Fundamentals of Networks",
            "subject": "CS",
            "classId": "5700",
            "description": "Networks and distributed systems (grad level, no prereqs)",
            "minCredits": 4,
            "maxCredits": 4,
            "prerequisites": []  # Open to undergrads
        },
        "CY4770": {
            "name": "Foundations of Cryptography",
            "subject": "CY",
            "classId": "4770",
            "description": "Mathematical cryptography (moved from CS dept)",
            "minCredits": 4,
            "maxCredits": 4,
            "prerequisites": ["CS3000"]  # Simplified prereq
        }
    }
113
+
114
+ def __init__(self, graph_path: str, use_llm: bool = True):
115
+ self.graph_path = graph_path
116
+ self.use_llm = use_llm
117
  self.graph = None
118
  self.courses = {}
119
+ self.changes = []
120
 
121
+ # Load LLM if needed
122
+ self.llm = None
123
+ self.tokenizer = None
124
+ if use_llm:
125
+ self._load_llm()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
+ def _load_llm(self):
128
+ """Load local LLM for intelligent validation"""
129
+ print("🤖 Loading LLM for catalog analysis...")
130
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
131
+
132
+ if device.type == 'cuda':
133
+ model_name = "meta-llama/Llama-3.1-8B-Instruct"
134
+ quant_config = BitsAndBytesConfig(
135
+ load_in_4bit=True,
136
+ bnb_4bit_quant_type="nf4",
137
+ bnb_4bit_compute_dtype=torch.bfloat16
138
+ )
139
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
140
+ self.tokenizer.pad_token = self.tokenizer.eos_token
141
+ self.llm = AutoModelForCausalLM.from_pretrained(
142
+ model_name,
143
+ quantization_config=quant_config,
144
+ device_map="auto"
145
+ )
146
+ print("✅ LLM loaded")
147
+ else:
148
+ print("⚠️ No GPU available, LLM disabled")
149
+ self.use_llm = False
150
 
151
+ def load_graph(self):
152
+ """Load curriculum graph"""
153
+ print(f"📚 Loading graph: {self.graph_path}")
154
+ with open(self.graph_path, 'rb') as f:
155
+ self.graph = pickle.load(f)
156
+ self.courses = dict(self.graph.nodes(data=True))
157
+ print(f"✅ Loaded {len(self.courses)} courses")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
+ def validate_requirements(self) -> Dict[str, List[CourseChange]]:
160
+ """Check which required courses are missing from graph"""
161
+ print("\n🔍 Validating CONCENTRATION_REQUIREMENTS against graph...")
162
+
163
+ track_changes = {}
164
+
165
+ for track, track_reqs in self.CONCENTRATION_REQUIREMENTS.items():
166
+ print(f"\n📋 Checking {track} track:")
167
+ track_changes[track] = []
168
+
169
+ for category, reqs in track_reqs.items():
170
+ if not isinstance(reqs, dict):
171
+ continue
 
 
 
 
 
 
 
 
 
172
 
173
+ for key, courses in reqs.items():
174
+ if not isinstance(courses, list):
175
+ continue
176
+
177
+ for course in courses:
178
+ if course not in self.courses:
179
+ change = CourseChange(
180
+ old_code=course,
181
+ status="missing",
182
+ evidence=f"Not found in scraped graph ({len(self.courses)} courses)"
183
+ )
184
+ track_changes[track].append(change)
185
+ print(f" ❌ {course} - MISSING")
186
+ else:
187
+ print(f" ✅ {course}")
188
+
189
+ return track_changes
190
 
191
+ def find_replacements(self, changes: Dict[str, List[CourseChange]]) -> Dict[str, List[CourseChange]]:
192
+ """Use pattern matching + LLM to suggest replacements"""
193
+ print("\n🤖 Analyzing missing courses...")
194
+
195
+ for track, track_changes in changes.items():
196
+ for change in track_changes:
197
+ if change.status != "missing":
198
+ continue
 
 
 
199
 
200
+ # Try pattern matching first (instant)
201
+ replacement = self._pattern_match_replacement(change.old_code)
202
+ if replacement:
203
+ change.new_code = replacement
204
+ change.status = "renamed"
205
+ change.confidence = 0.7
206
+ change.evidence = "Pattern matching"
207
+ print(f" 🔄 {change.old_code} {replacement} (pattern)")
208
+ continue
209
+
210
+ # Check manual course database
211
+ if change.old_code in self.MANUAL_COURSES:
212
+ change.new_code = change.old_code # Will be added to graph
213
+ change.status = "manual_add"
214
+ change.confidence = 1.0
215
+ change.evidence = "Manual course database"
216
+ print(f" ➕ {change.old_code} - Will be added manually")
217
+ continue
218
+
219
+ # Use LLM for ambiguous cases
220
+ if self.use_llm and self.llm:
221
+ replacement = self._llm_suggest_replacement(change.old_code, track)
222
+ if replacement:
223
+ change.new_code = replacement
224
+ change.status = "renamed"
225
+ change.confidence = 0.9
226
+ change.evidence = "LLM analysis"
227
+ print(f" 🔄 {change.old_code} → {replacement} (LLM)")
228
+ else:
229
+ print(f" ⚠️ {change.old_code} - No replacement found")
230
+
231
+ return changes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
+ def _pattern_match_replacement(self, course_code: str) -> Optional[str]:
234
+ """Fast pattern-based replacement detection"""
235
+
236
+ # Known replacements from manual verification
237
+ known_replacements = {
238
+ "CS3700": "CS5700",
239
+ "CS4770": "CY4770",
240
+ "STAT3150": "MATH3081",
241
  }
242
+
243
+ if course_code in known_replacements:
244
+ if known_replacements[course_code] in self.courses:
245
+ return known_replacements[course_code]
246
+
247
+ # Try subject swap (CS ↔ CY)
248
+ if course_code.startswith("CS"):
249
+ alt_code = "CY" + course_code[2:]
250
+ if alt_code in self.courses:
251
+ return alt_code
252
+ elif course_code.startswith("CY"):
253
+ alt_code = "CS" + course_code[2:]
254
+ if alt_code in self.courses:
255
+ return alt_code
256
+
257
+ # Try grad-level version (3XXX/4XXX → 5XXX)
258
+ match = re.match(r'([A-Z]+)(\d)(\d{3})', course_code)
259
+ if match:
260
+ subject, first_digit, rest = match.groups()
261
+ if first_digit in ['3', '4']:
262
+ grad_code = f"{subject}5{rest}"
263
+ if grad_code in self.courses:
264
+ return grad_code
265
+
266
+ return None
267
 
268
+ def _llm_suggest_replacement(self, missing_course: str, track: str) -> Optional[str]:
269
+ """Use LLM to intelligently suggest replacement"""
270
+
271
+ subject = re.match(r'([A-Z]+)', missing_course).group(1)
272
+ similar_courses = [
273
+ (cid, data.get('name', ''))
274
+ for cid, data in self.courses.items()
275
+ if cid.startswith(subject) and cid != missing_course
276
+ ][:10]
277
+
278
+ course_list = "\n".join([f"- {cid}: {name}" for cid, name in similar_courses])
279
+
280
+ prompt = f"""Course catalog expert analyzing NEU curriculum changes.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
+ **Missing:** {missing_course}
283
+ **Track:** {track}
284
 
285
+ **Available courses:**
286
+ {course_list}
287
+
288
+ Which course replaced {missing_course}? Return ONLY the code or "NONE".
289
+
290
+ Rules:
291
+ - Networks: CS3700 → CS5700
292
+ - Crypto: CS → CY dept
293
+ - STAT → MATH
294
+ - Game courses often don't exist
295
+ """
296
 
297
+ try:
298
+ inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.llm.device)
299
+ with torch.no_grad():
300
+ outputs = self.llm.generate(
301
+ **inputs,
302
+ max_new_tokens=50,
303
+ temperature=0.1,
304
+ do_sample=True,
305
+ pad_token_id=self.tokenizer.eos_token_id
306
+ )
307
+ response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True).strip()
308
+
309
+ match = re.search(r'([A-Z]{2,4}\d{4})', response)
310
+ if match:
311
+ suggested = match.group(1)
312
+ if suggested in self.courses:
313
+ return suggested
314
 
315
+ except Exception as e:
316
+ print(f" ⚠️ LLM error: {e}")
 
 
317
 
318
+ return None
319
+
320
+ def fix_graph(self, changes: Dict[str, List[CourseChange]]) -> int:
321
+ """Directly add missing courses to the graph"""
322
+ print("\n🔧 Fixing graph by adding missing courses...")
323
 
324
+ added_count = 0
 
325
 
326
+ for track, track_changes in changes.items():
327
+ for change in track_changes:
328
+ if change.status == "manual_add" and change.old_code in self.MANUAL_COURSES:
329
+ course_data = self.MANUAL_COURSES[change.old_code]
330
+ cid = change.old_code
331
+
332
+ # Add node
333
+ self.graph.add_node(cid, **course_data)
334
+ self.courses[cid] = course_data
335
+
336
+ # Add prerequisite edges
337
+ for prereq in course_data.get("prerequisites", []):
338
+ if prereq in self.graph:
339
+ self.graph.add_edge(prereq, cid, relationship="prerequisite")
340
+ else:
341
+ print(f" ⚠️ Prereq {prereq} for {cid} not in graph")
342
+
343
+ print(f" ✅ Added {cid}: {course_data['name']}")
344
+ added_count += 1
345
+
346
+ return added_count
347
 
348
+ def save_report(self, changes: Dict[str, List[CourseChange]], output_path: str = None):
349
+ """Save validation report"""
350
+ if not output_path:
351
+ output_path = f"catalog_validation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
352
 
353
+ report = {
354
+ "timestamp": datetime.now().isoformat(),
355
+ "graph_file": self.graph_path,
356
+ "total_courses_in_graph": len(self.courses),
357
+ "changes": {
358
+ track: [asdict(c) for c in track_changes]
359
+ for track, track_changes in changes.items()
360
+ }
361
+ }
362
 
363
+ with open(output_path, 'w') as f:
364
+ json.dump(report, f, indent=2)
365
+
366
+ print(f"\n💾 Report saved: {output_path}")
367
+
368
+ def save_graph(self, output_path: str):
369
+ """Save the fixed graph"""
370
+ with open(output_path, 'wb') as f:
371
+ pickle.dump(self.graph, f)
372
+ print(f"💾 Fixed graph saved: {output_path}")
373
+ print(f"📊 Final graph: {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")
374
 
375
+ def run(self, fix: bool = False, output: str = None):
376
+ """Main agent workflow"""
377
+ print("="*70)
378
+ print("AGENTIC OPTIMIZER - Autonomous Graph Validator & Fixer")
379
+ print("="*70)
380
 
381
+ # Step 1: Load data
382
+ self.load_graph()
383
+
384
+ # Step 2: Validate requirements
385
+ changes = self.validate_requirements()
386
+
387
+ # Count issues
388
+ total_missing = sum(len(c) for c in changes.values())
389
+ if total_missing == 0:
390
+ print("\n✅ All requirements valid! No changes needed.")
391
+ return
392
+
393
+ print(f"\n⚠️ Found {total_missing} missing courses across all tracks")
394
+
395
+ # Step 3: Find replacements
396
+ changes = self.find_replacements(changes)
397
+
398
+ # Step 4: Generate report
399
+ self.save_report(changes)
400
+
401
+ # Step 5: Fix graph if requested
402
+ if fix:
403
+ added = self.fix_graph(changes)
404
+
405
+ if added > 0:
406
+ print(f"\n✅ Added {added} courses to graph")
407
+
408
+ if output:
409
+ self.save_graph(output)
410
+ else:
411
+ # Default output name
412
+ default_output = self.graph_path.replace('.pkl', '_fixed.pkl')
413
+ self.save_graph(default_output)
414
+ else:
415
+ print("\n⚠️ No courses added (all issues are renamings, not missing)")
416
+
417
+ print("\n✨ Optimization complete!")
418
 
419
 
420
def main():
    """CLI entry point: parse arguments and run the optimizer."""
    parser = argparse.ArgumentParser(description="Agentic Optimizer - Auto-validate & fix curriculum graph")
    parser.add_argument('--graph', required=True, help="Path to curriculum graph .pkl")
    parser.add_argument('--validate', action='store_true', help="Only validate, don't fix")
    parser.add_argument('--fix', action='store_true', help="Fix graph by adding missing courses")
    parser.add_argument('--output', help="Output path for fixed graph")
    parser.add_argument('--no-llm', action='store_true', help="Disable LLM (use pattern matching only)")

    args = parser.parse_args()

    agent = AgenticOptimizer(
        graph_path=args.graph,
        use_llm=not args.no_llm
    )

    # FIXED: --validate was parsed but never consulted; per its help text it
    # now suppresses fixing even when --fix is also passed.
    agent.run(
        fix=args.fix and not args.validate,
        output=args.output
    )


if __name__ == "__main__":
    main()
src/curriculum_analyzer.py CHANGED
@@ -1,13 +1,11 @@
1
  """
2
- Curriculum Analyzer and Data Enrichment Tool (with Pre-filtering)
3
- Analyzes, CLEANS, and enriches scraped NEU curriculum data.
4
  """
5
  import pickle
6
- import json
7
  import argparse
8
  import networkx as nx
9
  import re
10
- from collections import defaultdict
11
 
12
  def get_course_level(cid):
13
  """Extracts the numerical part of a course ID for level checking."""
@@ -16,112 +14,177 @@ def get_course_level(cid):
16
 
17
  class CurriculumAnalyzer:
18
  def __init__(self, graph_path, courses_path):
19
- self.graph_path = graph_path
20
- self.courses_path = courses_path
21
- self.graph = None
22
- self.courses = None
23
- self.load_data()
24
-
25
- def load_data(self):
26
  print("📚 Loading raw curriculum data...")
27
- try:
28
- with open(self.graph_path, 'rb') as f:
29
- self.graph = pickle.load(f)
30
- with open(self.courses_path, 'rb') as f:
31
- self.courses = pickle.load(f)
32
-
33
- # Merge course metadata into the graph nodes
34
- for course_id, course_data in self.courses.items():
35
- if self.graph.has_node(course_id):
36
- self.graph.nodes[course_id].update(course_data)
37
-
38
- print(f"✅ Loaded raw data with {self.graph.number_of_nodes()} courses.")
39
- except FileNotFoundError as e:
40
- print(f"❌ Error: Data file not found. {e}")
41
- exit(1)
42
 
43
  def pre_filter_graph(self):
44
- """
45
- Permanently removes irrelevant courses from the graph.
46
- This is the most important step for creating logical plans.
47
- """
48
- print("\n🧹 Pre-filtering graph to remove irrelevant courses...")
49
-
50
- # Define what subjects are considered relevant for a tech-focused degree
51
- RELEVANT_SUBJECTS = {
52
- "CS", "DS", "CY",
53
- }
54
 
55
- nodes_to_remove = []
56
  for node, data in self.graph.nodes(data=True):
57
- subject = data.get('subject')
 
58
  level = get_course_level(node)
59
 
60
- # Mark for removal if subject is irrelevant OR it's a grad course (>= 5000)
61
- if subject not in RELEVANT_SUBJECTS or level >= 5000:
62
- nodes_to_remove.append(node)
63
-
 
 
 
 
 
64
  self.graph.remove_nodes_from(nodes_to_remove)
65
- print(f"✅ Graph filtered. Removed {len(nodes_to_remove)} irrelevant courses. Remaining: {self.graph.number_of_nodes()}")
 
66
 
67
- def calculate_and_add_complexity(self):
68
- """Calculates complexity scores for the remaining courses."""
69
- print("\n🧮 Calculating complexity scores for filtered graph...")
70
- if not self.graph.nodes():
71
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- foundation_courses = [n for n, d in self.graph.in_degree() if d == 0]
 
 
74
 
75
- complexity_scores = {}
76
- for node in self.graph.nodes():
77
- # Calculate depth (longest path from a foundation course)
78
- depth = 0
79
- if foundation_courses:
80
- paths = [nx.shortest_path_length(self.graph, source, node)
81
- for source in foundation_courses if nx.has_path(self.graph, source, node)]
82
- if paths:
83
- depth = max(paths) # Use max path for a better sense of progression
 
 
 
 
84
 
85
- in_deg = self.graph.in_degree(node)
86
- out_deg = self.graph.out_degree(node)
 
 
 
 
 
87
 
88
- # Formula: (prereqs * 10) + (unlocks * 5) + (depth * 3)
89
- score = (in_deg * 10) + (out_deg * 5) + (depth * 3)
90
- complexity_scores[node] = {
91
- 'complexity': score,
92
- 'depth': depth,
93
- 'prereq_count': in_deg,
94
- 'unlocks_count': out_deg
95
- }
96
 
97
- nx.set_node_attributes(self.graph, complexity_scores)
98
- print(" Complexity scores calculated and added.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  def save_enriched_graph(self, output_path):
101
  """Saves the final, clean, and enriched graph."""
102
- print(f"\n💾 Saving CLEAN and enriched graph to {output_path}...")
103
  with open(output_path, 'wb') as f:
104
  pickle.dump(self.graph, f)
105
- print("✅ Graph saved.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  def main(args):
108
  """Main execution flow."""
109
  analyzer = CurriculumAnalyzer(args.graph, args.courses)
110
-
111
- # Run the new cleaning step first!
112
  analyzer.pre_filter_graph()
113
-
 
114
  analyzer.calculate_and_add_complexity()
115
 
116
- analyzer.save_enriched_graph(args.output_graph)
117
 
118
- print("\n✨ Analysis and cleaning complete!")
119
- print(f"➡️ In the Streamlit app, upload the new clean file: '{args.output_graph}'")
 
 
 
 
 
120
 
121
  if __name__ == "__main__":
122
- parser = argparse.ArgumentParser(description="NEU Curriculum Analyzer and Data Enrichment Tool")
123
- parser.add_argument('--graph', required=True, help="Path to the RAW curriculum graph from the scraper.")
124
- parser.add_argument('--courses', required=True, help="Path to the RAW courses data from the scraper.")
125
- parser.add_argument('--output-graph', default='neu_graph_analyzed_clean.pkl', help="Path to save the new CLEANED and enriched graph.")
126
  args = parser.parse_args()
127
- main(args)
 
1
  """
2
+ Fixed Curriculum Analyzer - Better handling of incomplete data
 
3
  """
4
  import pickle
 
5
  import argparse
6
  import networkx as nx
7
  import re
8
+ from typing import Set, Dict
9
 
10
  def get_course_level(cid):
11
  """Extracts the numerical part of a course ID for level checking."""
 
14
 
15
  class CurriculumAnalyzer:
16
  def __init__(self, graph_path, courses_path):
 
 
 
 
 
 
 
17
  print("📚 Loading raw curriculum data...")
18
+ with open(graph_path, 'rb') as f:
19
+ self.graph = pickle.load(f)
20
+ with open(courses_path, 'rb') as f:
21
+ self.courses = pickle.load(f)
22
+
23
+ # Merge course data into graph nodes
24
+ for course_id, course_data in self.courses.items():
25
+ if self.graph.has_node(course_id):
26
+ self.graph.nodes[course_id].update(course_data)
27
+
28
+ print(f"✅ Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")
 
 
 
 
29
 
30
  def pre_filter_graph(self):
31
+ """Keeps only relevant subjects and removes labs/high-level courses."""
32
+ print("\n🧹 Pre-filtering graph...")
33
+
34
+ KEEP_SUBJECTS = {"CS", "DS", "IS", "CY", "MATH", "PHYS", "ENGW", "STAT", "EECE"}
 
 
 
 
 
 
35
 
36
+ nodes_to_remove = set()
37
  for node, data in self.graph.nodes(data=True):
38
+ subject = data.get('subject', '')
39
+ name = data.get('name', '').lower()
40
  level = get_course_level(node)
41
 
42
+ # Remove if:
43
+ # - Not in whitelist
44
+ # - Too advanced (5000+)
45
+ # - Lab/recitation/etc
46
+ if (subject not in KEEP_SUBJECTS or
47
+ level >= 5000 or
48
+ any(skip in name for skip in ['lab', 'recitation', 'seminar', 'practicum', 'co-op'])):
49
+ nodes_to_remove.add(node)
50
+
51
  self.graph.remove_nodes_from(nodes_to_remove)
52
+ print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses")
53
+ print(f" Remaining: {self.graph.number_of_nodes()} courses")
54
 
55
+ def fix_chains(self):
56
+ """Adds critical prerequisite chains that might be missing."""
57
+ print("\n🔗 Validating and fixing critical prerequisite chains...")
58
+
59
+ critical_chains = {
60
+ ("CS1800", "CS2800", "Discrete → Logic"),
61
+ ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
62
+ ("CS2510", "CS3500", "Fundies 2 → OOD"),
63
+ ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
64
+ ("CS3000", "CS4100", "Algorithms → AI"), # NEW
65
+ ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
66
+ ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
67
+ ("DS2500", "DS3500", "Intermediate → Advanced"),
68
+ ("DS3500", "DS4400", "Advanced → ML1"), # NEW
69
+ }
70
+
71
+ added = 0
72
+ for prereq, course, desc in critical_chains:
73
+ if self.graph.has_node(prereq) and self.graph.has_node(course):
74
+ if not self.graph.has_edge(prereq, course):
75
+ self.graph.add_edge(prereq, course)
76
+ print(f" 🔧 FIXED: Added {prereq} → {course} ({desc})")
77
+ added += 1
78
+
79
+ if added == 0:
80
+ print(" ✅ All critical chains present")
81
 
82
+ def remove_spurious_chains(self):
83
+ """Removes known incorrect prerequisite edges."""
84
+ print("\n🗑️ Removing spurious prerequisite chains...")
85
 
86
+ spurious_chains = {
87
+ ("MATH1365", "CS2800"), # Not a real prereq
88
+ }
89
+
90
+ removed = 0
91
+ for prereq, course in spurious_chains:
92
+ if self.graph.has_edge(prereq, course):
93
+ self.graph.remove_edge(prereq, course)
94
+ print(f" ✅ REMOVED: {prereq} {course}")
95
+ removed += 1
96
+
97
+ if removed == 0:
98
+ print(" ✅ No spurious chains found")
99
 
100
+ def calculate_and_add_complexity(self):
101
+ """Calculates and adds complexity score to each course."""
102
+ print("\n🧮 Calculating complexity scores...")
103
+
104
+ for node in self.graph.nodes():
105
+ in_degree = self.graph.in_degree(node)
106
+ out_degree = self.graph.out_degree(node)
107
 
108
+ # Complexity heuristic: weighted by prerequisites and courses unlocked
109
+ score = (in_degree * 10) + (out_degree * 5)
110
+ nx.set_node_attributes(self.graph, {node: {'complexity': score}})
111
+
112
+ print("✅ Complexity scores calculated")
 
 
 
113
 
114
+ def validate_critical_courses(self) -> Dict[str, Set[str]]:
115
+ """Check if all critical courses exist in the graph."""
116
+ print("\n🎯 Validating critical course coverage...")
117
+
118
+ required_courses = {
119
+ "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
120
+ "core": {"CS3000", "CS3500", "CS3650", "CS3700", "CS3200"},
121
+ "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
122
+ "systems": {"CS4730", "CS4400", "CS4500"}, # Removed often-missing courses
123
+ "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
124
+ "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"}, # No STAT courses at NEU
125
+ }
126
+
127
+ missing = {}
128
+ for category, courses in required_courses.items():
129
+ missing_in_cat = courses - set(self.graph.nodes())
130
+ if missing_in_cat:
131
+ missing[category] = missing_in_cat
132
+ print(f" ⚠️ {category}: Missing {missing_in_cat}")
133
+ else:
134
+ print(f" ✅ {category}: All courses present")
135
+
136
+ return missing
137
 
138
  def save_enriched_graph(self, output_path):
139
  """Saves the final, clean, and enriched graph."""
140
+ print(f"\n💾 Saving cleaned graph to {output_path}...")
141
  with open(output_path, 'wb') as f:
142
  pickle.dump(self.graph, f)
143
+ print("✅ Graph saved")
144
+
145
+ # Save a summary report
146
+ report_path = output_path.replace('.pkl', '_report.txt')
147
+ with open(report_path, 'w') as f:
148
+ f.write("Curriculum Graph Analysis Report\n")
149
+ f.write("="*70 + "\n\n")
150
+ f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
151
+ f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")
152
+
153
+ # Subject breakdown
154
+ from collections import defaultdict
155
+ subject_counts = defaultdict(int)
156
+ for node in self.graph.nodes():
157
+ subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
158
+ subject_counts[subject] += 1
159
+
160
+ f.write("Subject breakdown:\n")
161
+ for subject in sorted(subject_counts.keys()):
162
+ f.write(f" {subject}: {subject_counts[subject]}\n")
163
+
164
+ print(f"✅ Report saved to {report_path}")
165
 
166
  def main(args):
167
  """Main execution flow."""
168
  analyzer = CurriculumAnalyzer(args.graph, args.courses)
 
 
169
  analyzer.pre_filter_graph()
170
+ analyzer.fix_chains()
171
+ analyzer.remove_spurious_chains()
172
  analyzer.calculate_and_add_complexity()
173
 
174
+ missing = analyzer.validate_critical_courses()
175
 
176
+ if missing:
177
+ print("\n⚠️ WARNING: Some critical courses are missing!")
178
+ print(" Consider re-scraping with additional terms or subjects.")
179
+ print(" Missing courses will be excluded from planning.")
180
+
181
+ analyzer.save_enriched_graph(args.output_graph)
182
+ print("\n✨ Analysis complete!")
183
 
184
  if __name__ == "__main__":
185
+ parser = argparse.ArgumentParser(description="NEU Curriculum Analyzer - Cleans and validates data")
186
+ parser.add_argument('--graph', required=True, help="Path to RAW curriculum graph")
187
+ parser.add_argument('--courses', required=True, help="Path to RAW courses data")
188
+ parser.add_argument('--output-graph', default='neu_graph_clean.pkl', help="Output path")
189
  args = parser.parse_args()
190
+ main(args)
src/curriculum_optimizer.py CHANGED
@@ -1,7 +1,6 @@
1
  """
2
  Fixed Hybrid Curriculum Optimizer
3
- Actually personalizes plans based on student profile
4
- WITH MUTUAL EXCLUSION AND SEQUENCE VALIDATION
5
  """
6
  import torch
7
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
@@ -27,10 +26,15 @@ class StudentProfile:
27
 
28
  class HybridOptimizer:
29
  """
30
- Fixed optimizer with proper course sequencing and mutual exclusion
31
  """
32
 
33
- # COURSE TRACKS - Mutually exclusive sequences
 
 
 
 
 
34
  COURSE_TRACKS = {
35
  "physics": {
36
  "engineering": ["PHYS1151", "PHYS1155"],
@@ -43,83 +47,56 @@ class HybridOptimizer:
43
  }
44
  }
45
 
46
- # CONCENTRATION REQUIREMENTS - Structured with pick lists
47
  CONCENTRATION_REQUIREMENTS = {
48
  "ai_ml": {
49
  "foundations": {
50
- "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
 
51
  },
52
  "core": {
53
  "required": ["CS3000", "CS3500"],
54
- "pick_1_from": ["CS3200", "CS3650", "CS3700"]
55
  },
56
  "concentration_specific": {
57
  "required": ["CS4100", "DS4400"],
58
  "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
59
- "pick_1_systems": ["CS4730", "CS4700", "CS4750"]
60
  },
61
  "math": {
62
  "required": ["MATH1341", "MATH1342"],
63
- "pick_1_from": ["MATH2331", "MATH3081", "STAT315"]
64
  }
65
  },
66
  "systems": {
67
- "foundations": {
68
- "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
69
- },
70
- "core": {
71
- "required": ["CS3000", "CS3500", "CS3650"],
72
- "pick_1_from": ["CS3700", "CS3200"]
73
- },
74
- "concentration_specific": {
75
- "required": ["CS4700"],
76
- "pick_2_from": ["CS4730", "CS4750", "CS4770"],
77
- "pick_1_from": ["CS4400", "CS4500", "CS4520"]
78
- },
79
- "math": {
80
- "required": ["MATH1341", "MATH1342"]
81
- }
82
  },
83
  "security": {
84
- "foundations": {
85
- "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
86
- },
87
- "core": {
88
- "required": ["CS3000", "CS3650", "CY2550"],
89
- "pick_1_from": ["CS3700", "CS3500"]
90
- },
91
- "concentration_specific": {
92
- "required": ["CY3740"],
93
- "pick_2_from": ["CY4740", "CY4760", "CY4770"],
94
- "pick_1_from": ["CS4700", "CS4730"]
95
- },
96
- "math": {
97
- "required": ["MATH1342"],
98
- "pick_1_from": ["MATH3527", "MATH3081"]
99
- }
100
  }
101
  }
102
 
103
  def __init__(self):
104
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
105
-
106
- # Use smaller model for efficiency
107
  self.model_name = "meta-llama/Llama-3.1-8B-Instruct"
108
  self.embedding_model_name = 'BAAI/bge-large-en-v1.5'
109
-
110
  self.llm = None
111
  self.tokenizer = None
112
  self.embedding_model = None
113
  self.curriculum_graph = None
114
  self.courses = {}
115
-
 
116
  def load_models(self):
117
- """Load embedding model and optionally LLM"""
118
  print("Loading embedding model...")
119
  self.embedding_model = SentenceTransformer(self.embedding_model_name, device=self.device)
120
 
121
  def load_llm(self):
122
- """Load LLM separately for when needed"""
123
  if self.device.type == 'cuda' and self.llm is None:
124
  print("Loading LLM for intelligent planning...")
125
  quant_config = BitsAndBytesConfig(
@@ -134,272 +111,218 @@ class HybridOptimizer:
134
  quantization_config=quant_config,
135
  device_map="auto"
136
  )
137
-
138
  def load_data(self, graph: nx.DiGraph):
139
- """Load and preprocess curriculum data"""
140
  self.curriculum_graph = graph
141
  self.courses = dict(graph.nodes(data=True))
142
-
143
- # Filter valid courses
144
  self.valid_courses = []
145
  course_texts = []
146
 
 
 
 
 
 
 
 
 
147
  for cid, data in self.courses.items():
148
- # Skip labs/recitations
149
  name = data.get('name', '')
150
- if any(skip in name for skip in ['Lab', 'Recitation', 'Seminar', 'Practicum']):
151
  continue
152
-
153
- # Skip grad level
154
- if self._get_level(cid) >= 5000:
155
  continue
156
 
157
  self.valid_courses.append(cid)
158
  course_texts.append(f"{name} {data.get('description', '')}")
159
 
160
- # Precompute embeddings
 
 
 
161
  print(f"Computing embeddings for {len(self.valid_courses)} courses...")
162
- self.course_embeddings = self.embedding_model.encode(
163
- course_texts,
164
- convert_to_tensor=True,
165
- show_progress_bar=True
166
- )
167
-
168
- def _get_track_commitment(self, completed: Set[str], track_type: str) -> Optional[str]:
169
- """Once a student takes one course in a track, commit to that track"""
170
- tracks = self.COURSE_TRACKS.get(track_type, {})
171
- for track_name, courses in tracks.items():
172
- if any(c in completed for c in courses):
173
- return track_name
174
- return None
175
-
 
 
 
 
 
 
 
 
176
  def _validate_sequence(self, selected: List[str], candidate: str) -> bool:
177
- """Ensure course sequences stay consistent - no mixing tracks"""
178
  for track_type, tracks in self.COURSE_TRACKS.items():
179
  for track_name, sequence in tracks.items():
180
  if candidate in sequence:
181
- # Check if any course from different track already selected
182
  for other_track, other_seq in tracks.items():
183
- if other_track != track_name:
184
- if any(c in selected for c in other_seq):
185
- return False # Don't mix sequences
186
  return True
187
-
188
- def validate_plan(self, plan: Dict) -> Dict[str, List[str]]:
189
- """Validate a plan for consistency and requirements"""
190
- issues = {
191
- "errors": [],
192
- "warnings": [],
193
- "info": []
194
- }
195
-
196
- all_courses = []
197
- for year_key, year_data in plan.items():
198
- if isinstance(year_data, dict) and year_key.startswith("year_"):
199
- all_courses.extend(year_data.get("fall", []))
200
- all_courses.extend(year_data.get("spring", []))
201
-
202
- # Check for sequence mixing
203
- for track_type, tracks in self.COURSE_TRACKS.items():
204
- tracks_used = set()
205
- for track_name, courses in tracks.items():
206
- if any(c in all_courses for c in courses):
207
- tracks_used.add(track_name)
208
-
209
- if len(tracks_used) > 1:
210
- issues["errors"].append(
211
- f"Mixed {track_type} tracks: {', '.join(tracks_used)}. Must choose one sequence."
212
- )
213
 
214
- # Check prerequisites are satisfied
215
- completed = set()
216
- for year in range(1, 5):
217
- for sem in ["fall", "spring"]:
218
- year_key = f"year_{year}"
219
- if year_key in plan:
220
- courses = plan[year_key].get(sem, [])
221
- for course in courses:
222
- if course in self.curriculum_graph:
223
- prereqs = set(self.curriculum_graph.predecessors(course))
224
- missing = prereqs - completed
225
- if missing:
226
- issues["errors"].append(
227
- f"{course} in Year {year} {sem} missing prereqs: {', '.join(missing)}"
228
- )
229
- completed.update(courses)
230
 
231
- return issues
232
-
233
- def generate_llm_plan(self, student: StudentProfile) -> Dict:
234
- """Generate AI-powered plan with LLM course selection"""
235
- print("--- Generating AI-Optimized Plan ---")
236
 
237
- # Ensure LLM is loaded
238
- self.load_llm()
239
 
240
- if not self.llm:
241
- print("LLM not available, falling back to enhanced rule-based plan")
242
- return self.generate_enhanced_rule_plan(student)
 
 
 
 
243
 
244
- # Step 1: Identify track
245
- track = self._identify_track(student)
246
- print(f"Identified track: {track}")
247
 
248
- # Step 2: Get LLM-suggested courses
249
- llm_suggestions = self._get_llm_course_suggestions(student, track)
 
250
 
251
- # Step 3: Build plan using LLM suggestions + rules
252
- plan = self._build_structured_plan(student, track, llm_suggestions)
 
 
253
 
254
- # Step 4: Validate plan
255
- validation = self.validate_plan(plan)
256
- if validation["errors"]:
257
- print(f"Plan validation errors: {validation['errors']}")
258
- # Try to fix errors
259
- plan = self._fix_plan_errors(plan, validation, student)
260
 
261
- # Step 5: Generate explanation
262
- explanation = self._generate_explanation(student, plan, track, "AI-optimized")
263
 
264
- return self._finalize_plan(plan, explanation, validation)
265
-
266
  def generate_simple_plan(self, student: StudentProfile) -> Dict:
267
- """Generate rule-based plan that considers student preferences"""
268
  print("--- Generating Enhanced Rule-Based Plan ---")
 
269
  return self.generate_enhanced_rule_plan(student)
270
-
271
- def generate_enhanced_rule_plan(self, student: StudentProfile) -> Dict:
272
- """Enhanced rule-based plan with proper sequencing"""
273
 
274
- # Step 1: Identify track
 
275
  track = self._identify_track(student)
276
-
277
- # Step 2: Build structured plan
278
  plan = self._build_structured_plan(student, track, None)
 
279
 
280
- # Step 3: Validate
281
- validation = self.validate_plan(plan)
282
  if validation["errors"]:
283
  plan = self._fix_plan_errors(plan, validation, student)
284
- validation = self.validate_plan(plan) # Re-validate
285
 
286
- # Step 4: Generate explanation
287
  difficulty_level = self._map_difficulty(student.preferred_difficulty)
288
  courses_per_semester = self._calculate_course_load(student.time_commitment)
289
  explanation = f"Personalized {track} track ({difficulty_level} difficulty, {courses_per_semester} courses/semester)"
290
 
291
  return self._finalize_plan(plan, explanation, validation)
292
-
293
- def _build_structured_plan(
294
- self,
295
- student: StudentProfile,
296
- track: str,
297
- llm_suggestions: Optional[List[str]] = None
298
- ) -> Dict:
299
- """Build plan using structured concentration requirements"""
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  completed = set(student.completed_courses)
302
  plan = {}
303
  requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])
304
 
305
- # Determine course load
306
  courses_per_semester = self._calculate_course_load(student.time_commitment)
307
 
308
- # Track which requirements have been satisfied
309
- required_queue = []
310
- pick_lists = []
311
-
312
- # Build queue of required courses
313
  for category, reqs in requirements.items():
314
  if "required" in reqs:
315
- required_queue.extend(reqs["required"])
316
-
317
- # Handle pick lists
318
  for key, courses in reqs.items():
319
  if key.startswith("pick_"):
320
- num_to_pick = int(re.search(r'\d+', key).group()) if re.search(r'\d+', key) else 1
321
- pick_lists.append({
322
- "courses": courses,
323
- "num_to_pick": num_to_pick,
324
- "category": category
325
- })
326
-
327
- # Handle course track commitments (physics/calculus)
328
- physics_track = self._get_track_commitment(completed, "physics")
329
- calc_track = self._get_track_commitment(completed, "calculus")
330
-
331
- # Build semesters
332
  for sem_num in range(1, 9):
333
  year = ((sem_num - 1) // 2) + 1
334
- is_fall = (sem_num % 2) == 1
335
 
336
- available = self._get_available_courses(completed, year)
337
- selected = []
338
 
339
- # Apply track commitments
340
- if not physics_track and year <= 2:
341
- # Choose physics track based on difficulty preference
342
- if student.preferred_difficulty == "challenging":
343
- physics_track = "engineering"
344
- else:
345
- physics_track = "science"
346
 
347
- # Priority 1: Required courses
348
- for course in required_queue[:]:
349
- if course in available and len(selected) < courses_per_semester:
350
- if self._validate_sequence(selected, course):
351
- selected.append(course)
352
- required_queue.remove(course)
353
- available.remove(course)
354
-
355
- # Priority 2: Handle pick lists
356
- for pick_list in pick_lists:
357
- if len(selected) >= courses_per_semester:
358
- break
359
-
360
- # Filter available courses from this pick list
361
- available_from_list = [c for c in pick_list["courses"] if c in available]
362
 
363
- # Use LLM suggestions if available
364
- if llm_suggestions:
365
- # Prioritize LLM-suggested courses
366
- for suggested in llm_suggestions:
367
- if suggested in available_from_list and pick_list["num_to_pick"] > 0:
368
- if self._validate_sequence(selected, suggested):
369
- selected.append(suggested)
370
- available.remove(suggested)
371
- pick_list["num_to_pick"] -= 1
372
-
373
- # Fill remaining slots
374
- for course in available_from_list[:pick_list["num_to_pick"]]:
375
- if len(selected) < courses_per_semester and course in available:
376
- if self._validate_sequence(selected, course):
377
- selected.append(course)
378
- available.remove(course)
379
- pick_list["num_to_pick"] -= 1
380
-
381
- # Priority 3: Track-specific courses (physics/calc)
382
- if physics_track and year <= 2:
383
- physics_courses = self.COURSE_TRACKS["physics"].get(physics_track, [])
384
- for course in physics_courses:
385
- if course in available and len(selected) < courses_per_semester:
386
- selected.append(course)
387
- available.remove(course)
388
-
389
- # Priority 4: Fill with electives
390
- if len(selected) < courses_per_semester and available:
391
- semantic_scores = self._compute_semantic_scores(student)
392
- electives = sorted(
393
- available,
394
- key=lambda c: self._score_elective(c, semantic_scores, completed),
395
  reverse=True
396
  )
397
 
398
- for elective in electives:
399
- if len(selected) >= courses_per_semester:
400
- break
401
- if self._validate_sequence(selected, elective):
402
- selected.append(elective)
 
 
 
 
 
 
 
 
 
 
 
403
 
404
  # Add to plan
405
  if selected:
@@ -407,310 +330,184 @@ class HybridOptimizer:
407
  if year_key not in plan:
408
  plan[year_key] = {}
409
 
410
- sem_type = 'fall' if is_fall else 'spring'
411
- plan[year_key][sem_type] = selected[:courses_per_semester]
412
  completed.update(selected)
413
 
414
  return plan
415
-
416
- def _fix_plan_errors(self, plan: Dict, validation: Dict, student: StudentProfile) -> Dict:
417
- """Attempt to fix validation errors in a plan"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
- # For now, if there are sequence mixing errors, rebuild with enforced consistency
420
- if any("Mixed" in error for error in validation["errors"]):
421
- print("Fixing sequence mixing errors...")
 
 
 
422
 
423
- # Find which tracks were mixed and pick the first one
424
- for error in validation["errors"]:
425
- if "Mixed physics" in error:
426
- # Force engineering track (most common)
427
- self.COURSE_TRACKS["physics"] = {"engineering": ["PHYS1151", "PHYS1155"]}
428
- elif "Mixed calculus" in error:
429
- # Force standard calc
430
- self.COURSE_TRACKS["calculus"] = {"standard": ["MATH1341", "MATH1342"]}
431
 
432
- # Rebuild plan with enforced tracks
433
- return self._build_structured_plan(student, self._identify_track(student), None)
 
 
 
 
 
 
 
 
 
 
434
 
 
 
 
 
 
435
  return plan
436
-
437
- def _get_llm_course_suggestions(self, student: StudentProfile, track: str) -> List[str]:
438
- """Use LLM to suggest personalized course priorities"""
439
-
440
- requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])
441
 
442
- # Gather all elective options from pick lists
443
- all_options = []
444
- for category, reqs in requirements.items():
 
445
  for key, courses in reqs.items():
446
- if key.startswith("pick_"):
447
- all_options.extend(courses)
448
-
449
- # Create course options text
450
- course_options = []
451
- for cid in all_options[:10]: # Limit to avoid token limits
452
- if cid in self.courses:
453
- name = self.courses[cid].get('name', cid)
454
- desc = self.courses[cid].get('description', '')[:100]
455
- course_options.append(f"{cid}: {name} - {desc}")
456
-
457
- prompt = f"""You are a curriculum advisor. Given this student profile, rank the TOP 5 most relevant courses from the options below.
458
-
459
- Student Profile:
460
- - Career Goal: {student.career_goals}
461
- - Interests: {', '.join(student.interests)}
462
- - Time Commitment: {student.time_commitment} hours/week
463
- - Preferred Difficulty: {student.preferred_difficulty}
464
- - Current GPA: {student.current_gpa}
465
-
466
- Available Courses:
467
- {chr(10).join(course_options)}
468
-
469
- Return ONLY the top 5 course IDs in order of priority, one per line. Example:
470
- CS4100
471
- DS4400
472
- CS4120
473
- CS4180
474
- DS4440"""
475
-
476
  try:
477
- inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)
478
-
479
  with torch.no_grad():
480
- outputs = self.llm.generate(
481
- **inputs,
482
- max_new_tokens=100,
483
- temperature=0.3,
484
- do_sample=True,
485
- pad_token_id=self.tokenizer.eos_token_id
486
- )
487
-
488
  response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
489
-
490
- # Extract course IDs
491
- suggested_courses = []
492
- for line in response.strip().split('\n'):
493
- line = line.strip()
494
- match = re.search(r'([A-Z]{2,4}\d{4})', line)
495
- if match:
496
- suggested_courses.append(match.group(1))
497
-
498
  return suggested_courses[:5]
499
-
500
  except Exception as e:
501
  print(f"LLM suggestion failed: {e}")
502
- return all_options[:5] # Fallback
503
-
504
  def _map_difficulty(self, preferred_difficulty: str) -> str:
505
- """Map UI difficulty to internal levels"""
506
- mapping = {
507
- "easy": "easy",
508
- "moderate": "medium",
509
- "challenging": "hard"
510
- }
511
- return mapping.get(preferred_difficulty.lower(), "medium")
512
-
513
  def _calculate_course_load(self, time_commitment: int) -> int:
514
- """Calculate courses per semester based on time commitment"""
515
- if time_commitment < 20:
516
- return 3 # Part-time
517
- elif time_commitment < 30:
518
- return 4 # Standard
519
- elif time_commitment < 40:
520
- return 4 # Standard-heavy
521
- else:
522
- return 4 # Max (prerequisites limit anyway)
523
-
524
- def _identify_track(self, student: StudentProfile) -> str:
525
- """Use embeddings to identify best track"""
526
 
 
 
 
 
 
 
 
527
  profile_text = f"{student.career_goals} {' '.join(student.interests)}"
528
  profile_emb = self.embedding_model.encode(profile_text, convert_to_tensor=True)
529
-
530
  track_descriptions = {
531
- "ai_ml": "artificial intelligence machine learning deep learning neural networks data science NLP computer vision LLM",
532
- "systems": "operating systems distributed systems networks compilers databases performance optimization backend",
533
- "security": "cybersecurity cryptography penetration testing security vulnerabilities network security ethical hacking"
534
  }
535
-
536
- best_track = "ai_ml"
537
- best_score = -1
538
-
539
  for track, description in track_descriptions.items():
540
  track_emb = self.embedding_model.encode(description, convert_to_tensor=True)
541
  score = float(util.cos_sim(profile_emb, track_emb))
542
  if score > best_score:
543
- best_score = score
544
- best_track = track
545
-
546
  return best_track
547
-
548
- def _compute_semantic_scores(self, student: StudentProfile) -> Dict[str, float]:
549
- """Compute semantic alignment for all courses"""
550
 
 
551
  query_text = f"{student.career_goals} {' '.join(student.interests)}"
552
  query_emb = self.embedding_model.encode(query_text, convert_to_tensor=True)
553
-
554
  similarities = util.cos_sim(query_emb, self.course_embeddings)[0]
 
555
 
556
- scores = {}
557
- for idx, cid in enumerate(self.valid_courses):
558
- scores[cid] = float(similarities[idx])
559
-
560
- return scores
561
-
562
- def _get_available_courses(self, completed: Set[str], year: int) -> List[str]:
563
- """Get schedulable courses with year restrictions"""
564
-
565
- available = []
566
- max_level = 2999 if year == 1 else 3999 if year == 2 else 9999
567
-
568
- for cid in self.valid_courses:
569
- if cid in completed:
570
- continue
571
-
572
- if self._get_level(cid) > max_level:
573
- continue
574
-
575
- # Check prerequisites
576
- if cid in self.curriculum_graph:
577
- prereqs = set(self.curriculum_graph.predecessors(cid))
578
- if not prereqs.issubset(completed):
579
- continue
580
-
581
- available.append(cid)
582
-
583
- return available
584
-
585
- def _score_elective(
586
- self,
587
- course_id: str,
588
- semantic_scores: Dict[str, float],
589
- completed: Set[str]
590
- ) -> float:
591
- """Basic elective scoring"""
592
-
593
- score = 0.0
594
-
595
- # Semantic alignment (50%)
596
- score += semantic_scores.get(course_id, 0) * 0.5
597
-
598
- # Unlocks future courses (30%)
599
- if course_id in self.curriculum_graph:
600
- unlocks = len(list(self.curriculum_graph.successors(course_id)))
601
- score += min(unlocks / 5, 1.0) * 0.3
602
-
603
- # Subject relevance (20%)
604
- subject = self.courses.get(course_id, {}).get('subject', '')
605
- subject_scores = {"CS": 1.0, "DS": 0.9, "IS": 0.6, "MATH": 0.7, "CY": 0.8}
606
- score += subject_scores.get(subject, 0.3) * 0.2
607
-
608
- return score
609
-
610
  def _generate_explanation(self, student: StudentProfile, plan: Dict, track: str, plan_type: str) -> str:
611
- """Generate explanation using LLM if available"""
 
 
 
 
612
 
613
- if not self.llm:
614
- return f"{plan_type} {track} track plan for {student.career_goals}"
615
-
616
- # Count courses
617
- total_courses = sum(
618
- len(plan.get(f"year_{y}", {}).get(sem, []))
619
- for y in range(1, 5)
620
- for sem in ["fall", "spring"]
621
- )
622
-
623
- prompt = f"""Explain this curriculum plan in 1-2 sentences:
624
- Plan Type: {plan_type}
625
- Track: {track}
626
- Student Goal: {student.career_goals}
627
- Interests: {', '.join(student.interests[:2])}
628
- Difficulty: {student.preferred_difficulty}
629
- Time Commitment: {student.time_commitment}h/week
630
- Total Courses: {total_courses}
631
 
632
- Be specific about how the plan matches their preferences."""
 
 
 
 
 
 
 
 
 
 
 
 
633
 
634
- try:
635
- inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
636
-
637
- with torch.no_grad():
638
- outputs = self.llm.generate(
639
- **inputs,
640
- max_new_tokens=150,
641
- temperature=0.7,
642
- do_sample=True,
643
- pad_token_id=self.tokenizer.eos_token_id
644
- )
645
-
646
- explanation = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
647
- return explanation.strip()
648
-
649
- except Exception as e:
650
- print(f"Explanation generation failed: {e}")
651
- return f"{plan_type} {track} track plan optimized for {student.career_goals}"
652
-
653
- def _get_level(self, course_id: str) -> int:
654
- """Extract course level"""
655
- match = re.search(r'\d+', course_id)
656
- return int(match.group()) if match else 9999
657
-
658
  def _finalize_plan(self, plan: Dict, explanation: str, validation: Dict = None) -> Dict:
659
- """Add structure, metrics, and validation to plan"""
660
-
661
- structured = {
662
- "reasoning": explanation,
663
- "validation": validation if validation else {"errors": [], "warnings": [], "info": []}
664
- }
665
-
666
- # Ensure all years present
667
  for year in range(1, 5):
668
  year_key = f"year_{year}"
669
- if year_key not in plan:
670
- plan[year_key] = {}
671
-
672
- structured[year_key] = {
673
- "fall": plan[year_key].get("fall", []),
674
- "spring": plan[year_key].get("spring", []),
675
  "summer": "co-op" if year in [2, 3] else []
676
  }
 
 
 
 
 
677
 
678
- # Calculate complexity metrics
679
- complexities = []
680
- for year_key in structured:
681
- if year_key.startswith("year_"):
682
- for sem in ["fall", "spring"]:
683
- courses = structured[year_key].get(sem, [])
684
- if courses:
685
- sem_complexity = sum(
686
- self.courses.get(c, {}).get('complexity', 50)
687
- for c in courses
688
- )
689
- complexities.append(sem_complexity)
690
-
691
- structured["complexity_analysis"] = {
692
  "average_semester_complexity": float(np.mean(complexities)) if complexities else 0,
693
  "peak_semester_complexity": float(np.max(complexities)) if complexities else 0,
694
  "total_complexity": float(np.sum(complexities)) if complexities else 0,
695
  "balance_score (std_dev)": float(np.std(complexities)) if complexities else 0
696
  }
697
-
698
- # Add metadata
699
- structured["metadata"] = {
700
  "generated": datetime.now().isoformat(),
701
  "valid": len(validation.get("errors", [])) == 0 if validation else True,
702
- "has_warnings": len(validation.get("warnings", [])) > 0 if validation else False
703
  }
704
-
705
- return {"pathway": structured}
706
 
707
- # Backward compatibility wrapper
708
  class CurriculumOptimizer(HybridOptimizer):
709
- """Compatibility wrapper"""
710
-
711
  def __init__(self):
712
  super().__init__()
713
 
714
  def generate_plan(self, student: StudentProfile) -> Dict:
715
- """Default plan generation - uses enhanced rules"""
716
  return self.generate_enhanced_rule_plan(student)
 
1
  """
2
  Fixed Hybrid Curriculum Optimizer
3
+ WITH PROPER COURSE DISCOVERY, SUBJECT-AWARE SCORING, AND CONCENTRATION FOCUS
 
4
  """
5
  import torch
6
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
26
 
27
  class HybridOptimizer:
28
  """
29
+ Fixed optimizer with subject-aware scoring and concentration focus
30
  """
31
 
32
+ EQUIVALENCY_GROUPS = [
33
+ {"MATH1341", "MATH1241", "MATH1231"}, # Calculus 1
34
+ {"MATH1342", "MATH1242"}, # Calculus 2
35
+ {"PHYS1151", "PHYS1161", "PHYS1145"}, # Physics 1
36
+ {"PHYS1155", "PHYS1165", "PHYS1147"}, # Physics 2
37
+ ]
38
  COURSE_TRACKS = {
39
  "physics": {
40
  "engineering": ["PHYS1151", "PHYS1155"],
 
47
  }
48
  }
49
 
 
50
  CONCENTRATION_REQUIREMENTS = {
51
  "ai_ml": {
52
  "foundations": {
53
+ "required": ["CS1800", "CS2500", "CS2510", "CS2800"],
54
+ "sequence": True
55
  },
56
  "core": {
57
  "required": ["CS3000", "CS3500"],
58
+ "pick_1_from": ["CS3200", "CS3650", "CS5700"]
59
  },
60
  "concentration_specific": {
61
  "required": ["CS4100", "DS4400"],
62
  "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
63
+ "pick_1_systems": ["CS4730", "CS4700"]
64
  },
65
  "math": {
66
  "required": ["MATH1341", "MATH1342"],
67
+ "pick_1_from": ["MATH2331", "MATH3081"]
68
  }
69
  },
70
  "systems": {
71
+ "foundations": { "required": ["CS1800", "CS2500", "CS2510", "CS2800"] },
72
+ "core": { "required": ["CS3000", "CS3500", "CS3650"], "pick_1_from": ["CS5700", "CS3200"] },
73
+ "concentration_specific": { "required": ["CS4700"], "pick_2_from": ["CS4730"], "pick_1_from": ["CS4400", "CS4500", "CS4520"] },
74
+ "math": { "required": ["MATH1341", "MATH1342"] }
 
 
 
 
 
 
 
 
 
 
 
75
  },
76
  "security": {
77
+ "foundations": { "required": ["CS1800", "CS2500", "CS2510", "CS2800"] },
78
+ "core": { "required": ["CS3000", "CS3650", "CY2550"], "pick_1_from": ["CS5700", "CS3500"] },
79
+ "concentration_specific": { "required": ["CY3740"], "pick_2_from": ["CY4740", "CY4760", "CY4770"], "pick_1_from": ["CS4700", "CS4730"] },
80
+ "math": { "required": ["MATH1342"], "pick_1_from": ["MATH3527", "MATH3081"] }
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
  }
83
 
84
  def __init__(self):
85
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
86
  self.model_name = "meta-llama/Llama-3.1-8B-Instruct"
87
  self.embedding_model_name = 'BAAI/bge-large-en-v1.5'
 
88
  self.llm = None
89
  self.tokenizer = None
90
  self.embedding_model = None
91
  self.curriculum_graph = None
92
  self.courses = {}
93
+ self.current_student = None
94
+
95
  def load_models(self):
 
96
  print("Loading embedding model...")
97
  self.embedding_model = SentenceTransformer(self.embedding_model_name, device=self.device)
98
 
99
  def load_llm(self):
 
100
  if self.device.type == 'cuda' and self.llm is None:
101
  print("Loading LLM for intelligent planning...")
102
  quant_config = BitsAndBytesConfig(
 
111
  quantization_config=quant_config,
112
  device_map="auto"
113
  )
114
+
115
  def load_data(self, graph: nx.DiGraph):
 
116
  self.curriculum_graph = graph
117
  self.courses = dict(graph.nodes(data=True))
118
+ UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}
 
119
  self.valid_courses = []
120
  course_texts = []
121
 
122
+ concentration_courses = set()
123
+ for track_reqs in self.CONCENTRATION_REQUIREMENTS.values():
124
+ for category, reqs in track_reqs.items():
125
+ if isinstance(reqs, dict):
126
+ for key, courses in reqs.items():
127
+ if isinstance(courses, list):
128
+ concentration_courses.update(courses)
129
+
130
  for cid, data in self.courses.items():
 
131
  name = data.get('name', '')
132
+ if not name or name.strip() == '' or any(skip in name.lower() for skip in ['lab', 'recitation', 'seminar', 'practicum']):
133
  continue
134
+
135
+ course_level = self._get_level(cid)
136
+ if course_level >= 5000 and cid not in UNDERGRAD_ACCESSIBLE_GRAD:
137
  continue
138
 
139
  self.valid_courses.append(cid)
140
  course_texts.append(f"{name} {data.get('description', '')}")
141
 
142
+ missing_required = concentration_courses - set(self.valid_courses)
143
+ if missing_required:
144
+ print(f"\n⚠️ WARNING: {len(missing_required)} required courses missing from graph: {sorted(missing_required)}\n")
145
+
146
  print(f"Computing embeddings for {len(self.valid_courses)} courses...")
147
+ self.course_embeddings = self.embedding_model.encode(course_texts, convert_to_tensor=True, show_progress_bar=True)
148
+ print(f"\nTotal valid courses: {len(self.valid_courses)}")
149
+
150
+ def _get_level(self, course_id: str) -> int:
151
+ match = re.search(r'\d+', course_id)
152
+ return int(match.group()) if match else 9999
153
+
154
+ def _get_completed_with_equivalents(self, completed: Set[str]) -> Set[str]:
155
+ expanded_completed = completed.copy()
156
+ for course in completed:
157
+ for group in self.EQUIVALENCY_GROUPS:
158
+ if course in group:
159
+ expanded_completed.update(group)
160
+ return expanded_completed
161
+
162
+ def _can_take_course(self, course_id: str, completed: Set[str]) -> bool:
163
+ effective_completed = self._get_completed_with_equivalents(completed)
164
+ if course_id not in self.curriculum_graph:
165
+ return True
166
+ prereqs = set(self.curriculum_graph.predecessors(course_id))
167
+ return prereqs.issubset(effective_completed)
168
+
169
  def _validate_sequence(self, selected: List[str], candidate: str) -> bool:
 
170
  for track_type, tracks in self.COURSE_TRACKS.items():
171
  for track_name, sequence in tracks.items():
172
  if candidate in sequence:
 
173
  for other_track, other_seq in tracks.items():
174
+ if other_track != track_name and any(c in selected for c in other_seq):
175
+ return False
 
176
  return True
177
+
178
+ def _score_course(self, course_id: str, semantic_scores: Dict[str, float], required_set: Set[str], picklist_set: Set[str]) -> float:
179
+ """FIXED: Proper scoring with IS heavy penalty"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ if course_id not in self.courses or not self.courses[course_id].get('name', '').strip():
182
+ return -10000.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ course_data = self.courses[course_id]
185
+ subject = course_data.get('subject', '')
 
 
 
186
 
187
+ score = 0.0
 
188
 
189
+ # Subject bonuses/penalties
190
+ if subject in ["CS", "DS", "CY"]:
191
+ score += 300.0
192
+ elif subject == "MATH":
193
+ score += 100.0
194
+ else:
195
+ score -= 1000.0 # Heavy penalty for everything else (including IS)
196
 
197
+ # Required courses: massive boost
198
+ if course_id in required_set:
199
+ score += 10000.0 # INCREASED from 1000
200
 
201
+ # Pick-list courses: high boost
202
+ if course_id in picklist_set:
203
+ score += 5000.0 # INCREASED from 500
204
 
205
+ # Unlocking factor (reduced weight)
206
+ if course_id in self.curriculum_graph:
207
+ unlocks = self.curriculum_graph.out_degree(course_id)
208
+ score += min(unlocks, 5) * 2.0 # REDUCED
209
 
210
+ # Level preference
211
+ level = self._get_level(course_id)
212
+ score -= (level / 100.0)
 
 
 
213
 
214
+ # Semantic alignment (reduced weight)
215
+ score += semantic_scores.get(course_id, 0.0) * 5.0 # REDUCED from 15
216
 
217
+ return score
218
+
219
  def generate_simple_plan(self, student: StudentProfile) -> Dict:
 
220
  print("--- Generating Enhanced Rule-Based Plan ---")
221
+ self.current_student = student
222
  return self.generate_enhanced_rule_plan(student)
 
 
 
223
 
224
+ def generate_enhanced_rule_plan(self, student: StudentProfile) -> Dict:
225
+ self.current_student = student
226
  track = self._identify_track(student)
 
 
227
  plan = self._build_structured_plan(student, track, None)
228
+ validation = self.validate_plan(plan, student)
229
 
 
 
230
  if validation["errors"]:
231
  plan = self._fix_plan_errors(plan, validation, student)
232
+ validation = self.validate_plan(plan, student)
233
 
 
234
  difficulty_level = self._map_difficulty(student.preferred_difficulty)
235
  courses_per_semester = self._calculate_course_load(student.time_commitment)
236
  explanation = f"Personalized {track} track ({difficulty_level} difficulty, {courses_per_semester} courses/semester)"
237
 
238
  return self._finalize_plan(plan, explanation, validation)
239
+
240
+ def generate_llm_plan(self, student: StudentProfile) -> Dict:
241
+ print("--- Generating AI-Optimized Plan ---")
242
+ self.current_student = student
243
+ self.load_llm()
244
+ if not self.llm:
245
+ return self.generate_enhanced_rule_plan(student)
246
+
247
+ track = self._identify_track(student)
248
+ llm_suggestions = self._get_llm_course_suggestions(student, track)
249
+ plan = self._build_structured_plan(student, track, llm_suggestions)
250
+ validation = self.validate_plan(plan, student)
251
+ if validation["errors"]:
252
+ plan = self._fix_plan_errors(plan, validation, student)
253
+ validation = self.validate_plan(plan, student)
254
+
255
+ explanation = self._generate_explanation(student, plan, track, "AI-optimized")
256
+ return self._finalize_plan(plan, explanation, validation)
257
+
258
+ def _build_structured_plan(self, student: StudentProfile, track: str, llm_suggestions: Optional[List[str]] = None) -> Dict:
259
+ """FIXED with hardcoded Year 2 priorities"""
260
 
261
  completed = set(student.completed_courses)
262
  plan = {}
263
  requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])
264
 
 
265
  courses_per_semester = self._calculate_course_load(student.time_commitment)
266
 
267
+ # Build required and pick sets
268
+ required_set = set()
269
+ picklist_set = set()
 
 
270
  for category, reqs in requirements.items():
271
  if "required" in reqs:
272
+ required_set.update(reqs["required"])
 
 
273
  for key, courses in reqs.items():
274
  if key.startswith("pick_"):
275
+ picklist_set.update(courses)
276
+
277
+ semantic_scores = self._compute_semantic_scores(student)
278
+
279
+ # HARDCODED FIX: Force Year 2 to prioritize core courses
280
+ YEAR2_MUST_TAKE = ["CS3000", "CS3500", "DS2500", "MATH2331", "MATH3081"]
281
+
 
 
 
 
 
282
  for sem_num in range(1, 9):
283
  year = ((sem_num - 1) // 2) + 1
 
284
 
285
+ available_courses = self._get_available_courses(completed, year, sem_num, track)
 
286
 
287
+ # Filter: must be takeable
288
+ schedulable = [
289
+ c for c in available_courses
290
+ if c not in completed and self._can_take_course(c, completed)
291
+ ]
 
 
292
 
293
+ # HARDCODED: In Year 2, force core courses to the top
294
+ if year == 2:
295
+ priority_courses = [c for c in YEAR2_MUST_TAKE if c in schedulable]
296
+ other_courses = [c for c in schedulable if c not in YEAR2_MUST_TAKE]
 
 
 
 
 
 
 
 
 
 
 
297
 
298
+ # Score priority courses separately
299
+ scored_priority = sorted(
300
+ priority_courses,
301
+ key=lambda c: self._score_course(c, semantic_scores, required_set, picklist_set),
302
+ reverse=True
303
+ )
304
+ scored_others = sorted(
305
+ other_courses,
306
+ key=lambda c: self._score_course(c, semantic_scores, required_set, picklist_set),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  reverse=True
308
  )
309
 
310
+ scored_courses = scored_priority + scored_others
311
+ else:
312
+ # Normal scoring for other years
313
+ scored_courses = sorted(
314
+ schedulable,
315
+ key=lambda c: self._score_course(c, semantic_scores, required_set, picklist_set),
316
+ reverse=True
317
+ )
318
+
319
+ # Select top N courses
320
+ selected = []
321
+ for course in scored_courses:
322
+ if len(selected) >= courses_per_semester:
323
+ break
324
+ if self._validate_sequence(selected, course):
325
+ selected.append(course)
326
 
327
  # Add to plan
328
  if selected:
 
330
  if year_key not in plan:
331
  plan[year_key] = {}
332
 
333
+ sem_type = 'fall' if (sem_num % 2) == 1 else 'spring'
334
+ plan[year_key][sem_type] = selected
335
  completed.update(selected)
336
 
337
  return plan
338
+
339
+ def _get_available_courses(self, completed: Set[str], year: int, sem_num: int = None, track: str = "ai_ml") -> List[str]:
340
+ """FIXED: Return ALL courses that COULD be taken in this year"""
341
+
342
+ # Year 1: Hardcoded foundation
343
+ if year == 1:
344
+ if not completed or len(completed) < 2:
345
+ return [c for c in ["CS1800", "CS2500", "MATH1341", "ENGW1111"] if c in self.valid_courses]
346
+ else:
347
+ next_courses = []
348
+ for course, prereq in [("CS2800", "CS1800"), ("CS2510", "CS2500"), ("MATH1342", "MATH1341"), ("DS2000", None)]:
349
+ if course in self.valid_courses and course not in completed:
350
+ if prereq is None or prereq in completed:
351
+ next_courses.append(course)
352
+ return next_courses
353
+
354
+ # Years 2-4: Filter by subject and level
355
+ available = []
356
 
357
+ # ONLY CS/DS/CY/MATH allowed
358
+ ALLOWED_SUBJECTS = {"CS", "DS", "CY", "MATH"}
359
+
360
+ for cid in self.valid_courses:
361
+ if cid in completed:
362
+ continue
363
 
364
+ course_data = self.courses.get(cid, {})
365
+ subject = course_data.get('subject')
 
 
 
 
 
 
366
 
367
+ if subject not in ALLOWED_SUBJECTS:
368
+ continue
369
+
370
+ course_level = self._get_level(cid)
371
+
372
+ # Year-based level filtering
373
+ if year == 2 and course_level > 3999:
374
+ continue # No 4000+ in Year 2
375
+ if year >= 3 and course_level < 2000:
376
+ continue # No intro courses in Years 3-4
377
+
378
+ available.append(cid)
379
 
380
+ return available
381
+
382
+ def _fix_plan_errors(self, plan: Dict, validation: Dict, student: StudentProfile) -> Dict:
383
+ if any("Mixed" in error for error in validation["errors"]):
384
+ return self._build_structured_plan(student, self._identify_track(student), None)
385
  return plan
 
 
 
 
 
386
 
387
+ def _get_llm_course_suggestions(self, student: StudentProfile, track: str) -> List[str]:
388
+ requirements = self.CONCENTRATION_REQUIREMENTS.get(track, {})
389
+ all_options = set()
390
+ for reqs in requirements.values():
391
  for key, courses in reqs.items():
392
+ if key.startswith("pick_"): all_options.update(courses)
393
+
394
+ course_options_text = [f"{cid}: {self.courses[cid].get('name', cid)} - {self.courses[cid].get('description', '')[:100].strip()}"
395
+ for cid in list(all_options)[:15] if cid in self.courses]
396
+
397
+ prompt = f"""You are an expert curriculum advisor. Based on the student profile, rank the top 5 most relevant courses from the list below.
398
+ ### Student Profile:
399
+ - **Career Goal:** {student.career_goals}
400
+ - **Interests:** {', '.join(student.interests)}
401
+ - **Preferred Difficulty:** {student.preferred_difficulty}
402
+ ### Available Elective Courses:
403
+ {chr(10).join(course_options_text)}
404
+ Return ONLY the top 5 course IDs, each on a new line.
405
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  try:
407
+ inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(self.device)
 
408
  with torch.no_grad():
409
+ outputs = self.llm.generate(**inputs, max_new_tokens=100, temperature=0.2, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
 
 
 
 
 
 
 
410
  response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
411
+ suggested_courses = re.findall(r'([A-Z]{2,4}\d{4})', response)
 
 
 
 
 
 
 
 
412
  return suggested_courses[:5]
 
413
  except Exception as e:
414
  print(f"LLM suggestion failed: {e}")
415
+ return list(all_options)[:5]
416
+
417
  def _map_difficulty(self, preferred_difficulty: str) -> str:
418
+ return {"easy": "easy", "moderate": "medium", "challenging": "hard"}.get(preferred_difficulty.lower(), "medium")
419
+
 
 
 
 
 
 
420
  def _calculate_course_load(self, time_commitment: int) -> int:
421
+ if time_commitment <= 20: return 3
422
+ if time_commitment <= 40: return 4 # Setting hours to 40 will now correctly return 4.
423
+ return 5
 
 
 
 
 
 
 
 
 
424
 
425
+ def _identify_track(self, student: StudentProfile) -> str:
426
+ if not hasattr(self, 'embedding_model') or self.embedding_model is None:
427
+ combined = f"{student.career_goals.lower()} {' '.join(student.interests).lower()}"
428
+ if any(word in combined for word in ['ai', 'ml', 'machine learning', 'data']): return "ai_ml"
429
+ if any(word in combined for word in ['systems', 'distributed', 'backend']): return "systems"
430
+ if any(word in combined for word in ['security', 'cyber']): return "security"
431
+ return "ai_ml"
432
  profile_text = f"{student.career_goals} {' '.join(student.interests)}"
433
  profile_emb = self.embedding_model.encode(profile_text, convert_to_tensor=True)
 
434
  track_descriptions = {
435
+ "ai_ml": "artificial intelligence machine learning deep learning neural networks data science",
436
+ "systems": "operating systems distributed systems networks compilers databases performance backend",
437
+ "security": "cybersecurity cryptography network security ethical hacking vulnerabilities"
438
  }
439
+ best_track, best_score = "ai_ml", -1.0
 
 
 
440
  for track, description in track_descriptions.items():
441
  track_emb = self.embedding_model.encode(description, convert_to_tensor=True)
442
  score = float(util.cos_sim(profile_emb, track_emb))
443
  if score > best_score:
444
+ best_score, best_track = score, track
 
 
445
  return best_track
 
 
 
446
 
447
+ def _compute_semantic_scores(self, student: StudentProfile) -> Dict[str, float]:
448
  query_text = f"{student.career_goals} {' '.join(student.interests)}"
449
  query_emb = self.embedding_model.encode(query_text, convert_to_tensor=True)
 
450
  similarities = util.cos_sim(query_emb, self.course_embeddings)[0]
451
+ return {cid: float(similarities[idx]) for idx, cid in enumerate(self.valid_courses)}
452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  def _generate_explanation(self, student: StudentProfile, plan: Dict, track: str, plan_type: str) -> str:
454
+ return f"{plan_type.title()} plan for the {track} track, tailored to your goal of becoming a {student.career_goals}."
455
+
456
+ def validate_plan(self, plan: Dict, student: StudentProfile = None) -> Dict[str, List[str]]:
457
+ issues = {"errors": [], "warnings": [], "info": []}
458
+ all_courses = [course for year in plan.values() for sem in year.values() for course in sem if isinstance(sem, list)]
459
 
460
+ for track_type, tracks in self.COURSE_TRACKS.items():
461
+ tracks_used = {name for name, courses in tracks.items() if any(c in all_courses for c in courses)}
462
+ if len(tracks_used) > 1:
463
+ issues["errors"].append(f"Mixed {track_type} tracks: {', '.join(tracks_used)}. Choose one sequence.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
+ completed_for_validation = set(student.completed_courses) if student else set()
466
+ for year in range(1, 5):
467
+ for sem in ["fall", "spring"]:
468
+ year_key = f"year_{year}"
469
+ sem_courses = plan.get(year_key, {}).get(sem, [])
470
+ for course in sem_courses:
471
+ if course in self.curriculum_graph:
472
+ prereqs = set(self.curriculum_graph.predecessors(course))
473
+ if not prereqs.issubset(self._get_completed_with_equivalents(completed_for_validation)):
474
+ missing = prereqs - completed_for_validation
475
+ issues["errors"].append(f"{course} in Year {year} {sem} is missing prereqs: {', '.join(missing)}")
476
+ completed_for_validation.update(sem_courses)
477
+ return issues
478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  def _finalize_plan(self, plan: Dict, explanation: str, validation: Dict = None) -> Dict:
480
+ structured_plan = {"reasoning": explanation, "validation": validation or {"errors": [], "warnings": [], "info": []}}
481
+ complexities = []
 
 
 
 
 
 
482
  for year in range(1, 5):
483
  year_key = f"year_{year}"
484
+ structured_plan[year_key] = {
485
+ "fall": plan.get(year_key, {}).get("fall", []),
486
+ "spring": plan.get(year_key, {}).get("spring", []),
 
 
 
487
  "summer": "co-op" if year in [2, 3] else []
488
  }
489
+ for sem in ["fall", "spring"]:
490
+ courses = structured_plan[year_key][sem]
491
+ if courses:
492
+ sem_complexity = sum(self.courses.get(c, {}).get('complexity', 50) for c in courses)
493
+ complexities.append(sem_complexity)
494
 
495
+ structured_plan["complexity_analysis"] = {
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  "average_semester_complexity": float(np.mean(complexities)) if complexities else 0,
497
  "peak_semester_complexity": float(np.max(complexities)) if complexities else 0,
498
  "total_complexity": float(np.sum(complexities)) if complexities else 0,
499
  "balance_score (std_dev)": float(np.std(complexities)) if complexities else 0
500
  }
501
+ structured_plan["metadata"] = {
 
 
502
  "generated": datetime.now().isoformat(),
503
  "valid": len(validation.get("errors", [])) == 0 if validation else True,
 
504
  }
505
+ return {"pathway": structured_plan}
 
506
 
 
507
  class CurriculumOptimizer(HybridOptimizer):
508
+ """Wrapper to maintain compatibility with older script calls."""
 
509
  def __init__(self):
510
  super().__init__()
511
 
512
  def generate_plan(self, student: StudentProfile) -> Dict:
 
513
  return self.generate_enhanced_rule_plan(student)
src/inspect_graph.py CHANGED
@@ -1,88 +1,265 @@
 
 
 
 
 
1
  import pickle
2
  import networkx as nx
3
- import argparse
 
4
 
5
- def inspect_graph(graph_path: str):
6
- """
7
- Loads a curriculum graph and runs diagnostic checks to verify its integrity.
8
- """
 
 
 
 
9
  try:
10
- with open(graph_path, 'rb') as f:
11
  graph = pickle.load(f)
12
- print(f"✅ Successfully loaded graph '{graph_path}'")
13
- print(f" - Total Courses (Nodes): {graph.number_of_nodes()}")
14
- print(f" - Prerequisite Links (Edges): {graph.number_of_edges()}")
15
- except FileNotFoundError:
16
- print(f"❌ ERROR: File not found at '{graph_path}'. Please check the path.")
17
- return
18
  except Exception as e:
19
- print(f"❌ ERROR: Could not load or parse the pickle file. Reason: {e}")
20
  return
21
-
22
- print("\n--- 🧐 DIAGNOSTIC CHECKS ---")
23
-
24
- # --- Check 1: Critical Prerequisite Links ---
25
- print("\n## 1. Verifying Critical Prerequisite Links...")
26
- critical_links = [
27
- ("CS1800", "CS2800"), # Discrete -> Logic & Comp
28
- ("CS2500", "CS2510"), # Fundies 1 -> Fundies 2
29
- ("CS2510", "CS3500"), # Fundies 2 -> OOD
30
- ("CS2510", "CS3000") # Fundies 2 -> Algorithms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ]
32
- all_links_ok = True
33
- for prereq, course in critical_links:
34
- if graph.has_node(prereq) and graph.has_node(course):
 
35
  if graph.has_edge(prereq, course):
36
- print(f" [PASS] Prerequisite link exists: {prereq} -> {course}")
37
  else:
38
- print(f" [FAIL] CRITICAL LINK MISSING: The graph has no link from {prereq} to {course}.")
39
- all_links_ok = False
40
  else:
41
- print(f" [WARN] One or both courses in link {prereq} -> {course} are not in the graph.")
42
- all_links_ok = False
 
 
43
 
44
- if all_links_ok:
45
- print(" -> All critical prerequisite links seem to be intact.")
46
-
47
- # --- Check 2: Foundational Courses ---
48
- print("\n## 2. Analyzing Foundational Courses (courses with no prerequisites)...")
49
- foundations = [n for n, d in graph.in_degree() if d == 0]
50
- if foundations:
51
- print(f" Found {len(foundations)} foundational courses.")
52
- cs_foundations = [c for c in foundations if c.startswith("CS")]
53
- if cs_foundations:
54
- print(f" -> Foundational CS courses: {', '.join(cs_foundations[:5])}...")
55
- else:
56
- print(" [WARN] No foundational courses with a 'CS' prefix were found. This is unusual.")
 
 
 
 
 
 
 
 
 
 
 
57
  else:
58
- print(" [FAIL] No foundational courses found. The graph may have a cycle or is structured incorrectly.")
59
-
60
- # --- Check 3: Key Course Inspection ---
61
- print("\n## 3. Inspecting Key Courses...")
62
- courses_to_inspect = ["CS2500", "CS2510", "CS3500"]
63
- for course_id in courses_to_inspect:
64
- if graph.has_node(course_id):
65
- prereqs = list(graph.predecessors(course_id))
66
- unlocks = list(graph.successors(course_id))
67
- print(f"\n - Course: {course_id} ({graph.nodes[course_id].get('name', 'N/A')})")
68
- print(f" - Prerequisites (What it needs): {prereqs or 'None'}")
69
- print(f" - Unlocks (What it leads to): {unlocks or 'None'}")
70
- else:
71
- print(f"\n - Course: {course_id} -> [NOT FOUND IN GRAPH]")
72
-
73
- print("\n--- ախ DIAGNOSIS ---")
74
- if not all_links_ok:
75
- print("Your graph is missing critical prerequisite information.")
76
- print("The planner cannot create a logical schedule without these links.")
77
- print("This issue likely originates in `neu_scraper.py` or how it parses prerequisite data from the API.")
78
  else:
79
- print("The graph structure for critical courses appears to be correct.")
80
- print("If plans are still illogical, the issue may lie in the complexity/depth attributes or the planner's sorting logic.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  if __name__ == "__main__":
83
- parser = argparse.ArgumentParser(description="Curriculum Graph Diagnostic Tool")
84
- # CORRECTED: Use a variable name for the argument
85
- parser.add_argument("graph_path", help="Path to the .pkl graph file to inspect.")
86
- args = parser.parse_args()
87
- # CORRECTED: Use the correct variable to access the argument
88
- inspect_graph(args.graph_path)
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Comprehensive Graph Data Inspector
4
+ Diagnoses all potential issues with the curriculum graph data
5
+ """
6
  import pickle
7
  import networkx as nx
8
+ from collections import defaultdict
9
+ import sys
10
 
11
def inspect_graph_thoroughly(graph_file):
    """Complete inspection of curriculum graph data.

    Loads a pickled networkx.DiGraph from *graph_file* and prints a
    multi-section diagnostic report to stdout: subject distribution,
    presence of critical courses, prerequisite-chain integrity, a
    CS2800-specific deep dive, duplicate/lab detection, 4000-level
    coverage, and a final verdict with recommended actions.

    Args:
        graph_file: Path to a pickle file containing a networkx DiGraph
            whose nodes are course IDs (e.g. "CS2500") with attributes
            such as 'subject', 'name', 'maxCredits'.

    Returns:
        None. All findings are reported via print(); the function
        returns early if the pickle cannot be loaded.
    """

    print("=" * 70)
    print("COMPREHENSIVE CURRICULUM GRAPH INSPECTION")
    print("=" * 70)

    # Load the graph.
    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code -- only inspect graphs generated by our own scraper.
    try:
        with open(graph_file, 'rb') as f:
            graph = pickle.load(f)
    except Exception as e:
        print(f"❌ ERROR: Could not load graph: {e}")
        return

    print(f"\n📊 BASIC STATS:")
    print(f"   Total nodes: {graph.number_of_nodes()}")
    print(f"   Total edges: {graph.number_of_edges()}")

    # 1. CHECK SUBJECT DISTRIBUTION
    print("\n📚 SUBJECT ANALYSIS:")
    subject_counts = defaultdict(int)       # subject code -> number of courses
    courses_by_subject = defaultdict(list)  # subject code -> course IDs (for samples)

    for node, data in graph.nodes(data=True):
        subject = data.get('subject', 'UNKNOWN')
        subject_counts[subject] += 1
        courses_by_subject[subject].append(node)

    # Categorize subjects: CS_RELEVANT must all be present; MAYBE_RELEVANT is
    # tolerated; anything else is flagged as irrelevant and should be removed.
    CS_RELEVANT = {"CS", "DS", "IS", "CY", "MATH", "PHYS", "ENGW", "STAT", "EECE"}
    MAYBE_RELEVANT = {"CHEM", "BIOL", "PSYC", "PHIL", "ECON"}

    print("\n  Relevant CS Subjects:")
    for subj in sorted(CS_RELEVANT):
        count = subject_counts.get(subj, 0)
        if count > 0:
            sample = courses_by_subject[subj][:3]
            print(f"    ✅ {subj:8s}: {count:3d} courses (e.g., {', '.join(sample)})")
        else:
            print(f"    ❌ {subj:8s}: 0 courses - MISSING!")

    print("\n  Irrelevant Subjects (should be removed):")
    irrelevant_found = False
    for subj, count in sorted(subject_counts.items()):
        if subj not in CS_RELEVANT and subj not in MAYBE_RELEVANT and count > 0:
            irrelevant_found = True
            sample = courses_by_subject[subj][:3]
            print(f"    ❌ {subj:8s}: {count:3d} courses (e.g., {', '.join(sample)})")

    if not irrelevant_found:
        print("    ✅ None found - graph is clean!")

    # 2. CHECK CRITICAL COURSES EXISTENCE
    print("\n🎯 CRITICAL COURSES CHECK:")

    # Foundation courses (reported only; absence does not affect the verdict)
    foundation_courses = ["CS1800", "CS2500", "CS2510", "CS2800"]
    print("\n  Foundation Courses:")
    for course in foundation_courses:
        if course in graph:
            data = graph.nodes[course]
            print(f"    ✅ {course}: {data.get('name', 'Unknown')}")
        else:
            print(f"    ❌ {course}: MISSING!")

    # Core CS courses (reported only)
    core_courses = ["CS3000", "CS3500", "CS3650", "CS3700", "CS3200"]
    print("\n  Core CS Courses:")
    for course in core_courses:
        if course in graph:
            data = graph.nodes[course]
            print(f"    ✅ {course}: {data.get('name', 'Unknown')}")
        else:
            print(f"    ❌ {course}: MISSING!")

    # AI/ML concentration courses -- missing ones are collected and fed into
    # the final verdict below.
    ai_ml_courses = ["CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"]
    print("\n  AI/ML Concentration:")
    missing_concentration = []
    for course in ai_ml_courses:
        if course in graph:
            data = graph.nodes[course]
            print(f"    ✅ {course}: {data.get('name', 'Unknown')}")
        else:
            missing_concentration.append(course)
            print(f"    ❌ {course}: MISSING!")

    # 3. CHECK PREREQUISITE CHAINS
    print("\n🔗 PREREQUISITE CHAINS:")

    # (prerequisite, dependent course, human-readable description)
    critical_chains = [
        ("CS1800", "CS2800", "Discrete Structures → Logic"),
        ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
        ("CS2510", "CS3500", "Fundies 2 → OOD"),
        ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
        ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
        ("DS2500", "DS3500", "Intermediate → Advanced")
    ]

    broken_chains = []
    for prereq, course, desc in critical_chains:
        if prereq in graph and course in graph:
            if graph.has_edge(prereq, course):
                print(f"  ✅ {prereq} → {course} ({desc})")
            else:
                broken_chains.append((prereq, course))
                print(f"  ❌ {prereq} → {course} ({desc}) - EDGE MISSING!")
        else:
            # Missing endpoint nodes are warned about but deliberately NOT
            # counted in broken_chains (no edge can exist without the node).
            if prereq not in graph:
                print(f"  ⚠️ {prereq} → {course} - {prereq} doesn't exist")
            if course not in graph:
                print(f"  ⚠️ {prereq} → {course} - {course} doesn't exist")

    # 4. CS2800 SPECIFIC DIAGNOSIS
    print("\n🔍 CS2800 DETAILED ANALYSIS:")

    if "CS2800" in graph:
        cs2800_data = graph.nodes["CS2800"]
        print(f"  ✅ CS2800 exists")
        print(f"     Name: {cs2800_data.get('name', 'Unknown')}")
        print(f"     Subject: {cs2800_data.get('subject', 'Unknown')}")
        print(f"     Credits: {cs2800_data.get('maxCredits', 'Unknown')}")

        # Check prerequisites -- edges point prerequisite -> course, so
        # predecessors of CS2800 are its prereqs.
        prereqs = list(graph.predecessors("CS2800"))
        print(f"     Prerequisites: {prereqs if prereqs else 'NONE (this is wrong!)'}")

        # What it unlocks (only the first five successors are shown)
        unlocks = list(graph.successors("CS2800"))[:5]
        print(f"     Unlocks: {unlocks if unlocks else 'Nothing (suspicious...)'}")

        # Specific CS1800 connection
        if "CS1800" in graph:
            if graph.has_edge("CS1800", "CS2800"):
                print(f"  ✅ CS1800 → CS2800 connection exists")
            else:
                print(f"  ❌ CS1800 → CS2800 connection MISSING!")
    else:
        print(f"  ❌ CS2800 is completely MISSING from the graph!")

    # 5. CHECK FOR DUPLICATE/REDUNDANT COURSES
    print("\n🔄 CHECKING FOR REDUNDANT COURSES:")

    # Variant course numbers that satisfy the same degree requirement.
    calc_variants = ["MATH1341", "MATH1241", "MATH1231", "MATH1340"]
    physics_variants = ["PHYS1151", "PHYS1161", "PHYS1145"]

    print("\n  Calculus variants in graph:")
    calc_found = [c for c in calc_variants if c in graph]
    if len(calc_found) > 1:
        print(f"    ⚠️ Multiple calculus courses found: {calc_found}")
        print(f"    These satisfy the same requirement - graph needs deduplication")
    else:
        # NOTE(review): this branch also fires when zero variants are present.
        print(f"    ✅ Only one variant: {calc_found}")

    print("\n  Physics variants in graph:")
    phys_found = [c for c in physics_variants if c in graph]
    if len(phys_found) > 1:
        print(f"    ⚠️ Multiple physics courses found: {phys_found}")
    else:
        print(f"    ✅ Only one variant: {phys_found}")

    # 6. CHECK FOR LABS/RECITATIONS
    print("\n🧪 CHECKING FOR LABS/RECITATIONS (should be removed):")

    # Substring match on the lowercased course name; may over-match courses
    # whose names merely contain these words (e.g. "Collaborative Lab Work").
    labs_found = []
    for node, data in graph.nodes(data=True):
        name = data.get('name', '').lower()
        if any(word in name for word in ['lab', 'recitation', 'seminar', 'practicum']):
            labs_found.append((node, data.get('name', node)))

    if labs_found:
        print(f"  ❌ Found {len(labs_found)} lab/recitation courses:")
        for course_id, name in labs_found[:5]:
            print(f"    - {course_id}: {name}")
    else:
        print(f"  ✅ No labs/recitations found")

    # 7. CHECK 4000-LEVEL COURSES
    print("\n🎓 4000-LEVEL COURSES:")

    cs4000_courses = [n for n in graph.nodes() if n.startswith("CS4")]
    ds4000_courses = [n for n in graph.nodes() if n.startswith("DS4")]

    print(f"  CS 4000-level: {len(cs4000_courses)} courses")
    if cs4000_courses:
        print(f"    Examples: {', '.join(cs4000_courses[:5])}")
    else:
        print(f"    ❌ NO CS 4000-level courses found!")

    print(f"  DS 4000-level: {len(ds4000_courses)} courses")
    if ds4000_courses:
        print(f"    Examples: {', '.join(ds4000_courses[:5])}")
    else:
        print(f"    ❌ NO DS 4000-level courses found!")

    # FINAL VERDICT -- aggregates the findings collected above.
    print("\n" + "=" * 70)
    print("VERDICT:")
    print("=" * 70)

    issues = []

    if irrelevant_found:
        issues.append("Contains irrelevant subjects (ARTH, FRNH, etc.)")

    if missing_concentration:
        issues.append(f"Missing critical courses: {', '.join(missing_concentration)}")

    if broken_chains:
        issues.append(f"Broken prerequisite chains: {len(broken_chains)}")

    if not cs4000_courses or not ds4000_courses:
        issues.append("Missing 4000-level courses")

    if labs_found:
        issues.append(f"Contains {len(labs_found)} lab/recitation courses")

    if issues:
        print("❌ GRAPH HAS ISSUES:")
        for i, issue in enumerate(issues, 1):
            print(f"  {i}. {issue}")

        print("\n📋 RECOMMENDED ACTIONS:")
        print("1. Re-scrape with more subjects: CS DS IS CY MATH PHYS STAT EECE")
        print("2. Re-run analyzer with stricter filtering")
        print("3. Manually add missing prerequisite edges if needed")
    else:
        print("✅ Graph appears to be clean and complete!")
+
242
def suggest_fix_commands(graph_file):
    """Print the shell commands to repair a broken curriculum graph.

    Args:
        graph_file: Path to the inspected graph pickle. Currently unused;
            the printed commands reference fixed file names instead.

    Returns:
        None. Output goes to stdout.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("FIX COMMANDS:")
    print(banner)

    # (header, command) pairs printed in order.
    steps = [
        ("\n1️⃣ If courses are missing, re-scrape with expanded subjects:",
         "   python neu_scraper.py --term 202510 --subjects CS DS IS CY MATH PHYS STAT EECE --prefix neu_complete"),
        ("\n2️⃣ Clean the new data:",
         "   python curriculum_analyzer.py --graph neu_complete_graph_*.pkl --courses neu_complete_courses_*.pkl --output-graph neu_graph_ultra_clean.pkl"),
        ("\n3️⃣ Test the cleaned data:",
         f"   python {sys.argv[0]} neu_graph_ultra_clean.pkl"),
    ]
    for header, command in steps:
        print(header)
        print(command)
 
258
if __name__ == "__main__":
    # CLI: expects the path to a pickled curriculum graph as the only argument.
    if len(sys.argv) < 2:
        print("Usage: python inspect_graph.py <graph.pkl>")
        print("Example: python inspect_graph.py neu_graph_clean3.pkl")
    else:
        graph_file = sys.argv[1]
        # Run the full inspection, then print the repair recipe.
        inspect_graph_thoroughly(graph_file)
        suggest_fix_commands(graph_file)
+ suggest_fix_commands(graph_file)
src/neu_graph_clean8.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a587cdbcc482e13aff07b62e79a4d1c8732c1ab1cb41f1d699ed6f50148f4db4
3
+ size 244756
src/neu_scraper.py CHANGED
@@ -1,235 +1,236 @@
1
- """
2
- NEU Course Catalog Scraper using SearchNEU GraphQL API (With Proper Pagination)
3
-
4
- Fetches ALL courses for given subjects using first/offset pagination.
5
-
6
- Usage:
7
- python neu_scraper.py --term 202510 --subjects CS DS IS CY --prefix neu_api
8
- """
9
- import requests
10
- import pickle
11
- import networkx as nx
12
- import time
13
- import logging
14
- from typing import List, Dict, Set, Any
15
- from datetime import datetime
16
-
17
- # Configure logging
18
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
19
- logger = logging.getLogger(__name__)
20
-
21
- class NEUGraphQLScraper:
22
- def __init__(self, term_id: str, api_url: str = "https://searchneu.com/graphql"):
23
- self.term_id = term_id
24
- self.api_url = api_url
25
- self.headers = {"Content-Type": "application/json"}
26
- self.courses_data_cache: Dict[str, Dict] = {}
27
- self.all_course_ids: Set[str] = set()
28
- self.graph = nx.DiGraph()
29
-
30
- def get_all_courses_by_subject(self, subject: str, batch_size: int = 100) -> List[Dict]:
31
- """Fetch ALL courses for a specific subject via GraphQL with pagination."""
32
- all_courses = []
33
- offset = 0
34
- page = 1
35
-
36
- while True:
37
- query = """
38
- query searchQuery($termId: String!, $query: String!, $first: Int, $offset: Int) {
39
- search(termId: $termId, query: $query, first: $first, offset: $offset) {
40
- totalCount
41
- nodes {
42
- __typename
43
- ... on ClassOccurrence {
44
- subject
45
- classId
46
- name
47
- desc
48
- prereqs
49
- coreqs
50
- minCredits
51
- maxCredits
52
- }
53
- }
54
- }
55
- }
56
- """
57
- variables = {
58
- "termId": self.term_id,
59
- "query": subject,
60
- "first": batch_size,
61
- "offset": offset
62
- }
63
-
64
- try:
65
- resp = requests.post(self.api_url, json={"query": query, "variables": variables}, headers=self.headers)
66
- resp.raise_for_status()
67
- data = resp.json()
68
-
69
- if "errors" in data:
70
- logger.error(f"GraphQL errors for subject {subject}: {data['errors']}")
71
- break
72
-
73
- search_data = data.get("data", {}).get("search", {})
74
- nodes = search_data.get("nodes", [])
75
-
76
- # Extract ClassOccurrence nodes
77
- page_courses = [c for c in nodes if c.get("__typename") == "ClassOccurrence"]
78
- all_courses.extend(page_courses)
79
-
80
- logger.info(f"Page {page}: Found {len(page_courses)} courses, Total so far: {len(all_courses)}")
81
-
82
- # Check if we've reached the end
83
- if len(page_courses) < batch_size:
84
- break
85
-
86
- offset += batch_size
87
- page += 1
88
-
89
- # Add a small delay to avoid overwhelming the API
90
- time.sleep(0.1)
91
-
92
- except Exception as e:
93
- logger.error(f"Error fetching page {page} for subject {subject}: {e}")
94
- break
95
-
96
- logger.info(f"Total courses found for {subject}: {len(all_courses)}")
97
- return all_courses
98
-
99
- def get_course_data_by_id(self, subject: str, classId: str) -> Dict:
100
- """Fetch a specific course by its subject and classId."""
101
- query = """
102
- query searchQuery($termId: String!, $query: String!) {
103
- search(termId: $termId, query: $query) {
104
- nodes {
105
- __typename
106
- ... on ClassOccurrence {
107
- subject
108
- classId
109
- name
110
- desc
111
- prereqs
112
- coreqs
113
- minCredits
114
- maxCredits
115
- }
116
- }
117
- }
118
- }
119
- """
120
- variables = {"termId": self.term_id, "query": f"{subject}{classId}"}
121
- try:
122
- resp = requests.post(self.api_url, json={"query": query, "variables": variables}, headers=self.headers)
123
- resp.raise_for_status()
124
- data = resp.json()
125
-
126
- nodes = data.get("data", {}).get("search", {}).get("nodes", [])
127
- for c in nodes:
128
- if c.get("subject") == subject and c.get("classId") == classId:
129
- return c
130
- return {}
131
- except Exception as e:
132
- logger.error(f"Error fetching course {subject}{classId}: {e}")
133
- return {}
134
-
135
- def _recursive_parse_prereqs(self, prereq_obj: Any) -> Set[str]:
136
- """Extract course IDs from nested prereq/coreq structures."""
137
- ids = set()
138
- if not isinstance(prereq_obj, dict):
139
- return ids
140
-
141
- # Handle direct course references (the actual structure we see)
142
- if "classId" in prereq_obj and "subject" in prereq_obj:
143
- ids.add(f"{prereq_obj['subject']}{prereq_obj['classId']}")
144
- return ids
145
-
146
- # Handle logical operators (and/or) with nested values
147
- if prereq_obj.get("type") in ["and", "or"]:
148
- for val in prereq_obj.get("values", []):
149
- ids |= self._recursive_parse_prereqs(val)
150
-
151
- # Handle nested values in other structures
152
- elif "values" in prereq_obj:
153
- for val in prereq_obj.get("values", []):
154
- ids |= self._recursive_parse_prereqs(val)
155
-
156
- return ids
157
-
158
- def scrape_full_catalog(self, subjects: List[str]):
159
- """Scrape all courses for the given subjects."""
160
- logger.info(f"Fetching complete catalog for subjects: {subjects}")
161
-
162
- all_courses = []
163
- for subject in subjects:
164
- logger.info(f"Fetching courses for subject: {subject}")
165
- courses = self.get_all_courses_by_subject(subject)
166
- all_courses.extend(courses)
167
-
168
- # Add a small delay to be respectful to the API
169
- time.sleep(0.5)
170
-
171
- # Cache all courses
172
- for c in all_courses:
173
- cid = f"{c['subject']}{c['classId']}"
174
- self.courses_data_cache[cid] = c
175
- self.all_course_ids.add(cid)
176
-
177
- logger.info(f"Discovered {len(all_courses)} total courses in catalog")
178
-
179
- def build_graph(self):
180
- """Build NetworkX graph from scraped course data and requisites."""
181
- logger.info("Building course graph")
182
-
183
- # Add all courses as nodes
184
- for cid, cdata in self.courses_data_cache.items():
185
- self.graph.add_node(cid, **{
186
- "name": cdata.get("name", ""),
187
- "subject": cdata.get("subject", ""),
188
- "classId": cdata.get("classId", ""),
189
- "description": cdata.get("desc", ""), # Corrected from 'desc'
190
- "minCredits": cdata.get("minCredits", 0),
191
- "maxCredits": cdata.get("maxCredits", 0)
192
- })
193
-
194
- # Add edges ONLY for prerequisites
195
- for cid, cdata in self.courses_data_cache.items():
196
- prereqs = cdata.get("prereqs", {})
197
- if prereqs:
198
- prereq_ids = self._recursive_parse_prereqs(prereqs)
199
- for pid in prereq_ids:
200
- if pid in self.graph:
201
- self.graph.add_edge(pid, cid, relationship="prerequisite")
202
-
203
- def save_data(self, prefix: str):
204
- """Save graph and courses to pickle files with timestamp."""
205
- ts = datetime.now().strftime("%Y%m%d_%H%M%S")
206
- gfile = f"{prefix}_graph_{ts}.pkl"
207
- cfile = f"{prefix}_courses_{ts}.pkl"
208
-
209
- with open(gfile, "wb") as gf:
210
- pickle.dump(self.graph, gf)
211
- with open(cfile, "wb") as cf:
212
- pickle.dump(self.courses_data_cache, cf)
213
-
214
- logger.info(f"Data saved: {gfile}, {cfile}")
215
-
216
- # Also save some stats
217
- logger.info(f"Graph stats: {self.graph.number_of_nodes()} nodes, {self.graph.number_of_edges()} edges")
218
-
219
- def main():
220
- import argparse
221
- parser = argparse.ArgumentParser(description="Full NEU API Catalog Scraper")
222
- parser.add_argument("--term", required=True, help="Term ID e.g. 202510")
223
- parser.add_argument("--subjects", nargs="+", required=True, help="Subjects to scrape (e.g., CS DS IS CY)")
224
- parser.add_argument("--prefix", default="neu_api", help="Output prefix")
225
- parser.add_argument("--batch-size", type=int, default=100, help="Number of courses per page")
226
- args = parser.parse_args()
227
-
228
- scraper = NEUGraphQLScraper(term_id=args.term)
229
- scraper.scrape_full_catalog(args.subjects)
230
- scraper.build_graph()
231
- scraper.save_data(args.prefix)
232
- logger.info("Scraping complete.")
233
-
234
- if __name__ == "__main__":
 
235
  main()
 
1
+ """
2
+ Multi-Term NEU Course Scraper - Merges data from multiple terms
3
+ Fixes: Missing courses by scraping Fall/Spring/Summer catalogs
4
+ """
5
+ import requests
6
+ import pickle
7
+ import networkx as nx
8
+ import time
9
+ import logging
10
+ from typing import List, Dict, Set, Any
11
+ from datetime import datetime
12
+ from collections import defaultdict
13
+
14
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
15
+ logger = logging.getLogger(__name__)
16
+
17
class MultiTermScraper:
    """Scrapes the SearchNEU GraphQL API for several term IDs and merges the
    results into one deduplicated course catalog plus a prerequisite DiGraph.

    Courses are keyed by course ID (subject + classId, e.g. "CS2500"); a
    course listed in multiple terms appears once in the merged output (see
    scrape_all_terms for the exact merge policy).
    """

    def __init__(self, term_ids: List[str], api_url: str = "https://searchneu.com/graphql"):
        # term_ids: term codes such as "202510"; scraped in the given order.
        self.term_ids = term_ids
        self.api_url = api_url
        self.headers = {"Content-Type": "application/json"}
        self.merged_courses: Dict[str, Dict] = {}  # cid -> course data
        self.graph = nx.DiGraph()  # edge direction: prerequisite -> course

    def get_all_courses_by_subject(self, term_id: str, subject: str, batch_size: int = 100) -> List[Dict]:
        """Fetch ALL courses for a specific subject/term via pagination.

        Issues repeated GraphQL `search` queries using first/offset paging
        until a short page (fewer than *batch_size* results) signals the end.

        Args:
            term_id: Term code, e.g. "202510".
            subject: Subject prefix used as the search query, e.g. "CS".
            batch_size: Page size for the first/offset pagination.

        Returns:
            List of raw ClassOccurrence dicts (may be empty on error; any
            request or GraphQL error aborts pagination and returns what was
            collected so far).
        """
        all_courses = []
        offset = 0
        page = 1

        while True:
            query = """
            query searchQuery($termId: String!, $query: String!, $first: Int, $offset: Int) {
              search(termId: $termId, query: $query, first: $first, offset: $offset) {
                totalCount
                nodes {
                  __typename
                  ... on ClassOccurrence {
                    subject
                    classId
                    name
                    desc
                    prereqs
                    coreqs
                    minCredits
                    maxCredits
                  }
                }
              }
            }
            """
            variables = {
                "termId": term_id,
                "query": subject,
                "first": batch_size,
                "offset": offset
            }

            try:
                # timeout keeps a hung server from blocking the whole scrape.
                resp = requests.post(self.api_url, json={"query": query, "variables": variables}, headers=self.headers, timeout=10)
                resp.raise_for_status()
                data = resp.json()

                if "errors" in data:
                    logger.error(f"GraphQL errors for {term_id}/{subject}: {data['errors']}")
                    break

                search_data = data.get("data", {}).get("search", {})
                nodes = search_data.get("nodes", [])
                # The search can also return other typenames; keep only courses.
                page_courses = [c for c in nodes if c.get("__typename") == "ClassOccurrence"]
                all_courses.extend(page_courses)

                logger.info(f"[{term_id}] {subject} Page {page}: {len(page_courses)} courses (Total: {len(all_courses)})")

                # A short page means we've consumed the final page.
                if len(page_courses) < batch_size:
                    break

                offset += batch_size
                page += 1
                time.sleep(0.1)  # be polite to the API between pages

            except Exception as e:
                logger.error(f"Error fetching {term_id}/{subject} page {page}: {e}")
                break

        logger.info(f"[{term_id}] {subject}: {len(all_courses)} total courses")
        return all_courses

    def _recursive_parse_prereqs(self, prereq_obj: Any) -> Set[str]:
        """Extract course IDs from nested prereq structures.

        Walks the SearchNEU requisite tree: leaf dicts carry subject/classId,
        "and"/"or" nodes (and any other dict with a "values" list) nest
        further requisites. Non-dict inputs yield an empty set.

        Returns:
            Set of course IDs like "CS2500" found anywhere in the tree.
        """
        ids = set()
        if not isinstance(prereq_obj, dict):
            return ids

        # Leaf node: a direct course reference.
        if "classId" in prereq_obj and "subject" in prereq_obj:
            ids.add(f"{prereq_obj['subject']}{prereq_obj['classId']}")
            return ids

        # Logical operator node ("and"/"or"): recurse into all children.
        # NOTE: and/or semantics are flattened -- every mentioned course
        # becomes an edge, even alternatives in an "or" group.
        if prereq_obj.get("type") in ["and", "or"]:
            for val in prereq_obj.get("values", []):
                ids |= self._recursive_parse_prereqs(val)

        # Fallback: any other dict carrying a "values" list.
        elif "values" in prereq_obj:
            for val in prereq_obj.get("values", []):
                ids |= self._recursive_parse_prereqs(val)

        return ids

    def scrape_all_terms(self, subjects: List[str]):
        """Scrape courses from all terms and merge by course ID.

        Populates self.merged_courses. Merge policy: the FIRST term (in
        self.term_ids order) that lists a course wins; a later term's entry
        only replaces it when the kept entry has an empty description and
        the later one does not.
        """
        term_data = defaultdict(lambda: defaultdict(list))  # term_id -> subject -> courses

        for term_id in self.term_ids:
            logger.info(f"\n{'='*70}")
            logger.info(f"SCRAPING TERM: {term_id}")
            logger.info(f"{'='*70}")

            for subject in subjects:
                courses = self.get_all_courses_by_subject(term_id, subject)
                term_data[term_id][subject] = courses
                time.sleep(0.5)  # throttle between subjects

        # Merge courses across terms (first occurrence wins; see docstring).
        for term_id in self.term_ids:
            for subject in subjects:
                for course in term_data[term_id][subject]:
                    cid = f"{course['subject']}{course['classId']}"

                    # First sighting of this course ID: take it as-is.
                    if cid not in self.merged_courses:
                        self.merged_courses[cid] = course
                        logger.debug(f"Added {cid} from {term_id}")
                    else:
                        # Replace only when the later entry fills in a
                        # missing description.
                        existing = self.merged_courses[cid]
                        if not existing.get('desc') and course.get('desc'):
                            self.merged_courses[cid] = course
                            logger.debug(f"Updated {cid} from {term_id} (better description)")

        logger.info(f"\n{'='*70}")
        logger.info(f"MERGE COMPLETE: {len(self.merged_courses)} unique courses")
        logger.info(f"{'='*70}")

        # Log subject breakdown
        subject_counts = defaultdict(int)
        for cid in self.merged_courses:
            subject = self.merged_courses[cid].get('subject', 'UNKNOWN')
            subject_counts[subject] += 1

        logger.info("\nSubject breakdown:")
        for subject in sorted(subject_counts.keys()):
            logger.info(f"  {subject}: {subject_counts[subject]} courses")

    def build_graph(self):
        """Build NetworkX graph from merged course data.

        Adds every merged course as a node (with name/subject/credits
        attributes) and one edge per resolvable prerequisite, directed
        prerequisite -> dependent course. Prereqs referencing courses that
        were never scraped are logged and skipped.
        """
        logger.info("\nBuilding course dependency graph...")

        # Add all courses as nodes
        for cid, cdata in self.merged_courses.items():
            self.graph.add_node(cid, **{
                "name": cdata.get("name", ""),
                "subject": cdata.get("subject", ""),
                "classId": cdata.get("classId", ""),
                "description": cdata.get("desc", ""),
                "minCredits": cdata.get("minCredits", 0),
                "maxCredits": cdata.get("maxCredits", 0)
            })

        # Add prerequisite edges
        edge_count = 0
        for cid, cdata in self.merged_courses.items():
            prereqs = cdata.get("prereqs", {})
            if prereqs:
                prereq_ids = self._recursive_parse_prereqs(prereqs)
                for pid in prereq_ids:
                    if pid in self.graph:
                        self.graph.add_edge(pid, cid, relationship="prerequisite")
                        edge_count += 1
                    else:
                        # Prereq points at a course outside the scraped subjects.
                        logger.warning(f"Prerequisite {pid} for {cid} not in graph")

        logger.info(f"Graph built: {self.graph.number_of_nodes()} nodes, {edge_count} edges")

    def save_data(self, prefix: str):
        """Save merged graph and courses.

        Writes three timestamped files next to the working directory:
        <prefix>_graph_<ts>.pkl (pickled DiGraph),
        <prefix>_courses_<ts>.pkl (pickled course dict), and
        <prefix>_merge_report_<ts>.txt (human-readable summary).
        """
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        gfile = f"{prefix}_graph_{ts}.pkl"
        cfile = f"{prefix}_courses_{ts}.pkl"

        with open(gfile, "wb") as gf:
            pickle.dump(self.graph, gf)
        with open(cfile, "wb") as cf:
            pickle.dump(self.merged_courses, cf)

        logger.info(f"\nData saved:")
        logger.info(f"  Graph: {gfile}")
        logger.info(f"  Courses: {cfile}")

        # Save merge report
        report_file = f"{prefix}_merge_report_{ts}.txt"
        with open(report_file, "w") as rf:
            rf.write(f"Multi-Term Scrape Report\n")
            rf.write(f"{'='*70}\n\n")
            rf.write(f"Terms scraped: {', '.join(self.term_ids)}\n")
            rf.write(f"Total unique courses: {len(self.merged_courses)}\n")
            rf.write(f"Total edges: {self.graph.number_of_edges()}\n\n")

            rf.write("Subject breakdown:\n")
            subject_counts = defaultdict(int)
            for cid in self.merged_courses:
                subject = self.merged_courses[cid].get('subject', 'UNKNOWN')
                subject_counts[subject] += 1

            for subject in sorted(subject_counts.keys()):
                rf.write(f"  {subject}: {subject_counts[subject]}\n")

        logger.info(f"  Report: {report_file}")
218
+
219
def main():
    """CLI entry point: scrape every requested term, merge the catalogs,
    build the prerequisite graph, and persist the results.

    NOTE(review): --batch-size is parsed but never forwarded to the
    scraper; get_all_courses_by_subject always uses its own default.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Multi-Term NEU Catalog Scraper")
    # Declare CLI options data-style, then register them in one pass.
    option_specs = [
        ("--terms", {"nargs": "+", "required": True, "help": "Term IDs (e.g., 202510 202520 202530)"}),
        ("--subjects", {"nargs": "+", "required": True, "help": "Subjects (e.g., CS DS STAT)"}),
        ("--prefix", {"default": "neu_merged", "help": "Output prefix"}),
        ("--batch-size", {"type": int, "default": 100, "help": "Courses per page"}),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    # Scrape -> merge -> graph -> persist.
    scraper = MultiTermScraper(term_ids=args.terms)
    scraper.scrape_all_terms(args.subjects)
    scraper.build_graph()
    scraper.save_data(args.prefix)

    logger.info("\n✅ Multi-term scraping complete!")
234
+
235
+ if __name__ == "__main__":
236
  main()