ckharche commited on
Commit
9f3c8c1
·
verified ·
1 Parent(s): 722bc5e

Upload 6 files

Browse files
src/agentic_optimizer.py CHANGED
@@ -1,413 +1,442 @@
1
  """
2
- Agentic Curriculum Optimizer
3
- Runs 100% locally, no API costs
4
- """
5
 
 
 
 
 
 
6
  import json
7
- import sqlite3
 
8
  import networkx as nx
9
- import numpy as np
10
- from dataclasses import dataclass, asdict
11
- from typing import Dict, List, Tuple, Optional
12
  from datetime import datetime
13
- import pickle
14
  import torch
15
- from sentence_transformers import SentenceTransformer
16
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
17
- import schedule
18
- import time
19
-
20
- @dataclass
21
- class StudentProfile:
22
- student_id: str
23
- completed_courses: List[str]
24
- current_gpa: float
25
- interests: List[str]
26
- career_goals: str
27
- learning_style: str
28
- time_commitment: int
29
- preferred_difficulty: str
30
 
31
  @dataclass
32
- class PlanFeedback:
33
- student_id: str
34
- plan_id: str
35
- timestamp: datetime
36
- actual_gpa: float
37
- difficulty_rating: int # 1-5
38
- satisfaction: int # 1-5
39
- completed_courses: List[str]
40
- dropped_courses: List[str]
41
 
42
- class CurriculumAgent:
43
  """
44
- Autonomous agent that:
45
- 1. Monitors student progress
46
- 2. Adapts recommendations based on feedback
47
- 3. Proactively suggests adjustments
48
- 4. Learns from outcomes
49
  """
50
 
51
- def __init__(self, db_path="curriculum_agent.db"):
52
- self.db_path = db_path
53
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
54
-
55
- # Models (local, no API)
56
- self.embedder = SentenceTransformer('all-MiniLM-L6-v2') # Smaller for local
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  self.graph = None
58
  self.courses = {}
 
59
 
60
- # Initialize database for memory
61
- self._init_database()
62
-
63
- # Agent state
64
- self.active_plans = {}
65
- self.feedback_history = []
66
-
67
- def _init_database(self):
68
- """Create tables for agent memory"""
69
- conn = sqlite3.connect(self.db_path)
70
- c = conn.cursor()
71
-
72
- # Student profiles
73
- c.execute('''CREATE TABLE IF NOT EXISTS students
74
- (id TEXT PRIMARY KEY,
75
- profile TEXT,
76
- created_at TIMESTAMP)''')
77
-
78
- # Generated plans
79
- c.execute('''CREATE TABLE IF NOT EXISTS plans
80
- (id TEXT PRIMARY KEY,
81
- student_id TEXT,
82
- plan_data TEXT,
83
- created_at TIMESTAMP,
84
- performance_score REAL)''')
85
-
86
- # Feedback for learning
87
- c.execute('''CREATE TABLE IF NOT EXISTS feedback
88
- (id INTEGER PRIMARY KEY AUTOINCREMENT,
89
- plan_id TEXT,
90
- student_id TEXT,
91
- feedback_data TEXT,
92
- timestamp TIMESTAMP)''')
93
-
94
- # Agent learning patterns
95
- c.execute('''CREATE TABLE IF NOT EXISTS patterns
96
- (id INTEGER PRIMARY KEY AUTOINCREMENT,
97
- pattern_type TEXT,
98
- pattern_data TEXT,
99
- success_rate REAL,
100
- discovered_at TIMESTAMP)''')
101
-
102
- conn.commit()
103
- conn.close()
104
 
105
- def perceive(self) -> Dict:
106
- """
107
- PERCEPTION: Gather information about environment
108
- """
109
- perceptions = {
110
- "active_students": self._get_active_students(),
111
- "recent_feedback": self._get_recent_feedback(),
112
- "course_updates": self._check_course_updates(),
113
- "success_patterns": self._analyze_success_patterns()
114
- }
115
- return perceptions
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- def decide(self, perceptions: Dict) -> List[Dict]:
118
- """
119
- DECISION: Determine what actions to take
120
- """
121
- decisions = []
122
-
123
- # Decision 1: Which students need plan updates?
124
- for student_id in perceptions["active_students"]:
125
- if self._needs_plan_update(student_id, perceptions):
126
- decisions.append({
127
- "action": "update_plan",
128
- "student_id": student_id,
129
- "reason": "Poor performance feedback"
130
- })
131
-
132
- # Decision 2: Identify at-risk students
133
- at_risk = self._identify_at_risk_students(perceptions["recent_feedback"])
134
- for student_id in at_risk:
135
- decisions.append({
136
- "action": "intervention",
137
- "student_id": student_id,
138
- "reason": "Risk of dropping out"
139
- })
140
-
141
- # Decision 3: Optimize based on patterns
142
- if perceptions["success_patterns"]:
143
- decisions.append({
144
- "action": "update_algorithm",
145
- "patterns": perceptions["success_patterns"]
146
- })
147
-
148
- return decisions
149
 
150
- def act(self, decisions: List[Dict]) -> List[Dict]:
151
- """
152
- ACTION: Execute decisions
153
- """
154
- results = []
155
-
156
- for decision in decisions:
157
- if decision["action"] == "update_plan":
158
- new_plan = self._regenerate_plan(decision["student_id"])
159
- results.append({
160
- "action": "plan_updated",
161
- "student_id": decision["student_id"],
162
- "plan": new_plan
163
- })
164
-
165
- elif decision["action"] == "intervention":
166
- intervention = self._create_intervention(decision["student_id"])
167
- results.append({
168
- "action": "intervention_created",
169
- "student_id": decision["student_id"],
170
- "intervention": intervention
171
- })
172
 
173
- elif decision["action"] == "update_algorithm":
174
- self._update_planning_algorithm(decision["patterns"])
175
- results.append({
176
- "action": "algorithm_updated",
177
- "patterns_applied": len(decision["patterns"])
178
- })
179
-
180
- return results
 
 
 
 
 
 
 
 
 
181
 
182
- def learn(self, results: List[Dict]):
183
- """
184
- LEARNING: Update knowledge based on outcomes
185
- """
186
- conn = sqlite3.connect(self.db_path)
187
- c = conn.cursor()
188
-
189
- for result in results:
190
- if result["action"] == "plan_updated":
191
- # Track plan performance
192
- self._track_plan_performance(result["student_id"], result["plan"])
193
 
194
- elif result["action"] == "intervention_created":
195
- # Monitor intervention effectiveness
196
- self._monitor_intervention(result["student_id"], result["intervention"])
197
-
198
- # Discover new patterns
199
- patterns = self._discover_patterns()
200
- for pattern in patterns:
201
- c.execute("INSERT INTO patterns (pattern_type, pattern_data, success_rate, discovered_at) VALUES (?, ?, ?, ?)",
202
- (pattern["type"], json.dumps(pattern["data"]), pattern["success_rate"], datetime.now()))
203
-
204
- conn.commit()
205
- conn.close()
206
-
207
- def run_autonomous_cycle(self):
208
- """
209
- Main agent loop - runs continuously
210
- """
211
- while True:
212
- print(f"\n[{datetime.now()}] Agent Cycle Starting...")
213
-
214
- # 1. PERCEIVE
215
- perceptions = self.perceive()
216
- print(f"Perceptions: {len(perceptions['active_students'])} active students")
217
-
218
- # 2. DECIDE
219
- decisions = self.decide(perceptions)
220
- print(f"Decisions: {len(decisions)} actions to take")
221
-
222
- # 3. ACT
223
- results = self.act(decisions)
224
- print(f"Results: {len(results)} actions completed")
225
-
226
- # 4. LEARN
227
- self.learn(results)
228
- print("Learning cycle complete")
229
-
230
- # Wait before next cycle (in production, this could be daily)
231
- time.sleep(60) # Run every minute for demo
232
-
233
- # --- Helper Methods ---
234
-
235
- def _get_active_students(self) -> List[str]:
236
- """Get list of active students"""
237
- conn = sqlite3.connect(self.db_path)
238
- c = conn.cursor()
239
- c.execute("SELECT id FROM students")
240
- students = [row[0] for row in c.fetchall()]
241
- conn.close()
242
- return students
243
-
244
- def _get_recent_feedback(self) -> List[Dict]:
245
- """Get recent feedback"""
246
- conn = sqlite3.connect(self.db_path)
247
- c = conn.cursor()
248
- c.execute("SELECT feedback_data FROM feedback ORDER BY timestamp DESC LIMIT 10")
249
- feedback = [json.loads(row[0]) for row in c.fetchall()]
250
- conn.close()
251
- return feedback
252
-
253
- def _check_course_updates(self) -> Dict:
254
- """Check for course changes (mock for demo)"""
255
- return {"updated_courses": [], "new_prerequisites": {}}
256
-
257
- def _analyze_success_patterns(self) -> List[Dict]:
258
- """Identify successful patterns"""
259
- conn = sqlite3.connect(self.db_path)
260
- c = conn.cursor()
261
- c.execute("SELECT pattern_data, success_rate FROM patterns WHERE success_rate > 0.7")
262
- patterns = [{"data": json.loads(row[0]), "success_rate": row[1]} for row in c.fetchall()]
263
- conn.close()
264
- return patterns
265
-
266
- def _needs_plan_update(self, student_id: str, perceptions: Dict) -> bool:
267
- """Determine if student needs plan update"""
268
- # Check if recent feedback shows issues
269
- for feedback in perceptions["recent_feedback"]:
270
- if feedback.get("student_id") == student_id:
271
- if feedback.get("satisfaction", 5) < 3:
272
- return True
273
- return False
274
-
275
- def _identify_at_risk_students(self, feedback: List[Dict]) -> List[str]:
276
- """Identify students at risk"""
277
- at_risk = []
278
- for fb in feedback:
279
- if fb.get("difficulty_rating", 0) > 4 or fb.get("dropped_courses", []):
280
- at_risk.append(fb.get("student_id"))
281
- return at_risk
282
-
283
- def _regenerate_plan(self, student_id: str) -> Dict:
284
- """Generate new plan for student"""
285
- # This would use your existing optimizer
286
- return {"plan": "new_optimized_plan", "adjustments": ["reduced_difficulty"]}
287
 
288
- def _create_intervention(self, student_id: str) -> Dict:
289
- """Create intervention plan"""
290
- return {
291
- "type": "academic_support",
292
- "recommendations": ["tutoring", "reduced_courseload", "advisor_meeting"]
 
 
 
293
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
- def _update_planning_algorithm(self, patterns: List[Dict]):
296
- """Update planning based on learned patterns"""
297
- # This would adjust your optimizer's weights/rules
298
- print(f"Updating algorithm with {len(patterns)} patterns")
299
-
300
- def _track_plan_performance(self, student_id: str, plan: Dict):
301
- """Track how well plans perform"""
302
- conn = sqlite3.connect(self.db_path)
303
- c = conn.cursor()
304
- c.execute("UPDATE plans SET performance_score = ? WHERE student_id = ?",
305
- (0.0, student_id)) # Would calculate actual score
306
- conn.commit()
307
- conn.close()
308
-
309
- def _monitor_intervention(self, student_id: str, intervention: Dict):
310
- """Monitor intervention effectiveness"""
311
- print(f"Monitoring intervention for {student_id}")
312
-
313
- def _discover_patterns(self) -> List[Dict]:
314
- """Discover new patterns from data"""
315
- # Example: Find that students who take CS2500 before CS2510 do better
316
- patterns = []
317
-
318
- # Analyze database for patterns
319
- conn = sqlite3.connect(self.db_path)
320
- c = conn.cursor()
321
-
322
- # Example pattern discovery
323
- c.execute("""
324
- SELECT COUNT(*) FROM feedback
325
- WHERE feedback_data LIKE '%CS2500%CS2510%'
326
- AND json_extract(feedback_data, '$.satisfaction') > 4
327
- """)
328
-
329
- result = c.fetchone()
330
- if result and result[0] > 5: # If pattern appears frequently
331
- patterns.append({
332
- "type": "course_sequence",
333
- "data": {"sequence": ["CS2500", "CS2510"]},
334
- "success_rate": 0.85
335
- })
336
-
337
- conn.close()
338
- return patterns
339
 
 
 
340
 
341
- class LocalAgentRunner:
342
- """
343
- Manages the agent without external dependencies
344
- """
345
-
346
- def __init__(self, curriculum_data_path: str):
347
- self.agent = CurriculumAgent()
 
 
 
 
348
 
349
- # Load curriculum data
350
- with open(curriculum_data_path, 'rb') as f:
351
- graph = pickle.load(f)
352
- self.agent.graph = graph
353
- self.agent.courses = dict(graph.nodes(data=True))
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
- def add_student(self, profile: StudentProfile) -> str:
356
- """Add a student to track"""
357
- conn = sqlite3.connect(self.agent.db_path)
358
- c = conn.cursor()
359
 
360
- student_id = f"STU_{datetime.now().timestamp()}"
361
- c.execute("INSERT INTO students (id, profile, created_at) VALUES (?, ?, ?)",
362
- (student_id, json.dumps(asdict(profile)), datetime.now()))
 
 
363
 
364
- conn.commit()
365
- conn.close()
366
 
367
- return student_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
- def submit_feedback(self, feedback: PlanFeedback):
370
- """Submit feedback for learning"""
371
- conn = sqlite3.connect(self.agent.db_path)
372
- c = conn.cursor()
373
 
374
- c.execute("INSERT INTO feedback (plan_id, student_id, feedback_data, timestamp) VALUES (?, ?, ?, ?)",
375
- (feedback.plan_id, feedback.student_id, json.dumps(asdict(feedback)), feedback.timestamp))
 
 
 
 
 
 
 
376
 
377
- conn.commit()
378
- conn.close()
 
 
 
 
 
 
 
 
 
379
 
380
- def start_agent(self):
381
- """Start the autonomous agent"""
382
- print("Starting Curriculum Agent...")
383
- print("Agent will monitor students and adapt plans automatically")
384
- print("Press Ctrl+C to stop")
385
 
386
- try:
387
- self.agent.run_autonomous_cycle()
388
- except KeyboardInterrupt:
389
- print("\nAgent stopped")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
 
392
- # Example usage
393
- if __name__ == "__main__":
394
- # Initialize agent
395
- runner = LocalAgentRunner("neu_graph_analyzed_clean.pkl")
 
 
 
396
 
397
- # Add a test student
398
- student = StudentProfile(
399
- student_id="test_001",
400
- completed_courses=["CS1800", "CS2500"],
401
- current_gpa=3.5,
402
- interests=["AI", "Machine Learning"],
403
- career_goals="ML Engineer",
404
- learning_style="Visual",
405
- time_commitment=40,
406
- preferred_difficulty="moderate"
407
- )
408
 
409
- student_id = runner.add_student(student)
410
- print(f"Added student: {student_id}")
 
 
411
 
412
- # Start autonomous agent
413
- runner.start_agent()
 
 
 
 
 
 
 
1
  """
2
+ Agentic Curriculum Optimizer - Autonomous Graph Validator & Fixer
3
+ Detects missing courses, suggests replacements, and directly patches the graph.
 
4
 
5
+ Usage:
6
+ python agentic_optimizer.py --graph neu_graph_clean6.pkl --validate
7
+ python agentic_optimizer.py --graph neu_graph_clean6.pkl --fix --output neu_graph_fixed.pkl
8
+ """
9
+ import pickle
10
  import json
11
+ import re
12
+ import argparse
13
  import networkx as nx
14
+ from typing import Dict, Set, List, Tuple, Optional
 
 
15
  from datetime import datetime
16
+ from dataclasses import dataclass, asdict
17
  import torch
 
18
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
@dataclass
class CourseChange:
    """Detected change in course catalog.

    Attributes:
        old_code: Course code referenced by CONCENTRATION_REQUIREMENTS.
        new_code: Replacement code once one is found (None until then).
        status: Lifecycle of the change — "missing", "renamed", "moved",
            "deprecated", or "manual_add" (will be patched into the graph).
        replacement_suggestion: Free-form suggested replacement, if any.
        confidence: Confidence in the suggested replacement, 0.0–1.0.
        evidence: Human-readable justification for the suggestion.
    """
    old_code: str
    # FIXED: fields that default to None are Optional, not plain str.
    new_code: Optional[str] = None
    status: str = "missing"  # missing, renamed, moved, deprecated
    replacement_suggestion: Optional[str] = None
    confidence: float = 0.0
    evidence: str = ""
 
29
 
30
class AgenticOptimizer:
    """
    Autonomous agent that validates requirements AND fixes graph automatically
    """

    # Degree requirements, kept in sync with curriculum_optimizer.py.
    CONCENTRATION_REQUIREMENTS = {
        "ai_ml": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"],
            },
            "core": {
                "required": ["CS3000", "CS3500"],
                "pick_1_from": ["CS3200", "CS3650", "CS5700"]  # FIXED: CS3700 → CS5700
            },
            "concentration_specific": {
                "required": ["CS4100", "DS4400"],
                "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
                "pick_1_systems": ["CS4730", "CS4700"]  # REMOVED: CS4750 (doesn't exist)
            },
            "math": {
                "required": ["MATH1341", "MATH1342"],
                "pick_1_from": ["MATH2331", "MATH3081"]  # REMOVED: STAT3150
            }
        },
        "systems": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
            },
            "core": {
                "required": ["CS3000", "CS3500", "CS3650"],
                "pick_1_from": ["CS5700", "CS3200"]  # FIXED: CS3700 → CS5700
            },
            "concentration_specific": {
                "required": ["CS4700"],
                "pick_2_from": ["CS4730"],  # REMOVED: CS4750, CS4770
                "pick_1_from": ["CS4400", "CS4500", "CS4520"]
            },
            "math": {
                "required": ["MATH1341", "MATH1342"]
            }
        },
        "security": {
            "foundations": {
                "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
            },
            "core": {
                "required": ["CS3000", "CS3650", "CY2550"],
                "pick_1_from": ["CS5700", "CS3500"]  # FIXED: CS3700 → CS5700
            },
            "concentration_specific": {
                "required": ["CY3740"],
                "pick_2_from": ["CY4740", "CY4760", "CY4770"],  # CY4770 (moved from CS)
                "pick_1_from": ["CS4700", "CS4730"]
            },
            "math": {
                "required": ["MATH1342"],
                "pick_1_from": ["MATH3527", "MATH3081"]
            }
        }
    }

    # Hand-maintained records for courses the scraper never returns;
    # fix_graph() patches these straight into the graph.
    MANUAL_COURSES = {
        "CS5700": {
            "name": "Fundamentals of Networks",
            "subject": "CS",
            "classId": "5700",
            "description": "Networks and distributed systems (grad level, no prereqs)",
            "minCredits": 4,
            "maxCredits": 4,
            "prerequisites": []  # Open to undergrads
        },
        "CY4770": {
            "name": "Foundations of Cryptography",
            "subject": "CY",
            "classId": "4770",
            "description": "Mathematical cryptography (moved from CS dept)",
            "minCredits": 4,
            "maxCredits": 4,
            "prerequisites": ["CS3000"]  # Simplified prereq
        }
    }
113
+
114
+ def __init__(self, graph_path: str, use_llm: bool = True):
115
+ self.graph_path = graph_path
116
+ self.use_llm = use_llm
117
  self.graph = None
118
  self.courses = {}
119
+ self.changes = []
120
 
121
+ # Load LLM if needed
122
+ self.llm = None
123
+ self.tokenizer = None
124
+ if use_llm:
125
+ self._load_llm()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
+ def _load_llm(self):
128
+ """Load local LLM for intelligent validation"""
129
+ print("🤖 Loading LLM for catalog analysis...")
130
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
131
+
132
+ if device.type == 'cuda':
133
+ model_name = "meta-llama/Llama-3.1-8B-Instruct"
134
+ quant_config = BitsAndBytesConfig(
135
+ load_in_4bit=True,
136
+ bnb_4bit_quant_type="nf4",
137
+ bnb_4bit_compute_dtype=torch.bfloat16
138
+ )
139
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
140
+ self.tokenizer.pad_token = self.tokenizer.eos_token
141
+ self.llm = AutoModelForCausalLM.from_pretrained(
142
+ model_name,
143
+ quantization_config=quant_config,
144
+ device_map="auto"
145
+ )
146
+ print("✅ LLM loaded")
147
+ else:
148
+ print("⚠️ No GPU available, LLM disabled")
149
+ self.use_llm = False
150
 
151
+ def load_graph(self):
152
+ """Load curriculum graph"""
153
+ print(f"📚 Loading graph: {self.graph_path}")
154
+ with open(self.graph_path, 'rb') as f:
155
+ self.graph = pickle.load(f)
156
+ self.courses = dict(self.graph.nodes(data=True))
157
+ print(f"✅ Loaded {len(self.courses)} courses")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
+ def validate_requirements(self) -> Dict[str, List[CourseChange]]:
160
+ """Check which required courses are missing from graph"""
161
+ print("\n🔍 Validating CONCENTRATION_REQUIREMENTS against graph...")
162
+
163
+ track_changes = {}
164
+
165
+ for track, track_reqs in self.CONCENTRATION_REQUIREMENTS.items():
166
+ print(f"\n📋 Checking {track} track:")
167
+ track_changes[track] = []
168
+
169
+ for category, reqs in track_reqs.items():
170
+ if not isinstance(reqs, dict):
171
+ continue
 
 
 
 
 
 
 
 
 
172
 
173
+ for key, courses in reqs.items():
174
+ if not isinstance(courses, list):
175
+ continue
176
+
177
+ for course in courses:
178
+ if course not in self.courses:
179
+ change = CourseChange(
180
+ old_code=course,
181
+ status="missing",
182
+ evidence=f"Not found in scraped graph ({len(self.courses)} courses)"
183
+ )
184
+ track_changes[track].append(change)
185
+ print(f" ❌ {course} - MISSING")
186
+ else:
187
+ print(f" ✅ {course}")
188
+
189
+ return track_changes
190
 
191
+ def find_replacements(self, changes: Dict[str, List[CourseChange]]) -> Dict[str, List[CourseChange]]:
192
+ """Use pattern matching + LLM to suggest replacements"""
193
+ print("\n🤖 Analyzing missing courses...")
194
+
195
+ for track, track_changes in changes.items():
196
+ for change in track_changes:
197
+ if change.status != "missing":
198
+ continue
 
 
 
199
 
200
+ # Try pattern matching first (instant)
201
+ replacement = self._pattern_match_replacement(change.old_code)
202
+ if replacement:
203
+ change.new_code = replacement
204
+ change.status = "renamed"
205
+ change.confidence = 0.7
206
+ change.evidence = "Pattern matching"
207
+ print(f" 🔄 {change.old_code} {replacement} (pattern)")
208
+ continue
209
+
210
+ # Check manual course database
211
+ if change.old_code in self.MANUAL_COURSES:
212
+ change.new_code = change.old_code # Will be added to graph
213
+ change.status = "manual_add"
214
+ change.confidence = 1.0
215
+ change.evidence = "Manual course database"
216
+ print(f" ➕ {change.old_code} - Will be added manually")
217
+ continue
218
+
219
+ # Use LLM for ambiguous cases
220
+ if self.use_llm and self.llm:
221
+ replacement = self._llm_suggest_replacement(change.old_code, track)
222
+ if replacement:
223
+ change.new_code = replacement
224
+ change.status = "renamed"
225
+ change.confidence = 0.9
226
+ change.evidence = "LLM analysis"
227
+ print(f" 🔄 {change.old_code} → {replacement} (LLM)")
228
+ else:
229
+ print(f" ⚠️ {change.old_code} - No replacement found")
230
+
231
+ return changes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
+ def _pattern_match_replacement(self, course_code: str) -> Optional[str]:
234
+ """Fast pattern-based replacement detection"""
235
+
236
+ # Known replacements from manual verification
237
+ known_replacements = {
238
+ "CS3700": "CS5700",
239
+ "CS4770": "CY4770",
240
+ "STAT3150": "MATH3081",
241
  }
242
+
243
+ if course_code in known_replacements:
244
+ if known_replacements[course_code] in self.courses:
245
+ return known_replacements[course_code]
246
+
247
+ # Try subject swap (CS ↔ CY)
248
+ if course_code.startswith("CS"):
249
+ alt_code = "CY" + course_code[2:]
250
+ if alt_code in self.courses:
251
+ return alt_code
252
+ elif course_code.startswith("CY"):
253
+ alt_code = "CS" + course_code[2:]
254
+ if alt_code in self.courses:
255
+ return alt_code
256
+
257
+ # Try grad-level version (3XXX/4XXX → 5XXX)
258
+ match = re.match(r'([A-Z]+)(\d)(\d{3})', course_code)
259
+ if match:
260
+ subject, first_digit, rest = match.groups()
261
+ if first_digit in ['3', '4']:
262
+ grad_code = f"{subject}5{rest}"
263
+ if grad_code in self.courses:
264
+ return grad_code
265
+
266
+ return None
267
 
268
+ def _llm_suggest_replacement(self, missing_course: str, track: str) -> Optional[str]:
269
+ """Use LLM to intelligently suggest replacement"""
270
+
271
+ subject = re.match(r'([A-Z]+)', missing_course).group(1)
272
+ similar_courses = [
273
+ (cid, data.get('name', ''))
274
+ for cid, data in self.courses.items()
275
+ if cid.startswith(subject) and cid != missing_course
276
+ ][:10]
277
+
278
+ course_list = "\n".join([f"- {cid}: {name}" for cid, name in similar_courses])
279
+
280
+ prompt = f"""Course catalog expert analyzing NEU curriculum changes.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
+ **Missing:** {missing_course}
283
+ **Track:** {track}
284
 
285
+ **Available courses:**
286
+ {course_list}
287
+
288
+ Which course replaced {missing_course}? Return ONLY the code or "NONE".
289
+
290
+ Rules:
291
+ - Networks: CS3700 → CS5700
292
+ - Crypto: CS → CY dept
293
+ - STAT → MATH
294
+ - Game courses often don't exist
295
+ """
296
 
297
+ try:
298
+ inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.llm.device)
299
+ with torch.no_grad():
300
+ outputs = self.llm.generate(
301
+ **inputs,
302
+ max_new_tokens=50,
303
+ temperature=0.1,
304
+ do_sample=True,
305
+ pad_token_id=self.tokenizer.eos_token_id
306
+ )
307
+ response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True).strip()
308
+
309
+ match = re.search(r'([A-Z]{2,4}\d{4})', response)
310
+ if match:
311
+ suggested = match.group(1)
312
+ if suggested in self.courses:
313
+ return suggested
314
 
315
+ except Exception as e:
316
+ print(f" ⚠️ LLM error: {e}")
 
 
317
 
318
+ return None
319
+
320
+ def fix_graph(self, changes: Dict[str, List[CourseChange]]) -> int:
321
+ """Directly add missing courses to the graph"""
322
+ print("\n🔧 Fixing graph by adding missing courses...")
323
 
324
+ added_count = 0
 
325
 
326
+ for track, track_changes in changes.items():
327
+ for change in track_changes:
328
+ if change.status == "manual_add" and change.old_code in self.MANUAL_COURSES:
329
+ course_data = self.MANUAL_COURSES[change.old_code]
330
+ cid = change.old_code
331
+
332
+ # Add node
333
+ self.graph.add_node(cid, **course_data)
334
+ self.courses[cid] = course_data
335
+
336
+ # Add prerequisite edges
337
+ for prereq in course_data.get("prerequisites", []):
338
+ if prereq in self.graph:
339
+ self.graph.add_edge(prereq, cid, relationship="prerequisite")
340
+ else:
341
+ print(f" ⚠️ Prereq {prereq} for {cid} not in graph")
342
+
343
+ print(f" ✅ Added {cid}: {course_data['name']}")
344
+ added_count += 1
345
+
346
+ return added_count
347
 
348
+ def save_report(self, changes: Dict[str, List[CourseChange]], output_path: str = None):
349
+ """Save validation report"""
350
+ if not output_path:
351
+ output_path = f"catalog_validation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
352
 
353
+ report = {
354
+ "timestamp": datetime.now().isoformat(),
355
+ "graph_file": self.graph_path,
356
+ "total_courses_in_graph": len(self.courses),
357
+ "changes": {
358
+ track: [asdict(c) for c in track_changes]
359
+ for track, track_changes in changes.items()
360
+ }
361
+ }
362
 
363
+ with open(output_path, 'w') as f:
364
+ json.dump(report, f, indent=2)
365
+
366
+ print(f"\n💾 Report saved: {output_path}")
367
+
368
+ def save_graph(self, output_path: str):
369
+ """Save the fixed graph"""
370
+ with open(output_path, 'wb') as f:
371
+ pickle.dump(self.graph, f)
372
+ print(f"💾 Fixed graph saved: {output_path}")
373
+ print(f"📊 Final graph: {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")
374
 
375
+ def run(self, fix: bool = False, output: str = None):
376
+ """Main agent workflow"""
377
+ print("="*70)
378
+ print("AGENTIC OPTIMIZER - Autonomous Graph Validator & Fixer")
379
+ print("="*70)
380
 
381
+ # Step 1: Load data
382
+ self.load_graph()
383
+
384
+ # Step 2: Validate requirements
385
+ changes = self.validate_requirements()
386
+
387
+ # Count issues
388
+ total_missing = sum(len(c) for c in changes.values())
389
+ if total_missing == 0:
390
+ print("\n✅ All requirements valid! No changes needed.")
391
+ return
392
+
393
+ print(f"\n⚠️ Found {total_missing} missing courses across all tracks")
394
+
395
+ # Step 3: Find replacements
396
+ changes = self.find_replacements(changes)
397
+
398
+ # Step 4: Generate report
399
+ self.save_report(changes)
400
+
401
+ # Step 5: Fix graph if requested
402
+ if fix:
403
+ added = self.fix_graph(changes)
404
+
405
+ if added > 0:
406
+ print(f"\n✅ Added {added} courses to graph")
407
+
408
+ if output:
409
+ self.save_graph(output)
410
+ else:
411
+ # Default output name
412
+ default_output = self.graph_path.replace('.pkl', '_fixed.pkl')
413
+ self.save_graph(default_output)
414
+ else:
415
+ print("\n⚠️ No courses added (all issues are renamings, not missing)")
416
+
417
+ print("\n✨ Optimization complete!")
418
 
419
 
420
def main():
    """CLI entry point: parse arguments and run the optimizer."""
    parser = argparse.ArgumentParser(description="Agentic Optimizer - Auto-validate & fix curriculum graph")
    parser.add_argument('--graph', required=True, help="Path to curriculum graph .pkl")
    parser.add_argument('--validate', action='store_true', help="Only validate, don't fix")
    parser.add_argument('--fix', action='store_true', help="Fix graph by adding missing courses")
    parser.add_argument('--output', help="Output path for fixed graph")
    parser.add_argument('--no-llm', action='store_true', help="Disable LLM (use pattern matching only)")

    args = parser.parse_args()

    agent = AgenticOptimizer(
        graph_path=args.graph,
        use_llm=not args.no_llm
    )

    # FIXED: --validate was parsed but never consulted; per its help text it
    # now suppresses fixing even when --fix is also passed.
    agent.run(
        fix=args.fix and not args.validate,
        output=args.output
    )


if __name__ == "__main__":
    main()
src/curriculum_analyzer.py CHANGED
@@ -1,13 +1,11 @@
1
  """
2
- Curriculum Analyzer and Data Enrichment Tool (with Pre-filtering)
3
- Analyzes, CLEANS, and enriches scraped NEU curriculum data.
4
  """
5
  import pickle
6
- import json
7
  import argparse
8
  import networkx as nx
9
  import re
10
- from collections import defaultdict
11
 
12
  def get_course_level(cid):
13
  """Extracts the numerical part of a course ID for level checking."""
@@ -16,112 +14,177 @@ def get_course_level(cid):
16
 
17
  class CurriculumAnalyzer:
18
  def __init__(self, graph_path, courses_path):
19
- self.graph_path = graph_path
20
- self.courses_path = courses_path
21
- self.graph = None
22
- self.courses = None
23
- self.load_data()
24
-
25
- def load_data(self):
26
  print("📚 Loading raw curriculum data...")
27
- try:
28
- with open(self.graph_path, 'rb') as f:
29
- self.graph = pickle.load(f)
30
- with open(self.courses_path, 'rb') as f:
31
- self.courses = pickle.load(f)
32
-
33
- # Merge course metadata into the graph nodes
34
- for course_id, course_data in self.courses.items():
35
- if self.graph.has_node(course_id):
36
- self.graph.nodes[course_id].update(course_data)
37
-
38
- print(f"✅ Loaded raw data with {self.graph.number_of_nodes()} courses.")
39
- except FileNotFoundError as e:
40
- print(f"❌ Error: Data file not found. {e}")
41
- exit(1)
42
 
43
  def pre_filter_graph(self):
44
- """
45
- Permanently removes irrelevant courses from the graph.
46
- This is the most important step for creating logical plans.
47
- """
48
- print("\n🧹 Pre-filtering graph to remove irrelevant courses...")
49
-
50
- # Define what subjects are considered relevant for a tech-focused degree
51
- RELEVANT_SUBJECTS = {
52
- "CS", "DS", "CY",
53
- }
54
 
55
- nodes_to_remove = []
56
  for node, data in self.graph.nodes(data=True):
57
- subject = data.get('subject')
 
58
  level = get_course_level(node)
59
 
60
- # Mark for removal if subject is irrelevant OR it's a grad course (>= 5000)
61
- if subject not in RELEVANT_SUBJECTS or level >= 5000:
62
- nodes_to_remove.append(node)
63
-
 
 
 
 
 
64
  self.graph.remove_nodes_from(nodes_to_remove)
65
- print(f"✅ Graph filtered. Removed {len(nodes_to_remove)} irrelevant courses. Remaining: {self.graph.number_of_nodes()}")
 
66
 
67
- def calculate_and_add_complexity(self):
68
- """Calculates complexity scores for the remaining courses."""
69
- print("\n🧮 Calculating complexity scores for filtered graph...")
70
- if not self.graph.nodes():
71
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- foundation_courses = [n for n, d in self.graph.in_degree() if d == 0]
 
 
74
 
75
- complexity_scores = {}
76
- for node in self.graph.nodes():
77
- # Calculate depth (longest path from a foundation course)
78
- depth = 0
79
- if foundation_courses:
80
- paths = [nx.shortest_path_length(self.graph, source, node)
81
- for source in foundation_courses if nx.has_path(self.graph, source, node)]
82
- if paths:
83
- depth = max(paths) # Use max path for a better sense of progression
 
 
 
 
84
 
85
- in_deg = self.graph.in_degree(node)
86
- out_deg = self.graph.out_degree(node)
 
 
 
 
 
87
 
88
- # Formula: (prereqs * 10) + (unlocks * 5) + (depth * 3)
89
- score = (in_deg * 10) + (out_deg * 5) + (depth * 3)
90
- complexity_scores[node] = {
91
- 'complexity': score,
92
- 'depth': depth,
93
- 'prereq_count': in_deg,
94
- 'unlocks_count': out_deg
95
- }
96
 
97
- nx.set_node_attributes(self.graph, complexity_scores)
98
- print(" Complexity scores calculated and added.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  def save_enriched_graph(self, output_path):
101
  """Saves the final, clean, and enriched graph."""
102
- print(f"\n💾 Saving CLEAN and enriched graph to {output_path}...")
103
  with open(output_path, 'wb') as f:
104
  pickle.dump(self.graph, f)
105
- print("✅ Graph saved.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  def main(args):
108
  """Main execution flow."""
109
  analyzer = CurriculumAnalyzer(args.graph, args.courses)
110
-
111
- # Run the new cleaning step first!
112
  analyzer.pre_filter_graph()
113
-
 
114
  analyzer.calculate_and_add_complexity()
115
 
116
- analyzer.save_enriched_graph(args.output_graph)
117
 
118
- print("\n✨ Analysis and cleaning complete!")
119
- print(f"➡️ In the Streamlit app, upload the new clean file: '{args.output_graph}'")
 
 
 
 
 
120
 
121
  if __name__ == "__main__":
122
- parser = argparse.ArgumentParser(description="NEU Curriculum Analyzer and Data Enrichment Tool")
123
- parser.add_argument('--graph', required=True, help="Path to the RAW curriculum graph from the scraper.")
124
- parser.add_argument('--courses', required=True, help="Path to the RAW courses data from the scraper.")
125
- parser.add_argument('--output-graph', default='neu_graph_analyzed_clean.pkl', help="Path to save the new CLEANED and enriched graph.")
126
  args = parser.parse_args()
127
- main(args)
 
1
  """
2
+ Fixed Curriculum Analyzer - Better handling of incomplete data
 
3
  """
4
  import pickle
 
5
  import argparse
6
  import networkx as nx
7
  import re
8
+ from typing import Set, Dict
9
 
10
  def get_course_level(cid):
11
  """Extracts the numerical part of a course ID for level checking."""
 
14
 
15
  class CurriculumAnalyzer:
16
  def __init__(self, graph_path, courses_path):
 
 
 
 
 
 
 
17
  print("📚 Loading raw curriculum data...")
18
+ with open(graph_path, 'rb') as f:
19
+ self.graph = pickle.load(f)
20
+ with open(courses_path, 'rb') as f:
21
+ self.courses = pickle.load(f)
22
+
23
+ # Merge course data into graph nodes
24
+ for course_id, course_data in self.courses.items():
25
+ if self.graph.has_node(course_id):
26
+ self.graph.nodes[course_id].update(course_data)
27
+
28
+ print(f"✅ Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")
 
 
 
 
29
 
30
  def pre_filter_graph(self):
31
+ """Keeps only relevant subjects and removes labs/high-level courses."""
32
+ print("\n🧹 Pre-filtering graph...")
33
+
34
+ KEEP_SUBJECTS = {"CS", "DS", "IS", "CY", "MATH", "PHYS", "ENGW", "STAT", "EECE"}
 
 
 
 
 
 
35
 
36
+ nodes_to_remove = set()
37
  for node, data in self.graph.nodes(data=True):
38
+ subject = data.get('subject', '')
39
+ name = data.get('name', '').lower()
40
  level = get_course_level(node)
41
 
42
+ # Remove if:
43
+ # - Not in whitelist
44
+ # - Too advanced (5000+)
45
+ # - Lab/recitation/etc
46
+ if (subject not in KEEP_SUBJECTS or
47
+ level >= 5000 or
48
+ any(skip in name for skip in ['lab', 'recitation', 'seminar', 'practicum', 'co-op'])):
49
+ nodes_to_remove.add(node)
50
+
51
  self.graph.remove_nodes_from(nodes_to_remove)
52
+ print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses")
53
+ print(f" Remaining: {self.graph.number_of_nodes()} courses")
54
 
55
+ def fix_chains(self):
56
+ """Adds critical prerequisite chains that might be missing."""
57
+ print("\n🔗 Validating and fixing critical prerequisite chains...")
58
+
59
+ critical_chains = {
60
+ ("CS1800", "CS2800", "Discrete → Logic"),
61
+ ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
62
+ ("CS2510", "CS3500", "Fundies 2 → OOD"),
63
+ ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
64
+ ("CS3000", "CS4100", "Algorithms → AI"), # NEW
65
+ ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
66
+ ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
67
+ ("DS2500", "DS3500", "Intermediate → Advanced"),
68
+ ("DS3500", "DS4400", "Advanced → ML1"), # NEW
69
+ }
70
+
71
+ added = 0
72
+ for prereq, course, desc in critical_chains:
73
+ if self.graph.has_node(prereq) and self.graph.has_node(course):
74
+ if not self.graph.has_edge(prereq, course):
75
+ self.graph.add_edge(prereq, course)
76
+ print(f" 🔧 FIXED: Added {prereq} → {course} ({desc})")
77
+ added += 1
78
+
79
+ if added == 0:
80
+ print(" ✅ All critical chains present")
81
 
82
+ def remove_spurious_chains(self):
83
+ """Removes known incorrect prerequisite edges."""
84
+ print("\n🗑️ Removing spurious prerequisite chains...")
85
 
86
+ spurious_chains = {
87
+ ("MATH1365", "CS2800"), # Not a real prereq
88
+ }
89
+
90
+ removed = 0
91
+ for prereq, course in spurious_chains:
92
+ if self.graph.has_edge(prereq, course):
93
+ self.graph.remove_edge(prereq, course)
94
+ print(f" ✅ REMOVED: {prereq} {course}")
95
+ removed += 1
96
+
97
+ if removed == 0:
98
+ print(" ✅ No spurious chains found")
99
 
100
+ def calculate_and_add_complexity(self):
101
+ """Calculates and adds complexity score to each course."""
102
+ print("\n🧮 Calculating complexity scores...")
103
+
104
+ for node in self.graph.nodes():
105
+ in_degree = self.graph.in_degree(node)
106
+ out_degree = self.graph.out_degree(node)
107
 
108
+ # Complexity heuristic: weighted by prerequisites and courses unlocked
109
+ score = (in_degree * 10) + (out_degree * 5)
110
+ nx.set_node_attributes(self.graph, {node: {'complexity': score}})
111
+
112
+ print("✅ Complexity scores calculated")
 
 
 
113
 
114
+ def validate_critical_courses(self) -> Dict[str, Set[str]]:
115
+ """Check if all critical courses exist in the graph."""
116
+ print("\n🎯 Validating critical course coverage...")
117
+
118
+ required_courses = {
119
+ "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
120
+ "core": {"CS3000", "CS3500", "CS3650", "CS3700", "CS3200"},
121
+ "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
122
+ "systems": {"CS4730", "CS4400", "CS4500"}, # Removed often-missing courses
123
+ "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
124
+ "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"}, # No STAT courses at NEU
125
+ }
126
+
127
+ missing = {}
128
+ for category, courses in required_courses.items():
129
+ missing_in_cat = courses - set(self.graph.nodes())
130
+ if missing_in_cat:
131
+ missing[category] = missing_in_cat
132
+ print(f" ⚠️ {category}: Missing {missing_in_cat}")
133
+ else:
134
+ print(f" ✅ {category}: All courses present")
135
+
136
+ return missing
137
 
138
  def save_enriched_graph(self, output_path):
139
  """Saves the final, clean, and enriched graph."""
140
+ print(f"\n💾 Saving cleaned graph to {output_path}...")
141
  with open(output_path, 'wb') as f:
142
  pickle.dump(self.graph, f)
143
+ print("✅ Graph saved")
144
+
145
+ # Save a summary report
146
+ report_path = output_path.replace('.pkl', '_report.txt')
147
+ with open(report_path, 'w') as f:
148
+ f.write("Curriculum Graph Analysis Report\n")
149
+ f.write("="*70 + "\n\n")
150
+ f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
151
+ f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")
152
+
153
+ # Subject breakdown
154
+ from collections import defaultdict
155
+ subject_counts = defaultdict(int)
156
+ for node in self.graph.nodes():
157
+ subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
158
+ subject_counts[subject] += 1
159
+
160
+ f.write("Subject breakdown:\n")
161
+ for subject in sorted(subject_counts.keys()):
162
+ f.write(f" {subject}: {subject_counts[subject]}\n")
163
+
164
+ print(f"✅ Report saved to {report_path}")
165
 
166
  def main(args):
167
  """Main execution flow."""
168
  analyzer = CurriculumAnalyzer(args.graph, args.courses)
 
 
169
  analyzer.pre_filter_graph()
170
+ analyzer.fix_chains()
171
+ analyzer.remove_spurious_chains()
172
  analyzer.calculate_and_add_complexity()
173
 
174
+ missing = analyzer.validate_critical_courses()
175
 
176
+ if missing:
177
+ print("\n⚠️ WARNING: Some critical courses are missing!")
178
+ print(" Consider re-scraping with additional terms or subjects.")
179
+ print(" Missing courses will be excluded from planning.")
180
+
181
+ analyzer.save_enriched_graph(args.output_graph)
182
+ print("\n✨ Analysis complete!")
183
 
184
  if __name__ == "__main__":
185
+ parser = argparse.ArgumentParser(description="NEU Curriculum Analyzer - Cleans and validates data")
186
+ parser.add_argument('--graph', required=True, help="Path to RAW curriculum graph")
187
+ parser.add_argument('--courses', required=True, help="Path to RAW courses data")
188
+ parser.add_argument('--output-graph', default='neu_graph_clean.pkl', help="Output path")
189
  args = parser.parse_args()
190
+ main(args)
src/curriculum_optimizer.py CHANGED
@@ -1,7 +1,6 @@
1
  """
2
  Fixed Hybrid Curriculum Optimizer
3
- Actually personalizes plans based on student profile
4
- WITH MUTUAL EXCLUSION AND SEQUENCE VALIDATION
5
  """
6
  import torch
7
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
@@ -27,10 +26,15 @@ class StudentProfile:
27
 
28
  class HybridOptimizer:
29
  """
30
- Fixed optimizer with proper course sequencing and mutual exclusion
31
  """
32
 
33
- # COURSE TRACKS - Mutually exclusive sequences
 
 
 
 
 
34
  COURSE_TRACKS = {
35
  "physics": {
36
  "engineering": ["PHYS1151", "PHYS1155"],
@@ -43,83 +47,56 @@ class HybridOptimizer:
43
  }
44
  }
45
 
46
- # CONCENTRATION REQUIREMENTS - Structured with pick lists
47
  CONCENTRATION_REQUIREMENTS = {
48
  "ai_ml": {
49
  "foundations": {
50
- "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
 
51
  },
52
  "core": {
53
  "required": ["CS3000", "CS3500"],
54
- "pick_1_from": ["CS3200", "CS3650", "CS3700"]
55
  },
56
  "concentration_specific": {
57
  "required": ["CS4100", "DS4400"],
58
  "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
59
- "pick_1_systems": ["CS4730", "CS4700", "CS4750"]
60
  },
61
  "math": {
62
  "required": ["MATH1341", "MATH1342"],
63
- "pick_1_from": ["MATH2331", "MATH3081", "STAT315"]
64
  }
65
  },
66
  "systems": {
67
- "foundations": {
68
- "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
69
- },
70
- "core": {
71
- "required": ["CS3000", "CS3500", "CS3650"],
72
- "pick_1_from": ["CS3700", "CS3200"]
73
- },
74
- "concentration_specific": {
75
- "required": ["CS4700"],
76
- "pick_2_from": ["CS4730", "CS4750", "CS4770"],
77
- "pick_1_from": ["CS4400", "CS4500", "CS4520"]
78
- },
79
- "math": {
80
- "required": ["MATH1341", "MATH1342"]
81
- }
82
  },
83
  "security": {
84
- "foundations": {
85
- "required": ["CS1800", "CS2500", "CS2510", "CS2800"]
86
- },
87
- "core": {
88
- "required": ["CS3000", "CS3650", "CY2550"],
89
- "pick_1_from": ["CS3700", "CS3500"]
90
- },
91
- "concentration_specific": {
92
- "required": ["CY3740"],
93
- "pick_2_from": ["CY4740", "CY4760", "CY4770"],
94
- "pick_1_from": ["CS4700", "CS4730"]
95
- },
96
- "math": {
97
- "required": ["MATH1342"],
98
- "pick_1_from": ["MATH3527", "MATH3081"]
99
- }
100
  }
101
  }
102
 
103
  def __init__(self):
104
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
105
-
106
- # Use smaller model for efficiency
107
  self.model_name = "meta-llama/Llama-3.1-8B-Instruct"
108
  self.embedding_model_name = 'BAAI/bge-large-en-v1.5'
109
-
110
  self.llm = None
111
  self.tokenizer = None
112
  self.embedding_model = None
113
  self.curriculum_graph = None
114
  self.courses = {}
115
-
 
116
  def load_models(self):
117
- """Load embedding model and optionally LLM"""
118
  print("Loading embedding model...")
119
  self.embedding_model = SentenceTransformer(self.embedding_model_name, device=self.device)
120
 
121
  def load_llm(self):
122
- """Load LLM separately for when needed"""
123
  if self.device.type == 'cuda' and self.llm is None:
124
  print("Loading LLM for intelligent planning...")
125
  quant_config = BitsAndBytesConfig(
@@ -134,272 +111,218 @@ class HybridOptimizer:
134
  quantization_config=quant_config,
135
  device_map="auto"
136
  )
137
-
138
  def load_data(self, graph: nx.DiGraph):
139
- """Load and preprocess curriculum data"""
140
  self.curriculum_graph = graph
141
  self.courses = dict(graph.nodes(data=True))
142
-
143
- # Filter valid courses
144
  self.valid_courses = []
145
  course_texts = []
146
 
 
 
 
 
 
 
 
 
147
  for cid, data in self.courses.items():
148
- # Skip labs/recitations
149
  name = data.get('name', '')
150
- if any(skip in name for skip in ['Lab', 'Recitation', 'Seminar', 'Practicum']):
151
  continue
152
-
153
- # Skip grad level
154
- if self._get_level(cid) >= 5000:
155
  continue
156
 
157
  self.valid_courses.append(cid)
158
  course_texts.append(f"{name} {data.get('description', '')}")
159
 
160
- # Precompute embeddings
 
 
 
161
  print(f"Computing embeddings for {len(self.valid_courses)} courses...")
162
- self.course_embeddings = self.embedding_model.encode(
163
- course_texts,
164
- convert_to_tensor=True,
165
- show_progress_bar=True
166
- )
167
-
168
- def _get_track_commitment(self, completed: Set[str], track_type: str) -> Optional[str]:
169
- """Once a student takes one course in a track, commit to that track"""
170
- tracks = self.COURSE_TRACKS.get(track_type, {})
171
- for track_name, courses in tracks.items():
172
- if any(c in completed for c in courses):
173
- return track_name
174
- return None
175
-
 
 
 
 
 
 
 
 
176
  def _validate_sequence(self, selected: List[str], candidate: str) -> bool:
177
- """Ensure course sequences stay consistent - no mixing tracks"""
178
  for track_type, tracks in self.COURSE_TRACKS.items():
179
  for track_name, sequence in tracks.items():
180
  if candidate in sequence:
181
- # Check if any course from different track already selected
182
  for other_track, other_seq in tracks.items():
183
- if other_track != track_name:
184
- if any(c in selected for c in other_seq):
185
- return False # Don't mix sequences
186
  return True
187
-
188
- def validate_plan(self, plan: Dict) -> Dict[str, List[str]]:
189
- """Validate a plan for consistency and requirements"""
190
- issues = {
191
- "errors": [],
192
- "warnings": [],
193
- "info": []
194
- }
195
-
196
- all_courses = []
197
- for year_key, year_data in plan.items():
198
- if isinstance(year_data, dict) and year_key.startswith("year_"):
199
- all_courses.extend(year_data.get("fall", []))
200
- all_courses.extend(year_data.get("spring", []))
201
-
202
- # Check for sequence mixing
203
- for track_type, tracks in self.COURSE_TRACKS.items():
204
- tracks_used = set()
205
- for track_name, courses in tracks.items():
206
- if any(c in all_courses for c in courses):
207
- tracks_used.add(track_name)
208
-
209
- if len(tracks_used) > 1:
210
- issues["errors"].append(
211
- f"Mixed {track_type} tracks: {', '.join(tracks_used)}. Must choose one sequence."
212
- )
213
 
214
- # Check prerequisites are satisfied
215
- completed = set()
216
- for year in range(1, 5):
217
- for sem in ["fall", "spring"]:
218
- year_key = f"year_{year}"
219
- if year_key in plan:
220
- courses = plan[year_key].get(sem, [])
221
- for course in courses:
222
- if course in self.curriculum_graph:
223
- prereqs = set(self.curriculum_graph.predecessors(course))
224
- missing = prereqs - completed
225
- if missing:
226
- issues["errors"].append(
227
- f"{course} in Year {year} {sem} missing prereqs: {', '.join(missing)}"
228
- )
229
- completed.update(courses)
230
 
231
- return issues
232
-
233
- def generate_llm_plan(self, student: StudentProfile) -> Dict:
234
- """Generate AI-powered plan with LLM course selection"""
235
- print("--- Generating AI-Optimized Plan ---")
236
 
237
- # Ensure LLM is loaded
238
- self.load_llm()
239
 
240
- if not self.llm:
241
- print("LLM not available, falling back to enhanced rule-based plan")
242
- return self.generate_enhanced_rule_plan(student)
 
 
 
 
243
 
244
- # Step 1: Identify track
245
- track = self._identify_track(student)
246
- print(f"Identified track: {track}")
247
 
248
- # Step 2: Get LLM-suggested courses
249
- llm_suggestions = self._get_llm_course_suggestions(student, track)
 
250
 
251
- # Step 3: Build plan using LLM suggestions + rules
252
- plan = self._build_structured_plan(student, track, llm_suggestions)
 
 
253
 
254
- # Step 4: Validate plan
255
- validation = self.validate_plan(plan)
256
- if validation["errors"]:
257
- print(f"Plan validation errors: {validation['errors']}")
258
- # Try to fix errors
259
- plan = self._fix_plan_errors(plan, validation, student)
260
 
261
- # Step 5: Generate explanation
262
- explanation = self._generate_explanation(student, plan, track, "AI-optimized")
263
 
264
- return self._finalize_plan(plan, explanation, validation)
265
-
266
  def generate_simple_plan(self, student: StudentProfile) -> Dict:
267
- """Generate rule-based plan that considers student preferences"""
268
  print("--- Generating Enhanced Rule-Based Plan ---")
 
269
  return self.generate_enhanced_rule_plan(student)
270
-
271
- def generate_enhanced_rule_plan(self, student: StudentProfile) -> Dict:
272
- """Enhanced rule-based plan with proper sequencing"""
273
 
274
- # Step 1: Identify track
 
275
  track = self._identify_track(student)
276
-
277
- # Step 2: Build structured plan
278
  plan = self._build_structured_plan(student, track, None)
 
279
 
280
- # Step 3: Validate
281
- validation = self.validate_plan(plan)
282
  if validation["errors"]:
283
  plan = self._fix_plan_errors(plan, validation, student)
284
- validation = self.validate_plan(plan) # Re-validate
285
 
286
- # Step 4: Generate explanation
287
  difficulty_level = self._map_difficulty(student.preferred_difficulty)
288
  courses_per_semester = self._calculate_course_load(student.time_commitment)
289
  explanation = f"Personalized {track} track ({difficulty_level} difficulty, {courses_per_semester} courses/semester)"
290
 
291
  return self._finalize_plan(plan, explanation, validation)
292
-
293
- def _build_structured_plan(
294
- self,
295
- student: StudentProfile,
296
- track: str,
297
- llm_suggestions: Optional[List[str]] = None
298
- ) -> Dict:
299
- """Build plan using structured concentration requirements"""
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  completed = set(student.completed_courses)
302
  plan = {}
303
  requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])
304
 
305
- # Determine course load
306
  courses_per_semester = self._calculate_course_load(student.time_commitment)
307
 
308
- # Track which requirements have been satisfied
309
- required_queue = []
310
- pick_lists = []
311
-
312
- # Build queue of required courses
313
  for category, reqs in requirements.items():
314
  if "required" in reqs:
315
- required_queue.extend(reqs["required"])
316
-
317
- # Handle pick lists
318
  for key, courses in reqs.items():
319
  if key.startswith("pick_"):
320
- num_to_pick = int(re.search(r'\d+', key).group()) if re.search(r'\d+', key) else 1
321
- pick_lists.append({
322
- "courses": courses,
323
- "num_to_pick": num_to_pick,
324
- "category": category
325
- })
326
-
327
- # Handle course track commitments (physics/calculus)
328
- physics_track = self._get_track_commitment(completed, "physics")
329
- calc_track = self._get_track_commitment(completed, "calculus")
330
-
331
- # Build semesters
332
  for sem_num in range(1, 9):
333
  year = ((sem_num - 1) // 2) + 1
334
- is_fall = (sem_num % 2) == 1
335
 
336
- available = self._get_available_courses(completed, year)
337
- selected = []
338
 
339
- # Apply track commitments
340
- if not physics_track and year <= 2:
341
- # Choose physics track based on difficulty preference
342
- if student.preferred_difficulty == "challenging":
343
- physics_track = "engineering"
344
- else:
345
- physics_track = "science"
346
 
347
- # Priority 1: Required courses
348
- for course in required_queue[:]:
349
- if course in available and len(selected) < courses_per_semester:
350
- if self._validate_sequence(selected, course):
351
- selected.append(course)
352
- required_queue.remove(course)
353
- available.remove(course)
354
-
355
- # Priority 2: Handle pick lists
356
- for pick_list in pick_lists:
357
- if len(selected) >= courses_per_semester:
358
- break
359
-
360
- # Filter available courses from this pick list
361
- available_from_list = [c for c in pick_list["courses"] if c in available]
362
 
363
- # Use LLM suggestions if available
364
- if llm_suggestions:
365
- # Prioritize LLM-suggested courses
366
- for suggested in llm_suggestions:
367
- if suggested in available_from_list and pick_list["num_to_pick"] > 0:
368
- if self._validate_sequence(selected, suggested):
369
- selected.append(suggested)
370
- available.remove(suggested)
371
- pick_list["num_to_pick"] -= 1
372
-
373
- # Fill remaining slots
374
- for course in available_from_list[:pick_list["num_to_pick"]]:
375
- if len(selected) < courses_per_semester and course in available:
376
- if self._validate_sequence(selected, course):
377
- selected.append(course)
378
- available.remove(course)
379
- pick_list["num_to_pick"] -= 1
380
-
381
- # Priority 3: Track-specific courses (physics/calc)
382
- if physics_track and year <= 2:
383
- physics_courses = self.COURSE_TRACKS["physics"].get(physics_track, [])
384
- for course in physics_courses:
385
- if course in available and len(selected) < courses_per_semester:
386
- selected.append(course)
387
- available.remove(course)
388
-
389
- # Priority 4: Fill with electives
390
- if len(selected) < courses_per_semester and available:
391
- semantic_scores = self._compute_semantic_scores(student)
392
- electives = sorted(
393
- available,
394
- key=lambda c: self._score_elective(c, semantic_scores, completed),
395
  reverse=True
396
  )
397
 
398
- for elective in electives:
399
- if len(selected) >= courses_per_semester:
400
- break
401
- if self._validate_sequence(selected, elective):
402
- selected.append(elective)
 
 
 
 
 
 
 
 
 
 
 
403
 
404
  # Add to plan
405
  if selected:
@@ -407,310 +330,184 @@ class HybridOptimizer:
407
  if year_key not in plan:
408
  plan[year_key] = {}
409
 
410
- sem_type = 'fall' if is_fall else 'spring'
411
- plan[year_key][sem_type] = selected[:courses_per_semester]
412
  completed.update(selected)
413
 
414
  return plan
415
-
416
- def _fix_plan_errors(self, plan: Dict, validation: Dict, student: StudentProfile) -> Dict:
417
- """Attempt to fix validation errors in a plan"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
- # For now, if there are sequence mixing errors, rebuild with enforced consistency
420
- if any("Mixed" in error for error in validation["errors"]):
421
- print("Fixing sequence mixing errors...")
 
 
 
422
 
423
- # Find which tracks were mixed and pick the first one
424
- for error in validation["errors"]:
425
- if "Mixed physics" in error:
426
- # Force engineering track (most common)
427
- self.COURSE_TRACKS["physics"] = {"engineering": ["PHYS1151", "PHYS1155"]}
428
- elif "Mixed calculus" in error:
429
- # Force standard calc
430
- self.COURSE_TRACKS["calculus"] = {"standard": ["MATH1341", "MATH1342"]}
431
 
432
- # Rebuild plan with enforced tracks
433
- return self._build_structured_plan(student, self._identify_track(student), None)
 
 
 
 
 
 
 
 
 
 
434
 
 
 
 
 
 
435
  return plan
436
-
437
- def _get_llm_course_suggestions(self, student: StudentProfile, track: str) -> List[str]:
438
- """Use LLM to suggest personalized course priorities"""
439
-
440
- requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])
441
 
442
- # Gather all elective options from pick lists
443
- all_options = []
444
- for category, reqs in requirements.items():
 
445
  for key, courses in reqs.items():
446
- if key.startswith("pick_"):
447
- all_options.extend(courses)
448
-
449
- # Create course options text
450
- course_options = []
451
- for cid in all_options[:10]: # Limit to avoid token limits
452
- if cid in self.courses:
453
- name = self.courses[cid].get('name', cid)
454
- desc = self.courses[cid].get('description', '')[:100]
455
- course_options.append(f"{cid}: {name} - {desc}")
456
-
457
- prompt = f"""You are a curriculum advisor. Given this student profile, rank the TOP 5 most relevant courses from the options below.
458
-
459
- Student Profile:
460
- - Career Goal: {student.career_goals}
461
- - Interests: {', '.join(student.interests)}
462
- - Time Commitment: {student.time_commitment} hours/week
463
- - Preferred Difficulty: {student.preferred_difficulty}
464
- - Current GPA: {student.current_gpa}
465
-
466
- Available Courses:
467
- {chr(10).join(course_options)}
468
-
469
- Return ONLY the top 5 course IDs in order of priority, one per line. Example:
470
- CS4100
471
- DS4400
472
- CS4120
473
- CS4180
474
- DS4440"""
475
-
476
  try:
477
- inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)
478
-
479
  with torch.no_grad():
480
- outputs = self.llm.generate(
481
- **inputs,
482
- max_new_tokens=100,
483
- temperature=0.3,
484
- do_sample=True,
485
- pad_token_id=self.tokenizer.eos_token_id
486
- )
487
-
488
  response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
489
-
490
- # Extract course IDs
491
- suggested_courses = []
492
- for line in response.strip().split('\n'):
493
- line = line.strip()
494
- match = re.search(r'([A-Z]{2,4}\d{4})', line)
495
- if match:
496
- suggested_courses.append(match.group(1))
497
-
498
  return suggested_courses[:5]
499
-
500
  except Exception as e:
501
  print(f"LLM suggestion failed: {e}")
502
- return all_options[:5] # Fallback
503
-
504
  def _map_difficulty(self, preferred_difficulty: str) -> str:
505
- """Map UI difficulty to internal levels"""
506
- mapping = {
507
- "easy": "easy",
508
- "moderate": "medium",
509
- "challenging": "hard"
510
- }
511
- return mapping.get(preferred_difficulty.lower(), "medium")
512
-
513
  def _calculate_course_load(self, time_commitment: int) -> int:
514
- """Calculate courses per semester based on time commitment"""
515
- if time_commitment < 20:
516
- return 3 # Part-time
517
- elif time_commitment < 30:
518
- return 4 # Standard
519
- elif time_commitment < 40:
520
- return 4 # Standard-heavy
521
- else:
522
- return 4 # Max (prerequisites limit anyway)
523
-
524
- def _identify_track(self, student: StudentProfile) -> str:
525
- """Use embeddings to identify best track"""
526
 
 
 
 
 
 
 
 
527
  profile_text = f"{student.career_goals} {' '.join(student.interests)}"
528
  profile_emb = self.embedding_model.encode(profile_text, convert_to_tensor=True)
529
-
530
  track_descriptions = {
531
- "ai_ml": "artificial intelligence machine learning deep learning neural networks data science NLP computer vision LLM",
532
- "systems": "operating systems distributed systems networks compilers databases performance optimization backend",
533
- "security": "cybersecurity cryptography penetration testing security vulnerabilities network security ethical hacking"
534
  }
535
-
536
- best_track = "ai_ml"
537
- best_score = -1
538
-
539
  for track, description in track_descriptions.items():
540
  track_emb = self.embedding_model.encode(description, convert_to_tensor=True)
541
  score = float(util.cos_sim(profile_emb, track_emb))
542
  if score > best_score:
543
- best_score = score
544
- best_track = track
545
-
546
  return best_track
547
-
548
- def _compute_semantic_scores(self, student: StudentProfile) -> Dict[str, float]:
549
- """Compute semantic alignment for all courses"""
550
 
 
551
  query_text = f"{student.career_goals} {' '.join(student.interests)}"
552
  query_emb = self.embedding_model.encode(query_text, convert_to_tensor=True)
553
-
554
  similarities = util.cos_sim(query_emb, self.course_embeddings)[0]
 
555
 
556
- scores = {}
557
- for idx, cid in enumerate(self.valid_courses):
558
- scores[cid] = float(similarities[idx])
559
-
560
- return scores
561
-
562
- def _get_available_courses(self, completed: Set[str], year: int) -> List[str]:
563
- """Get schedulable courses with year restrictions"""
564
-
565
- available = []
566
- max_level = 2999 if year == 1 else 3999 if year == 2 else 9999
567
-
568
- for cid in self.valid_courses:
569
- if cid in completed:
570
- continue
571
-
572
- if self._get_level(cid) > max_level:
573
- continue
574
-
575
- # Check prerequisites
576
- if cid in self.curriculum_graph:
577
- prereqs = set(self.curriculum_graph.predecessors(cid))
578
- if not prereqs.issubset(completed):
579
- continue
580
-
581
- available.append(cid)
582
-
583
- return available
584
-
585
- def _score_elective(
586
- self,
587
- course_id: str,
588
- semantic_scores: Dict[str, float],
589
- completed: Set[str]
590
- ) -> float:
591
- """Basic elective scoring"""
592
-
593
- score = 0.0
594
-
595
- # Semantic alignment (50%)
596
- score += semantic_scores.get(course_id, 0) * 0.5
597
-
598
- # Unlocks future courses (30%)
599
- if course_id in self.curriculum_graph:
600
- unlocks = len(list(self.curriculum_graph.successors(course_id)))
601
- score += min(unlocks / 5, 1.0) * 0.3
602
-
603
- # Subject relevance (20%)
604
- subject = self.courses.get(course_id, {}).get('subject', '')
605
- subject_scores = {"CS": 1.0, "DS": 0.9, "IS": 0.6, "MATH": 0.7, "CY": 0.8}
606
- score += subject_scores.get(subject, 0.3) * 0.2
607
-
608
- return score
609
-
610
  def _generate_explanation(self, student: StudentProfile, plan: Dict, track: str, plan_type: str) -> str:
611
- """Generate explanation using LLM if available"""
 
 
 
 
612
 
613
- if not self.llm:
614
- return f"{plan_type} {track} track plan for {student.career_goals}"
615
-
616
- # Count courses
617
- total_courses = sum(
618
- len(plan.get(f"year_{y}", {}).get(sem, []))
619
- for y in range(1, 5)
620
- for sem in ["fall", "spring"]
621
- )
622
-
623
- prompt = f"""Explain this curriculum plan in 1-2 sentences:
624
- Plan Type: {plan_type}
625
- Track: {track}
626
- Student Goal: {student.career_goals}
627
- Interests: {', '.join(student.interests[:2])}
628
- Difficulty: {student.preferred_difficulty}
629
- Time Commitment: {student.time_commitment}h/week
630
- Total Courses: {total_courses}
631
 
632
- Be specific about how the plan matches their preferences."""
 
 
 
 
 
 
 
 
 
 
 
 
633
 
634
- try:
635
- inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
636
-
637
- with torch.no_grad():
638
- outputs = self.llm.generate(
639
- **inputs,
640
- max_new_tokens=150,
641
- temperature=0.7,
642
- do_sample=True,
643
- pad_token_id=self.tokenizer.eos_token_id
644
- )
645
-
646
- explanation = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
647
- return explanation.strip()
648
-
649
- except Exception as e:
650
- print(f"Explanation generation failed: {e}")
651
- return f"{plan_type} {track} track plan optimized for {student.career_goals}"
652
-
653
- def _get_level(self, course_id: str) -> int:
654
- """Extract course level"""
655
- match = re.search(r'\d+', course_id)
656
- return int(match.group()) if match else 9999
657
-
658
  def _finalize_plan(self, plan: Dict, explanation: str, validation: Dict = None) -> Dict:
659
- """Add structure, metrics, and validation to plan"""
660
-
661
- structured = {
662
- "reasoning": explanation,
663
- "validation": validation if validation else {"errors": [], "warnings": [], "info": []}
664
- }
665
-
666
- # Ensure all years present
667
  for year in range(1, 5):
668
  year_key = f"year_{year}"
669
- if year_key not in plan:
670
- plan[year_key] = {}
671
-
672
- structured[year_key] = {
673
- "fall": plan[year_key].get("fall", []),
674
- "spring": plan[year_key].get("spring", []),
675
  "summer": "co-op" if year in [2, 3] else []
676
  }
 
 
 
 
 
677
 
678
- # Calculate complexity metrics
679
- complexities = []
680
- for year_key in structured:
681
- if year_key.startswith("year_"):
682
- for sem in ["fall", "spring"]:
683
- courses = structured[year_key].get(sem, [])
684
- if courses:
685
- sem_complexity = sum(
686
- self.courses.get(c, {}).get('complexity', 50)
687
- for c in courses
688
- )
689
- complexities.append(sem_complexity)
690
-
691
- structured["complexity_analysis"] = {
692
  "average_semester_complexity": float(np.mean(complexities)) if complexities else 0,
693
  "peak_semester_complexity": float(np.max(complexities)) if complexities else 0,
694
  "total_complexity": float(np.sum(complexities)) if complexities else 0,
695
  "balance_score (std_dev)": float(np.std(complexities)) if complexities else 0
696
  }
697
-
698
- # Add metadata
699
- structured["metadata"] = {
700
  "generated": datetime.now().isoformat(),
701
  "valid": len(validation.get("errors", [])) == 0 if validation else True,
702
- "has_warnings": len(validation.get("warnings", [])) > 0 if validation else False
703
  }
704
-
705
- return {"pathway": structured}
706
 
707
- # Backward compatibility wrapper
708
  class CurriculumOptimizer(HybridOptimizer):
709
- """Compatibility wrapper"""
710
-
711
  def __init__(self):
712
  super().__init__()
713
 
714
  def generate_plan(self, student: StudentProfile) -> Dict:
715
- """Default plan generation - uses enhanced rules"""
716
  return self.generate_enhanced_rule_plan(student)
 
1
  """
2
  Fixed Hybrid Curriculum Optimizer
3
+ WITH PROPER COURSE DISCOVERY, SUBJECT-AWARE SCORING, AND CONCENTRATION FOCUS
 
4
  """
5
  import torch
6
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
26
 
27
  class HybridOptimizer:
28
  """
29
+ Fixed optimizer with subject-aware scoring and concentration focus
30
  """
31
 
32
+ EQUIVALENCY_GROUPS = [
33
+ {"MATH1341", "MATH1241", "MATH1231"}, # Calculus 1
34
+ {"MATH1342", "MATH1242"}, # Calculus 2
35
+ {"PHYS1151", "PHYS1161", "PHYS1145"}, # Physics 1
36
+ {"PHYS1155", "PHYS1165", "PHYS1147"}, # Physics 2
37
+ ]
38
  COURSE_TRACKS = {
39
  "physics": {
40
  "engineering": ["PHYS1151", "PHYS1155"],
 
47
  }
48
  }
49
 
 
50
  CONCENTRATION_REQUIREMENTS = {
51
  "ai_ml": {
52
  "foundations": {
53
+ "required": ["CS1800", "CS2500", "CS2510", "CS2800"],
54
+ "sequence": True
55
  },
56
  "core": {
57
  "required": ["CS3000", "CS3500"],
58
+ "pick_1_from": ["CS3200", "CS3650", "CS5700"]
59
  },
60
  "concentration_specific": {
61
  "required": ["CS4100", "DS4400"],
62
  "pick_2_from": ["CS4120", "CS4180", "DS4420", "DS4440"],
63
+ "pick_1_systems": ["CS4730", "CS4700"]
64
  },
65
  "math": {
66
  "required": ["MATH1341", "MATH1342"],
67
+ "pick_1_from": ["MATH2331", "MATH3081"]
68
  }
69
  },
70
  "systems": {
71
+ "foundations": { "required": ["CS1800", "CS2500", "CS2510", "CS2800"] },
72
+ "core": { "required": ["CS3000", "CS3500", "CS3650"], "pick_1_from": ["CS5700", "CS3200"] },
73
+ "concentration_specific": { "required": ["CS4700"], "pick_2_from": ["CS4730"], "pick_1_from": ["CS4400", "CS4500", "CS4520"] },
74
+ "math": { "required": ["MATH1341", "MATH1342"] }
 
 
 
 
 
 
 
 
 
 
 
75
  },
76
  "security": {
77
+ "foundations": { "required": ["CS1800", "CS2500", "CS2510", "CS2800"] },
78
+ "core": { "required": ["CS3000", "CS3650", "CY2550"], "pick_1_from": ["CS5700", "CS3500"] },
79
+ "concentration_specific": { "required": ["CY3740"], "pick_2_from": ["CY4740", "CY4760", "CY4770"], "pick_1_from": ["CS4700", "CS4730"] },
80
+ "math": { "required": ["MATH1342"], "pick_1_from": ["MATH3527", "MATH3081"] }
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
  }
83
 
84
  def __init__(self):
85
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
86
  self.model_name = "meta-llama/Llama-3.1-8B-Instruct"
87
  self.embedding_model_name = 'BAAI/bge-large-en-v1.5'
 
88
  self.llm = None
89
  self.tokenizer = None
90
  self.embedding_model = None
91
  self.curriculum_graph = None
92
  self.courses = {}
93
+ self.current_student = None
94
+
95
  def load_models(self):
 
96
  print("Loading embedding model...")
97
  self.embedding_model = SentenceTransformer(self.embedding_model_name, device=self.device)
98
 
99
  def load_llm(self):
 
100
  if self.device.type == 'cuda' and self.llm is None:
101
  print("Loading LLM for intelligent planning...")
102
  quant_config = BitsAndBytesConfig(
 
111
  quantization_config=quant_config,
112
  device_map="auto"
113
  )
114
+
115
  def load_data(self, graph: nx.DiGraph):
 
116
  self.curriculum_graph = graph
117
  self.courses = dict(graph.nodes(data=True))
118
+ UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}
 
119
  self.valid_courses = []
120
  course_texts = []
121
 
122
+ concentration_courses = set()
123
+ for track_reqs in self.CONCENTRATION_REQUIREMENTS.values():
124
+ for category, reqs in track_reqs.items():
125
+ if isinstance(reqs, dict):
126
+ for key, courses in reqs.items():
127
+ if isinstance(courses, list):
128
+ concentration_courses.update(courses)
129
+
130
  for cid, data in self.courses.items():
 
131
  name = data.get('name', '')
132
+ if not name or name.strip() == '' or any(skip in name.lower() for skip in ['lab', 'recitation', 'seminar', 'practicum']):
133
  continue
134
+
135
+ course_level = self._get_level(cid)
136
+ if course_level >= 5000 and cid not in UNDERGRAD_ACCESSIBLE_GRAD:
137
  continue
138
 
139
  self.valid_courses.append(cid)
140
  course_texts.append(f"{name} {data.get('description', '')}")
141
 
142
+ missing_required = concentration_courses - set(self.valid_courses)
143
+ if missing_required:
144
+ print(f"\n⚠️ WARNING: {len(missing_required)} required courses missing from graph: {sorted(missing_required)}\n")
145
+
146
  print(f"Computing embeddings for {len(self.valid_courses)} courses...")
147
+ self.course_embeddings = self.embedding_model.encode(course_texts, convert_to_tensor=True, show_progress_bar=True)
148
+ print(f"\nTotal valid courses: {len(self.valid_courses)}")
149
+
150
+ def _get_level(self, course_id: str) -> int:
151
+ match = re.search(r'\d+', course_id)
152
+ return int(match.group()) if match else 9999
153
+
154
+ def _get_completed_with_equivalents(self, completed: Set[str]) -> Set[str]:
155
+ expanded_completed = completed.copy()
156
+ for course in completed:
157
+ for group in self.EQUIVALENCY_GROUPS:
158
+ if course in group:
159
+ expanded_completed.update(group)
160
+ return expanded_completed
161
+
162
+ def _can_take_course(self, course_id: str, completed: Set[str]) -> bool:
163
+ effective_completed = self._get_completed_with_equivalents(completed)
164
+ if course_id not in self.curriculum_graph:
165
+ return True
166
+ prereqs = set(self.curriculum_graph.predecessors(course_id))
167
+ return prereqs.issubset(effective_completed)
168
+
169
  def _validate_sequence(self, selected: List[str], candidate: str) -> bool:
 
170
  for track_type, tracks in self.COURSE_TRACKS.items():
171
  for track_name, sequence in tracks.items():
172
  if candidate in sequence:
 
173
  for other_track, other_seq in tracks.items():
174
+ if other_track != track_name and any(c in selected for c in other_seq):
175
+ return False
 
176
  return True
177
+
178
+ def _score_course(self, course_id: str, semantic_scores: Dict[str, float], required_set: Set[str], picklist_set: Set[str]) -> float:
179
+ """FIXED: Proper scoring with IS heavy penalty"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ if course_id not in self.courses or not self.courses[course_id].get('name', '').strip():
182
+ return -10000.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ course_data = self.courses[course_id]
185
+ subject = course_data.get('subject', '')
 
 
 
186
 
187
+ score = 0.0
 
188
 
189
+ # Subject bonuses/penalties
190
+ if subject in ["CS", "DS", "CY"]:
191
+ score += 300.0
192
+ elif subject == "MATH":
193
+ score += 100.0
194
+ else:
195
+ score -= 1000.0 # Heavy penalty for everything else (including IS)
196
 
197
+ # Required courses: massive boost
198
+ if course_id in required_set:
199
+ score += 10000.0 # INCREASED from 1000
200
 
201
+ # Pick-list courses: high boost
202
+ if course_id in picklist_set:
203
+ score += 5000.0 # INCREASED from 500
204
 
205
+ # Unlocking factor (reduced weight)
206
+ if course_id in self.curriculum_graph:
207
+ unlocks = self.curriculum_graph.out_degree(course_id)
208
+ score += min(unlocks, 5) * 2.0 # REDUCED
209
 
210
+ # Level preference
211
+ level = self._get_level(course_id)
212
+ score -= (level / 100.0)
 
 
 
213
 
214
+ # Semantic alignment (reduced weight)
215
+ score += semantic_scores.get(course_id, 0.0) * 5.0 # REDUCED from 15
216
 
217
+ return score
218
+
219
  def generate_simple_plan(self, student: StudentProfile) -> Dict:
 
220
  print("--- Generating Enhanced Rule-Based Plan ---")
221
+ self.current_student = student
222
  return self.generate_enhanced_rule_plan(student)
 
 
 
223
 
224
+ def generate_enhanced_rule_plan(self, student: StudentProfile) -> Dict:
225
+ self.current_student = student
226
  track = self._identify_track(student)
 
 
227
  plan = self._build_structured_plan(student, track, None)
228
+ validation = self.validate_plan(plan, student)
229
 
 
 
230
  if validation["errors"]:
231
  plan = self._fix_plan_errors(plan, validation, student)
232
+ validation = self.validate_plan(plan, student)
233
 
 
234
  difficulty_level = self._map_difficulty(student.preferred_difficulty)
235
  courses_per_semester = self._calculate_course_load(student.time_commitment)
236
  explanation = f"Personalized {track} track ({difficulty_level} difficulty, {courses_per_semester} courses/semester)"
237
 
238
  return self._finalize_plan(plan, explanation, validation)
239
+
240
+ def generate_llm_plan(self, student: StudentProfile) -> Dict:
241
+ print("--- Generating AI-Optimized Plan ---")
242
+ self.current_student = student
243
+ self.load_llm()
244
+ if not self.llm:
245
+ return self.generate_enhanced_rule_plan(student)
246
+
247
+ track = self._identify_track(student)
248
+ llm_suggestions = self._get_llm_course_suggestions(student, track)
249
+ plan = self._build_structured_plan(student, track, llm_suggestions)
250
+ validation = self.validate_plan(plan, student)
251
+ if validation["errors"]:
252
+ plan = self._fix_plan_errors(plan, validation, student)
253
+ validation = self.validate_plan(plan, student)
254
+
255
+ explanation = self._generate_explanation(student, plan, track, "AI-optimized")
256
+ return self._finalize_plan(plan, explanation, validation)
257
+
258
+ def _build_structured_plan(self, student: StudentProfile, track: str, llm_suggestions: Optional[List[str]] = None) -> Dict:
259
+ """FIXED with hardcoded Year 2 priorities"""
260
 
261
  completed = set(student.completed_courses)
262
  plan = {}
263
  requirements = self.CONCENTRATION_REQUIREMENTS.get(track, self.CONCENTRATION_REQUIREMENTS["ai_ml"])
264
 
 
265
  courses_per_semester = self._calculate_course_load(student.time_commitment)
266
 
267
+ # Build required and pick sets
268
+ required_set = set()
269
+ picklist_set = set()
 
 
270
  for category, reqs in requirements.items():
271
  if "required" in reqs:
272
+ required_set.update(reqs["required"])
 
 
273
  for key, courses in reqs.items():
274
  if key.startswith("pick_"):
275
+ picklist_set.update(courses)
276
+
277
+ semantic_scores = self._compute_semantic_scores(student)
278
+
279
+ # HARDCODED FIX: Force Year 2 to prioritize core courses
280
+ YEAR2_MUST_TAKE = ["CS3000", "CS3500", "DS2500", "MATH2331", "MATH3081"]
281
+
 
 
 
 
 
282
  for sem_num in range(1, 9):
283
  year = ((sem_num - 1) // 2) + 1
 
284
 
285
+ available_courses = self._get_available_courses(completed, year, sem_num, track)
 
286
 
287
+ # Filter: must be takeable
288
+ schedulable = [
289
+ c for c in available_courses
290
+ if c not in completed and self._can_take_course(c, completed)
291
+ ]
 
 
292
 
293
+ # HARDCODED: In Year 2, force core courses to the top
294
+ if year == 2:
295
+ priority_courses = [c for c in YEAR2_MUST_TAKE if c in schedulable]
296
+ other_courses = [c for c in schedulable if c not in YEAR2_MUST_TAKE]
 
 
 
 
 
 
 
 
 
 
 
297
 
298
+ # Score priority courses separately
299
+ scored_priority = sorted(
300
+ priority_courses,
301
+ key=lambda c: self._score_course(c, semantic_scores, required_set, picklist_set),
302
+ reverse=True
303
+ )
304
+ scored_others = sorted(
305
+ other_courses,
306
+ key=lambda c: self._score_course(c, semantic_scores, required_set, picklist_set),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  reverse=True
308
  )
309
 
310
+ scored_courses = scored_priority + scored_others
311
+ else:
312
+ # Normal scoring for other years
313
+ scored_courses = sorted(
314
+ schedulable,
315
+ key=lambda c: self._score_course(c, semantic_scores, required_set, picklist_set),
316
+ reverse=True
317
+ )
318
+
319
+ # Select top N courses
320
+ selected = []
321
+ for course in scored_courses:
322
+ if len(selected) >= courses_per_semester:
323
+ break
324
+ if self._validate_sequence(selected, course):
325
+ selected.append(course)
326
 
327
  # Add to plan
328
  if selected:
 
330
  if year_key not in plan:
331
  plan[year_key] = {}
332
 
333
+ sem_type = 'fall' if (sem_num % 2) == 1 else 'spring'
334
+ plan[year_key][sem_type] = selected
335
  completed.update(selected)
336
 
337
  return plan
338
+
339
+ def _get_available_courses(self, completed: Set[str], year: int, sem_num: int = None, track: str = "ai_ml") -> List[str]:
340
+ """FIXED: Return ALL courses that COULD be taken in this year"""
341
+
342
+ # Year 1: Hardcoded foundation
343
+ if year == 1:
344
+ if not completed or len(completed) < 2:
345
+ return [c for c in ["CS1800", "CS2500", "MATH1341", "ENGW1111"] if c in self.valid_courses]
346
+ else:
347
+ next_courses = []
348
+ for course, prereq in [("CS2800", "CS1800"), ("CS2510", "CS2500"), ("MATH1342", "MATH1341"), ("DS2000", None)]:
349
+ if course in self.valid_courses and course not in completed:
350
+ if prereq is None or prereq in completed:
351
+ next_courses.append(course)
352
+ return next_courses
353
+
354
+ # Years 2-4: Filter by subject and level
355
+ available = []
356
 
357
+ # ONLY CS/DS/CY/MATH allowed
358
+ ALLOWED_SUBJECTS = {"CS", "DS", "CY", "MATH"}
359
+
360
+ for cid in self.valid_courses:
361
+ if cid in completed:
362
+ continue
363
 
364
+ course_data = self.courses.get(cid, {})
365
+ subject = course_data.get('subject')
 
 
 
 
 
 
366
 
367
+ if subject not in ALLOWED_SUBJECTS:
368
+ continue
369
+
370
+ course_level = self._get_level(cid)
371
+
372
+ # Year-based level filtering
373
+ if year == 2 and course_level > 3999:
374
+ continue # No 4000+ in Year 2
375
+ if year >= 3 and course_level < 2000:
376
+ continue # No intro courses in Years 3-4
377
+
378
+ available.append(cid)
379
 
380
+ return available
381
+
382
+ def _fix_plan_errors(self, plan: Dict, validation: Dict, student: StudentProfile) -> Dict:
383
+ if any("Mixed" in error for error in validation["errors"]):
384
+ return self._build_structured_plan(student, self._identify_track(student), None)
385
  return plan
 
 
 
 
 
386
 
387
+ def _get_llm_course_suggestions(self, student: StudentProfile, track: str) -> List[str]:
388
+ requirements = self.CONCENTRATION_REQUIREMENTS.get(track, {})
389
+ all_options = set()
390
+ for reqs in requirements.values():
391
  for key, courses in reqs.items():
392
+ if key.startswith("pick_"): all_options.update(courses)
393
+
394
+ course_options_text = [f"{cid}: {self.courses[cid].get('name', cid)} - {self.courses[cid].get('description', '')[:100].strip()}"
395
+ for cid in list(all_options)[:15] if cid in self.courses]
396
+
397
+ prompt = f"""You are an expert curriculum advisor. Based on the student profile, rank the top 5 most relevant courses from the list below.
398
+ ### Student Profile:
399
+ - **Career Goal:** {student.career_goals}
400
+ - **Interests:** {', '.join(student.interests)}
401
+ - **Preferred Difficulty:** {student.preferred_difficulty}
402
+ ### Available Elective Courses:
403
+ {chr(10).join(course_options_text)}
404
+ Return ONLY the top 5 course IDs, each on a new line.
405
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  try:
407
+ inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(self.device)
 
408
  with torch.no_grad():
409
+ outputs = self.llm.generate(**inputs, max_new_tokens=100, temperature=0.2, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
 
 
 
 
 
 
 
410
  response = self.tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
411
+ suggested_courses = re.findall(r'([A-Z]{2,4}\d{4})', response)
 
 
 
 
 
 
 
 
412
  return suggested_courses[:5]
 
413
  except Exception as e:
414
  print(f"LLM suggestion failed: {e}")
415
+ return list(all_options)[:5]
416
+
417
  def _map_difficulty(self, preferred_difficulty: str) -> str:
418
+ return {"easy": "easy", "moderate": "medium", "challenging": "hard"}.get(preferred_difficulty.lower(), "medium")
419
+
 
 
 
 
 
 
420
  def _calculate_course_load(self, time_commitment: int) -> int:
421
+ if time_commitment <= 20: return 3
422
+ if time_commitment <= 40: return 4 # Setting hours to 40 will now correctly return 4.
423
+ return 5
 
 
 
 
 
 
 
 
 
424
 
425
+ def _identify_track(self, student: StudentProfile) -> str:
426
+ if not hasattr(self, 'embedding_model') or self.embedding_model is None:
427
+ combined = f"{student.career_goals.lower()} {' '.join(student.interests).lower()}"
428
+ if any(word in combined for word in ['ai', 'ml', 'machine learning', 'data']): return "ai_ml"
429
+ if any(word in combined for word in ['systems', 'distributed', 'backend']): return "systems"
430
+ if any(word in combined for word in ['security', 'cyber']): return "security"
431
+ return "ai_ml"
432
  profile_text = f"{student.career_goals} {' '.join(student.interests)}"
433
  profile_emb = self.embedding_model.encode(profile_text, convert_to_tensor=True)
 
434
  track_descriptions = {
435
+ "ai_ml": "artificial intelligence machine learning deep learning neural networks data science",
436
+ "systems": "operating systems distributed systems networks compilers databases performance backend",
437
+ "security": "cybersecurity cryptography network security ethical hacking vulnerabilities"
438
  }
439
+ best_track, best_score = "ai_ml", -1.0
 
 
 
440
  for track, description in track_descriptions.items():
441
  track_emb = self.embedding_model.encode(description, convert_to_tensor=True)
442
  score = float(util.cos_sim(profile_emb, track_emb))
443
  if score > best_score:
444
+ best_score, best_track = score, track
 
 
445
  return best_track
 
 
 
446
 
447
+ def _compute_semantic_scores(self, student: StudentProfile) -> Dict[str, float]:
448
  query_text = f"{student.career_goals} {' '.join(student.interests)}"
449
  query_emb = self.embedding_model.encode(query_text, convert_to_tensor=True)
 
450
  similarities = util.cos_sim(query_emb, self.course_embeddings)[0]
451
+ return {cid: float(similarities[idx]) for idx, cid in enumerate(self.valid_courses)}
452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  def _generate_explanation(self, student: StudentProfile, plan: Dict, track: str, plan_type: str) -> str:
454
+ return f"{plan_type.title()} plan for the {track} track, tailored to your goal of becoming a {student.career_goals}."
455
+
456
+ def validate_plan(self, plan: Dict, student: StudentProfile = None) -> Dict[str, List[str]]:
457
+ issues = {"errors": [], "warnings": [], "info": []}
458
+ all_courses = [course for year in plan.values() for sem in year.values() for course in sem if isinstance(sem, list)]
459
 
460
+ for track_type, tracks in self.COURSE_TRACKS.items():
461
+ tracks_used = {name for name, courses in tracks.items() if any(c in all_courses for c in courses)}
462
+ if len(tracks_used) > 1:
463
+ issues["errors"].append(f"Mixed {track_type} tracks: {', '.join(tracks_used)}. Choose one sequence.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
+ completed_for_validation = set(student.completed_courses) if student else set()
466
+ for year in range(1, 5):
467
+ for sem in ["fall", "spring"]:
468
+ year_key = f"year_{year}"
469
+ sem_courses = plan.get(year_key, {}).get(sem, [])
470
+ for course in sem_courses:
471
+ if course in self.curriculum_graph:
472
+ prereqs = set(self.curriculum_graph.predecessors(course))
473
+ if not prereqs.issubset(self._get_completed_with_equivalents(completed_for_validation)):
474
+ missing = prereqs - completed_for_validation
475
+ issues["errors"].append(f"{course} in Year {year} {sem} is missing prereqs: {', '.join(missing)}")
476
+ completed_for_validation.update(sem_courses)
477
+ return issues
478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  def _finalize_plan(self, plan: Dict, explanation: str, validation: Dict = None) -> Dict:
480
+ structured_plan = {"reasoning": explanation, "validation": validation or {"errors": [], "warnings": [], "info": []}}
481
+ complexities = []
 
 
 
 
 
 
482
  for year in range(1, 5):
483
  year_key = f"year_{year}"
484
+ structured_plan[year_key] = {
485
+ "fall": plan.get(year_key, {}).get("fall", []),
486
+ "spring": plan.get(year_key, {}).get("spring", []),
 
 
 
487
  "summer": "co-op" if year in [2, 3] else []
488
  }
489
+ for sem in ["fall", "spring"]:
490
+ courses = structured_plan[year_key][sem]
491
+ if courses:
492
+ sem_complexity = sum(self.courses.get(c, {}).get('complexity', 50) for c in courses)
493
+ complexities.append(sem_complexity)
494
 
495
+ structured_plan["complexity_analysis"] = {
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  "average_semester_complexity": float(np.mean(complexities)) if complexities else 0,
497
  "peak_semester_complexity": float(np.max(complexities)) if complexities else 0,
498
  "total_complexity": float(np.sum(complexities)) if complexities else 0,
499
  "balance_score (std_dev)": float(np.std(complexities)) if complexities else 0
500
  }
501
+ structured_plan["metadata"] = {
 
 
502
  "generated": datetime.now().isoformat(),
503
  "valid": len(validation.get("errors", [])) == 0 if validation else True,
 
504
  }
505
+ return {"pathway": structured_plan}
 
506
 
 
507
  class CurriculumOptimizer(HybridOptimizer):
508
+ """Wrapper to maintain compatibility with older script calls."""
 
509
  def __init__(self):
510
  super().__init__()
511
 
512
  def generate_plan(self, student: StudentProfile) -> Dict:
 
513
  return self.generate_enhanced_rule_plan(student)
src/inspect_graph.py CHANGED
@@ -1,88 +1,265 @@
 
 
 
 
 
1
  import pickle
2
  import networkx as nx
3
- import argparse
 
4
 
5
- def inspect_graph(graph_path: str):
6
- """
7
- Loads a curriculum graph and runs diagnostic checks to verify its integrity.
8
- """
 
 
 
 
9
  try:
10
- with open(graph_path, 'rb') as f:
11
  graph = pickle.load(f)
12
- print(f"✅ Successfully loaded graph '{graph_path}'")
13
- print(f" - Total Courses (Nodes): {graph.number_of_nodes()}")
14
- print(f" - Prerequisite Links (Edges): {graph.number_of_edges()}")
15
- except FileNotFoundError:
16
- print(f"❌ ERROR: File not found at '{graph_path}'. Please check the path.")
17
- return
18
  except Exception as e:
19
- print(f"❌ ERROR: Could not load or parse the pickle file. Reason: {e}")
20
  return
21
-
22
- print("\n--- 🧐 DIAGNOSTIC CHECKS ---")
23
-
24
- # --- Check 1: Critical Prerequisite Links ---
25
- print("\n## 1. Verifying Critical Prerequisite Links...")
26
- critical_links = [
27
- ("CS1800", "CS2800"), # Discrete -> Logic & Comp
28
- ("CS2500", "CS2510"), # Fundies 1 -> Fundies 2
29
- ("CS2510", "CS3500"), # Fundies 2 -> OOD
30
- ("CS2510", "CS3000") # Fundies 2 -> Algorithms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ]
32
- all_links_ok = True
33
- for prereq, course in critical_links:
34
- if graph.has_node(prereq) and graph.has_node(course):
 
35
  if graph.has_edge(prereq, course):
36
- print(f" [PASS] Prerequisite link exists: {prereq} -> {course}")
37
  else:
38
- print(f" [FAIL] CRITICAL LINK MISSING: The graph has no link from {prereq} to {course}.")
39
- all_links_ok = False
40
  else:
41
- print(f" [WARN] One or both courses in link {prereq} -> {course} are not in the graph.")
42
- all_links_ok = False
 
 
43
 
44
- if all_links_ok:
45
- print(" -> All critical prerequisite links seem to be intact.")
46
-
47
- # --- Check 2: Foundational Courses ---
48
- print("\n## 2. Analyzing Foundational Courses (courses with no prerequisites)...")
49
- foundations = [n for n, d in graph.in_degree() if d == 0]
50
- if foundations:
51
- print(f" Found {len(foundations)} foundational courses.")
52
- cs_foundations = [c for c in foundations if c.startswith("CS")]
53
- if cs_foundations:
54
- print(f" -> Foundational CS courses: {', '.join(cs_foundations[:5])}...")
55
- else:
56
- print(" [WARN] No foundational courses with a 'CS' prefix were found. This is unusual.")
 
 
 
 
 
 
 
 
 
 
 
57
  else:
58
- print(" [FAIL] No foundational courses found. The graph may have a cycle or is structured incorrectly.")
59
-
60
- # --- Check 3: Key Course Inspection ---
61
- print("\n## 3. Inspecting Key Courses...")
62
- courses_to_inspect = ["CS2500", "CS2510", "CS3500"]
63
- for course_id in courses_to_inspect:
64
- if graph.has_node(course_id):
65
- prereqs = list(graph.predecessors(course_id))
66
- unlocks = list(graph.successors(course_id))
67
- print(f"\n - Course: {course_id} ({graph.nodes[course_id].get('name', 'N/A')})")
68
- print(f" - Prerequisites (What it needs): {prereqs or 'None'}")
69
- print(f" - Unlocks (What it leads to): {unlocks or 'None'}")
70
- else:
71
- print(f"\n - Course: {course_id} -> [NOT FOUND IN GRAPH]")
72
-
73
- print("\n--- ախ DIAGNOSIS ---")
74
- if not all_links_ok:
75
- print("Your graph is missing critical prerequisite information.")
76
- print("The planner cannot create a logical schedule without these links.")
77
- print("This issue likely originates in `neu_scraper.py` or how it parses prerequisite data from the API.")
78
  else:
79
- print("The graph structure for critical courses appears to be correct.")
80
- print("If plans are still illogical, the issue may lie in the complexity/depth attributes or the planner's sorting logic.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  if __name__ == "__main__":
83
- parser = argparse.ArgumentParser(description="Curriculum Graph Diagnostic Tool")
84
- # CORRECTED: Use a variable name for the argument
85
- parser.add_argument("graph_path", help="Path to the .pkl graph file to inspect.")
86
- args = parser.parse_args()
87
- # CORRECTED: Use the correct variable to access the argument
88
- inspect_graph(args.graph_path)
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Comprehensive Graph Data Inspector
4
+ Diagnoses all potential issues with the curriculum graph data
5
+ """
6
  import pickle
7
  import networkx as nx
8
+ from collections import defaultdict
9
+ import sys
10
 
11
def inspect_graph_thoroughly(graph_file):
    """Complete inspection of curriculum graph data.

    Loads a pickled networkx.DiGraph from *graph_file* and prints a
    multi-section diagnostic report to stdout: subject distribution,
    presence of critical courses, prerequisite-chain integrity, a
    CS2800-specific deep dive, duplicate/lab detection, 4000-level
    coverage, and a final verdict with recommended actions.

    Args:
        graph_file: Path to a pickle file containing a networkx DiGraph
            whose nodes are course IDs (e.g. "CS2500") with attributes
            such as 'subject', 'name', 'maxCredits'.

    Returns:
        None. All findings are reported via print(); the function
        returns early if the pickle cannot be loaded.
    """

    print("=" * 70)
    print("COMPREHENSIVE CURRICULUM GRAPH INSPECTION")
    print("=" * 70)

    # Load the graph.
    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code -- only inspect graphs generated by our own scraper.
    try:
        with open(graph_file, 'rb') as f:
            graph = pickle.load(f)
    except Exception as e:
        print(f"❌ ERROR: Could not load graph: {e}")
        return

    print(f"\n📊 BASIC STATS:")
    print(f"   Total nodes: {graph.number_of_nodes()}")
    print(f"   Total edges: {graph.number_of_edges()}")

    # 1. CHECK SUBJECT DISTRIBUTION
    print("\n📚 SUBJECT ANALYSIS:")
    subject_counts = defaultdict(int)       # subject code -> number of courses
    courses_by_subject = defaultdict(list)  # subject code -> course IDs (for samples)

    for node, data in graph.nodes(data=True):
        subject = data.get('subject', 'UNKNOWN')
        subject_counts[subject] += 1
        courses_by_subject[subject].append(node)

    # Categorize subjects: CS_RELEVANT must all be present; MAYBE_RELEVANT is
    # tolerated; anything else is flagged as irrelevant and should be removed.
    CS_RELEVANT = {"CS", "DS", "IS", "CY", "MATH", "PHYS", "ENGW", "STAT", "EECE"}
    MAYBE_RELEVANT = {"CHEM", "BIOL", "PSYC", "PHIL", "ECON"}

    print("\n  Relevant CS Subjects:")
    for subj in sorted(CS_RELEVANT):
        count = subject_counts.get(subj, 0)
        if count > 0:
            sample = courses_by_subject[subj][:3]
            print(f"    ✅ {subj:8s}: {count:3d} courses (e.g., {', '.join(sample)})")
        else:
            print(f"    ❌ {subj:8s}: 0 courses - MISSING!")

    print("\n  Irrelevant Subjects (should be removed):")
    irrelevant_found = False
    for subj, count in sorted(subject_counts.items()):
        if subj not in CS_RELEVANT and subj not in MAYBE_RELEVANT and count > 0:
            irrelevant_found = True
            sample = courses_by_subject[subj][:3]
            print(f"    ❌ {subj:8s}: {count:3d} courses (e.g., {', '.join(sample)})")

    if not irrelevant_found:
        print("    ✅ None found - graph is clean!")

    # 2. CHECK CRITICAL COURSES EXISTENCE
    print("\n🎯 CRITICAL COURSES CHECK:")

    # Foundation courses (reported only; absence does not affect the verdict)
    foundation_courses = ["CS1800", "CS2500", "CS2510", "CS2800"]
    print("\n  Foundation Courses:")
    for course in foundation_courses:
        if course in graph:
            data = graph.nodes[course]
            print(f"    ✅ {course}: {data.get('name', 'Unknown')}")
        else:
            print(f"    ❌ {course}: MISSING!")

    # Core CS courses (reported only)
    core_courses = ["CS3000", "CS3500", "CS3650", "CS3700", "CS3200"]
    print("\n  Core CS Courses:")
    for course in core_courses:
        if course in graph:
            data = graph.nodes[course]
            print(f"    ✅ {course}: {data.get('name', 'Unknown')}")
        else:
            print(f"    ❌ {course}: MISSING!")

    # AI/ML concentration courses -- missing ones are collected and fed into
    # the final verdict below.
    ai_ml_courses = ["CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"]
    print("\n  AI/ML Concentration:")
    missing_concentration = []
    for course in ai_ml_courses:
        if course in graph:
            data = graph.nodes[course]
            print(f"    ✅ {course}: {data.get('name', 'Unknown')}")
        else:
            missing_concentration.append(course)
            print(f"    ❌ {course}: MISSING!")

    # 3. CHECK PREREQUISITE CHAINS
    print("\n🔗 PREREQUISITE CHAINS:")

    # (prerequisite, dependent course, human-readable description)
    critical_chains = [
        ("CS1800", "CS2800", "Discrete Structures → Logic"),
        ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
        ("CS2510", "CS3500", "Fundies 2 → OOD"),
        ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
        ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
        ("DS2500", "DS3500", "Intermediate → Advanced")
    ]

    broken_chains = []
    for prereq, course, desc in critical_chains:
        if prereq in graph and course in graph:
            if graph.has_edge(prereq, course):
                print(f"  ✅ {prereq} → {course} ({desc})")
            else:
                broken_chains.append((prereq, course))
                print(f"  ❌ {prereq} → {course} ({desc}) - EDGE MISSING!")
        else:
            # Missing endpoint nodes are warned about but deliberately NOT
            # counted in broken_chains (no edge can exist without the node).
            if prereq not in graph:
                print(f"  ⚠️ {prereq} → {course} - {prereq} doesn't exist")
            if course not in graph:
                print(f"  ⚠️ {prereq} → {course} - {course} doesn't exist")

    # 4. CS2800 SPECIFIC DIAGNOSIS
    print("\n🔍 CS2800 DETAILED ANALYSIS:")

    if "CS2800" in graph:
        cs2800_data = graph.nodes["CS2800"]
        print(f"  ✅ CS2800 exists")
        print(f"     Name: {cs2800_data.get('name', 'Unknown')}")
        print(f"     Subject: {cs2800_data.get('subject', 'Unknown')}")
        print(f"     Credits: {cs2800_data.get('maxCredits', 'Unknown')}")

        # Check prerequisites -- edges point prerequisite -> course, so
        # predecessors of CS2800 are its prereqs.
        prereqs = list(graph.predecessors("CS2800"))
        print(f"     Prerequisites: {prereqs if prereqs else 'NONE (this is wrong!)'}")

        # What it unlocks (only the first five successors are shown)
        unlocks = list(graph.successors("CS2800"))[:5]
        print(f"     Unlocks: {unlocks if unlocks else 'Nothing (suspicious...)'}")

        # Specific CS1800 connection
        if "CS1800" in graph:
            if graph.has_edge("CS1800", "CS2800"):
                print(f"  ✅ CS1800 → CS2800 connection exists")
            else:
                print(f"  ❌ CS1800 → CS2800 connection MISSING!")
    else:
        print(f"  ❌ CS2800 is completely MISSING from the graph!")

    # 5. CHECK FOR DUPLICATE/REDUNDANT COURSES
    print("\n🔄 CHECKING FOR REDUNDANT COURSES:")

    # Variant course numbers that satisfy the same degree requirement.
    calc_variants = ["MATH1341", "MATH1241", "MATH1231", "MATH1340"]
    physics_variants = ["PHYS1151", "PHYS1161", "PHYS1145"]

    print("\n  Calculus variants in graph:")
    calc_found = [c for c in calc_variants if c in graph]
    if len(calc_found) > 1:
        print(f"    ⚠️ Multiple calculus courses found: {calc_found}")
        print(f"    These satisfy the same requirement - graph needs deduplication")
    else:
        # NOTE(review): this branch also fires when zero variants are present.
        print(f"    ✅ Only one variant: {calc_found}")

    print("\n  Physics variants in graph:")
    phys_found = [c for c in physics_variants if c in graph]
    if len(phys_found) > 1:
        print(f"    ⚠️ Multiple physics courses found: {phys_found}")
    else:
        print(f"    ✅ Only one variant: {phys_found}")

    # 6. CHECK FOR LABS/RECITATIONS
    print("\n🧪 CHECKING FOR LABS/RECITATIONS (should be removed):")

    # Substring match on the lowercased course name; may over-match courses
    # whose names merely contain these words (e.g. "Collaborative Lab Work").
    labs_found = []
    for node, data in graph.nodes(data=True):
        name = data.get('name', '').lower()
        if any(word in name for word in ['lab', 'recitation', 'seminar', 'practicum']):
            labs_found.append((node, data.get('name', node)))

    if labs_found:
        print(f"  ❌ Found {len(labs_found)} lab/recitation courses:")
        for course_id, name in labs_found[:5]:
            print(f"    - {course_id}: {name}")
    else:
        print(f"  ✅ No labs/recitations found")

    # 7. CHECK 4000-LEVEL COURSES
    print("\n🎓 4000-LEVEL COURSES:")

    cs4000_courses = [n for n in graph.nodes() if n.startswith("CS4")]
    ds4000_courses = [n for n in graph.nodes() if n.startswith("DS4")]

    print(f"  CS 4000-level: {len(cs4000_courses)} courses")
    if cs4000_courses:
        print(f"    Examples: {', '.join(cs4000_courses[:5])}")
    else:
        print(f"    ❌ NO CS 4000-level courses found!")

    print(f"  DS 4000-level: {len(ds4000_courses)} courses")
    if ds4000_courses:
        print(f"    Examples: {', '.join(ds4000_courses[:5])}")
    else:
        print(f"    ❌ NO DS 4000-level courses found!")

    # FINAL VERDICT -- aggregates the findings collected above.
    print("\n" + "=" * 70)
    print("VERDICT:")
    print("=" * 70)

    issues = []

    if irrelevant_found:
        issues.append("Contains irrelevant subjects (ARTH, FRNH, etc.)")

    if missing_concentration:
        issues.append(f"Missing critical courses: {', '.join(missing_concentration)}")

    if broken_chains:
        issues.append(f"Broken prerequisite chains: {len(broken_chains)}")

    if not cs4000_courses or not ds4000_courses:
        issues.append("Missing 4000-level courses")

    if labs_found:
        issues.append(f"Contains {len(labs_found)} lab/recitation courses")

    if issues:
        print("❌ GRAPH HAS ISSUES:")
        for i, issue in enumerate(issues, 1):
            print(f"  {i}. {issue}")

        print("\n📋 RECOMMENDED ACTIONS:")
        print("1. Re-scrape with more subjects: CS DS IS CY MATH PHYS STAT EECE")
        print("2. Re-run analyzer with stricter filtering")
        print("3. Manually add missing prerequisite edges if needed")
    else:
        print("✅ Graph appears to be clean and complete!")
+
242
def suggest_fix_commands(graph_file):
    """Print the shell commands to repair a broken curriculum graph.

    Args:
        graph_file: Path to the inspected graph pickle. Currently unused;
            the printed commands reference fixed file names instead.

    Returns:
        None. Output goes to stdout.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("FIX COMMANDS:")
    print(banner)

    # (header, command) pairs printed in order.
    steps = [
        ("\n1️⃣ If courses are missing, re-scrape with expanded subjects:",
         "   python neu_scraper.py --term 202510 --subjects CS DS IS CY MATH PHYS STAT EECE --prefix neu_complete"),
        ("\n2️⃣ Clean the new data:",
         "   python curriculum_analyzer.py --graph neu_complete_graph_*.pkl --courses neu_complete_courses_*.pkl --output-graph neu_graph_ultra_clean.pkl"),
        ("\n3️⃣ Test the cleaned data:",
         f"   python {sys.argv[0]} neu_graph_ultra_clean.pkl"),
    ]
    for header, command in steps:
        print(header)
        print(command)
 
258
if __name__ == "__main__":
    # CLI: expects the path to a pickled curriculum graph as the only argument.
    if len(sys.argv) < 2:
        print("Usage: python inspect_graph.py <graph.pkl>")
        print("Example: python inspect_graph.py neu_graph_clean3.pkl")
    else:
        graph_file = sys.argv[1]
        # Run the full inspection, then print the repair recipe.
        inspect_graph_thoroughly(graph_file)
        suggest_fix_commands(graph_file)
+ suggest_fix_commands(graph_file)
src/neu_graph_clean8.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a587cdbcc482e13aff07b62e79a4d1c8732c1ab1cb41f1d699ed6f50148f4db4
3
+ size 244756
src/neu_scraper.py CHANGED
@@ -1,235 +1,236 @@
1
- """
2
- NEU Course Catalog Scraper using SearchNEU GraphQL API (With Proper Pagination)
3
-
4
- Fetches ALL courses for given subjects using first/offset pagination.
5
-
6
- Usage:
7
- python neu_scraper.py --term 202510 --subjects CS DS IS CY --prefix neu_api
8
- """
9
- import requests
10
- import pickle
11
- import networkx as nx
12
- import time
13
- import logging
14
- from typing import List, Dict, Set, Any
15
- from datetime import datetime
16
-
17
- # Configure logging
18
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
19
- logger = logging.getLogger(__name__)
20
-
21
- class NEUGraphQLScraper:
22
- def __init__(self, term_id: str, api_url: str = "https://searchneu.com/graphql"):
23
- self.term_id = term_id
24
- self.api_url = api_url
25
- self.headers = {"Content-Type": "application/json"}
26
- self.courses_data_cache: Dict[str, Dict] = {}
27
- self.all_course_ids: Set[str] = set()
28
- self.graph = nx.DiGraph()
29
-
30
- def get_all_courses_by_subject(self, subject: str, batch_size: int = 100) -> List[Dict]:
31
- """Fetch ALL courses for a specific subject via GraphQL with pagination."""
32
- all_courses = []
33
- offset = 0
34
- page = 1
35
-
36
- while True:
37
- query = """
38
- query searchQuery($termId: String!, $query: String!, $first: Int, $offset: Int) {
39
- search(termId: $termId, query: $query, first: $first, offset: $offset) {
40
- totalCount
41
- nodes {
42
- __typename
43
- ... on ClassOccurrence {
44
- subject
45
- classId
46
- name
47
- desc
48
- prereqs
49
- coreqs
50
- minCredits
51
- maxCredits
52
- }
53
- }
54
- }
55
- }
56
- """
57
- variables = {
58
- "termId": self.term_id,
59
- "query": subject,
60
- "first": batch_size,
61
- "offset": offset
62
- }
63
-
64
- try:
65
- resp = requests.post(self.api_url, json={"query": query, "variables": variables}, headers=self.headers)
66
- resp.raise_for_status()
67
- data = resp.json()
68
-
69
- if "errors" in data:
70
- logger.error(f"GraphQL errors for subject {subject}: {data['errors']}")
71
- break
72
-
73
- search_data = data.get("data", {}).get("search", {})
74
- nodes = search_data.get("nodes", [])
75
-
76
- # Extract ClassOccurrence nodes
77
- page_courses = [c for c in nodes if c.get("__typename") == "ClassOccurrence"]
78
- all_courses.extend(page_courses)
79
-
80
- logger.info(f"Page {page}: Found {len(page_courses)} courses, Total so far: {len(all_courses)}")
81
-
82
- # Check if we've reached the end
83
- if len(page_courses) < batch_size:
84
- break
85
-
86
- offset += batch_size
87
- page += 1
88
-
89
- # Add a small delay to avoid overwhelming the API
90
- time.sleep(0.1)
91
-
92
- except Exception as e:
93
- logger.error(f"Error fetching page {page} for subject {subject}: {e}")
94
- break
95
-
96
- logger.info(f"Total courses found for {subject}: {len(all_courses)}")
97
- return all_courses
98
-
99
- def get_course_data_by_id(self, subject: str, classId: str) -> Dict:
100
- """Fetch a specific course by its subject and classId."""
101
- query = """
102
- query searchQuery($termId: String!, $query: String!) {
103
- search(termId: $termId, query: $query) {
104
- nodes {
105
- __typename
106
- ... on ClassOccurrence {
107
- subject
108
- classId
109
- name
110
- desc
111
- prereqs
112
- coreqs
113
- minCredits
114
- maxCredits
115
- }
116
- }
117
- }
118
- }
119
- """
120
- variables = {"termId": self.term_id, "query": f"{subject}{classId}"}
121
- try:
122
- resp = requests.post(self.api_url, json={"query": query, "variables": variables}, headers=self.headers)
123
- resp.raise_for_status()
124
- data = resp.json()
125
-
126
- nodes = data.get("data", {}).get("search", {}).get("nodes", [])
127
- for c in nodes:
128
- if c.get("subject") == subject and c.get("classId") == classId:
129
- return c
130
- return {}
131
- except Exception as e:
132
- logger.error(f"Error fetching course {subject}{classId}: {e}")
133
- return {}
134
-
135
- def _recursive_parse_prereqs(self, prereq_obj: Any) -> Set[str]:
136
- """Extract course IDs from nested prereq/coreq structures."""
137
- ids = set()
138
- if not isinstance(prereq_obj, dict):
139
- return ids
140
-
141
- # Handle direct course references (the actual structure we see)
142
- if "classId" in prereq_obj and "subject" in prereq_obj:
143
- ids.add(f"{prereq_obj['subject']}{prereq_obj['classId']}")
144
- return ids
145
-
146
- # Handle logical operators (and/or) with nested values
147
- if prereq_obj.get("type") in ["and", "or"]:
148
- for val in prereq_obj.get("values", []):
149
- ids |= self._recursive_parse_prereqs(val)
150
-
151
- # Handle nested values in other structures
152
- elif "values" in prereq_obj:
153
- for val in prereq_obj.get("values", []):
154
- ids |= self._recursive_parse_prereqs(val)
155
-
156
- return ids
157
-
158
- def scrape_full_catalog(self, subjects: List[str]):
159
- """Scrape all courses for the given subjects."""
160
- logger.info(f"Fetching complete catalog for subjects: {subjects}")
161
-
162
- all_courses = []
163
- for subject in subjects:
164
- logger.info(f"Fetching courses for subject: {subject}")
165
- courses = self.get_all_courses_by_subject(subject)
166
- all_courses.extend(courses)
167
-
168
- # Add a small delay to be respectful to the API
169
- time.sleep(0.5)
170
-
171
- # Cache all courses
172
- for c in all_courses:
173
- cid = f"{c['subject']}{c['classId']}"
174
- self.courses_data_cache[cid] = c
175
- self.all_course_ids.add(cid)
176
-
177
- logger.info(f"Discovered {len(all_courses)} total courses in catalog")
178
-
179
- def build_graph(self):
180
- """Build NetworkX graph from scraped course data and requisites."""
181
- logger.info("Building course graph")
182
-
183
- # Add all courses as nodes
184
- for cid, cdata in self.courses_data_cache.items():
185
- self.graph.add_node(cid, **{
186
- "name": cdata.get("name", ""),
187
- "subject": cdata.get("subject", ""),
188
- "classId": cdata.get("classId", ""),
189
- "description": cdata.get("desc", ""), # Corrected from 'desc'
190
- "minCredits": cdata.get("minCredits", 0),
191
- "maxCredits": cdata.get("maxCredits", 0)
192
- })
193
-
194
- # Add edges ONLY for prerequisites
195
- for cid, cdata in self.courses_data_cache.items():
196
- prereqs = cdata.get("prereqs", {})
197
- if prereqs:
198
- prereq_ids = self._recursive_parse_prereqs(prereqs)
199
- for pid in prereq_ids:
200
- if pid in self.graph:
201
- self.graph.add_edge(pid, cid, relationship="prerequisite")
202
-
203
- def save_data(self, prefix: str):
204
- """Save graph and courses to pickle files with timestamp."""
205
- ts = datetime.now().strftime("%Y%m%d_%H%M%S")
206
- gfile = f"{prefix}_graph_{ts}.pkl"
207
- cfile = f"{prefix}_courses_{ts}.pkl"
208
-
209
- with open(gfile, "wb") as gf:
210
- pickle.dump(self.graph, gf)
211
- with open(cfile, "wb") as cf:
212
- pickle.dump(self.courses_data_cache, cf)
213
-
214
- logger.info(f"Data saved: {gfile}, {cfile}")
215
-
216
- # Also save some stats
217
- logger.info(f"Graph stats: {self.graph.number_of_nodes()} nodes, {self.graph.number_of_edges()} edges")
218
-
219
- def main():
220
- import argparse
221
- parser = argparse.ArgumentParser(description="Full NEU API Catalog Scraper")
222
- parser.add_argument("--term", required=True, help="Term ID e.g. 202510")
223
- parser.add_argument("--subjects", nargs="+", required=True, help="Subjects to scrape (e.g., CS DS IS CY)")
224
- parser.add_argument("--prefix", default="neu_api", help="Output prefix")
225
- parser.add_argument("--batch-size", type=int, default=100, help="Number of courses per page")
226
- args = parser.parse_args()
227
-
228
- scraper = NEUGraphQLScraper(term_id=args.term)
229
- scraper.scrape_full_catalog(args.subjects)
230
- scraper.build_graph()
231
- scraper.save_data(args.prefix)
232
- logger.info("Scraping complete.")
233
-
234
- if __name__ == "__main__":
 
235
  main()
 
1
+ """
2
+ Multi-Term NEU Course Scraper - Merges data from multiple terms
3
+ Fixes: Missing courses by scraping Fall/Spring/Summer catalogs
4
+ """
5
+ import requests
6
+ import pickle
7
+ import networkx as nx
8
+ import time
9
+ import logging
10
+ from typing import List, Dict, Set, Any
11
+ from datetime import datetime
12
+ from collections import defaultdict
13
+
14
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
15
+ logger = logging.getLogger(__name__)
16
+
17
class MultiTermScraper:
    """Scrapes the SearchNEU GraphQL API for several term IDs and merges the
    results into one deduplicated course catalog plus a prerequisite DiGraph.

    Courses are keyed by course ID (subject + classId, e.g. "CS2500"); a
    course listed in multiple terms appears once in the merged output (see
    scrape_all_terms for the exact merge policy).
    """

    def __init__(self, term_ids: List[str], api_url: str = "https://searchneu.com/graphql"):
        # term_ids: term codes such as "202510"; scraped in the given order.
        self.term_ids = term_ids
        self.api_url = api_url
        self.headers = {"Content-Type": "application/json"}
        self.merged_courses: Dict[str, Dict] = {}  # cid -> course data
        self.graph = nx.DiGraph()  # edge direction: prerequisite -> course

    def get_all_courses_by_subject(self, term_id: str, subject: str, batch_size: int = 100) -> List[Dict]:
        """Fetch ALL courses for a specific subject/term via pagination.

        Issues repeated GraphQL `search` queries using first/offset paging
        until a short page (fewer than *batch_size* results) signals the end.

        Args:
            term_id: Term code, e.g. "202510".
            subject: Subject prefix used as the search query, e.g. "CS".
            batch_size: Page size for the first/offset pagination.

        Returns:
            List of raw ClassOccurrence dicts (may be empty on error; any
            request or GraphQL error aborts pagination and returns what was
            collected so far).
        """
        all_courses = []
        offset = 0
        page = 1

        while True:
            query = """
            query searchQuery($termId: String!, $query: String!, $first: Int, $offset: Int) {
              search(termId: $termId, query: $query, first: $first, offset: $offset) {
                totalCount
                nodes {
                  __typename
                  ... on ClassOccurrence {
                    subject
                    classId
                    name
                    desc
                    prereqs
                    coreqs
                    minCredits
                    maxCredits
                  }
                }
              }
            }
            """
            variables = {
                "termId": term_id,
                "query": subject,
                "first": batch_size,
                "offset": offset
            }

            try:
                # timeout keeps a hung server from blocking the whole scrape.
                resp = requests.post(self.api_url, json={"query": query, "variables": variables}, headers=self.headers, timeout=10)
                resp.raise_for_status()
                data = resp.json()

                if "errors" in data:
                    logger.error(f"GraphQL errors for {term_id}/{subject}: {data['errors']}")
                    break

                search_data = data.get("data", {}).get("search", {})
                nodes = search_data.get("nodes", [])
                # The search can also return other typenames; keep only courses.
                page_courses = [c for c in nodes if c.get("__typename") == "ClassOccurrence"]
                all_courses.extend(page_courses)

                logger.info(f"[{term_id}] {subject} Page {page}: {len(page_courses)} courses (Total: {len(all_courses)})")

                # A short page means we've consumed the final page.
                if len(page_courses) < batch_size:
                    break

                offset += batch_size
                page += 1
                time.sleep(0.1)  # be polite to the API between pages

            except Exception as e:
                logger.error(f"Error fetching {term_id}/{subject} page {page}: {e}")
                break

        logger.info(f"[{term_id}] {subject}: {len(all_courses)} total courses")
        return all_courses

    def _recursive_parse_prereqs(self, prereq_obj: Any) -> Set[str]:
        """Extract course IDs from nested prereq structures.

        Walks the SearchNEU requisite tree: leaf dicts carry subject/classId,
        "and"/"or" nodes (and any other dict with a "values" list) nest
        further requisites. Non-dict inputs yield an empty set.

        Returns:
            Set of course IDs like "CS2500" found anywhere in the tree.
        """
        ids = set()
        if not isinstance(prereq_obj, dict):
            return ids

        # Leaf node: a direct course reference.
        if "classId" in prereq_obj and "subject" in prereq_obj:
            ids.add(f"{prereq_obj['subject']}{prereq_obj['classId']}")
            return ids

        # Logical operator node ("and"/"or"): recurse into all children.
        # NOTE: and/or semantics are flattened -- every mentioned course
        # becomes an edge, even alternatives in an "or" group.
        if prereq_obj.get("type") in ["and", "or"]:
            for val in prereq_obj.get("values", []):
                ids |= self._recursive_parse_prereqs(val)

        # Fallback: any other dict carrying a "values" list.
        elif "values" in prereq_obj:
            for val in prereq_obj.get("values", []):
                ids |= self._recursive_parse_prereqs(val)

        return ids

    def scrape_all_terms(self, subjects: List[str]):
        """Scrape courses from all terms and merge by course ID.

        Populates self.merged_courses. Merge policy: the FIRST term (in
        self.term_ids order) that lists a course wins; a later term's entry
        only replaces it when the kept entry has an empty description and
        the later one does not.
        """
        term_data = defaultdict(lambda: defaultdict(list))  # term_id -> subject -> courses

        for term_id in self.term_ids:
            logger.info(f"\n{'='*70}")
            logger.info(f"SCRAPING TERM: {term_id}")
            logger.info(f"{'='*70}")

            for subject in subjects:
                courses = self.get_all_courses_by_subject(term_id, subject)
                term_data[term_id][subject] = courses
                time.sleep(0.5)  # throttle between subjects

        # Merge courses across terms (first occurrence wins; see docstring).
        for term_id in self.term_ids:
            for subject in subjects:
                for course in term_data[term_id][subject]:
                    cid = f"{course['subject']}{course['classId']}"

                    # First sighting of this course ID: take it as-is.
                    if cid not in self.merged_courses:
                        self.merged_courses[cid] = course
                        logger.debug(f"Added {cid} from {term_id}")
                    else:
                        # Replace only when the later entry fills in a
                        # missing description.
                        existing = self.merged_courses[cid]
                        if not existing.get('desc') and course.get('desc'):
                            self.merged_courses[cid] = course
                            logger.debug(f"Updated {cid} from {term_id} (better description)")

        logger.info(f"\n{'='*70}")
        logger.info(f"MERGE COMPLETE: {len(self.merged_courses)} unique courses")
        logger.info(f"{'='*70}")

        # Log subject breakdown
        subject_counts = defaultdict(int)
        for cid in self.merged_courses:
            subject = self.merged_courses[cid].get('subject', 'UNKNOWN')
            subject_counts[subject] += 1

        logger.info("\nSubject breakdown:")
        for subject in sorted(subject_counts.keys()):
            logger.info(f"  {subject}: {subject_counts[subject]} courses")

    def build_graph(self):
        """Build NetworkX graph from merged course data.

        Adds every merged course as a node (with name/subject/credits
        attributes) and one edge per resolvable prerequisite, directed
        prerequisite -> dependent course. Prereqs referencing courses that
        were never scraped are logged and skipped.
        """
        logger.info("\nBuilding course dependency graph...")

        # Add all courses as nodes
        for cid, cdata in self.merged_courses.items():
            self.graph.add_node(cid, **{
                "name": cdata.get("name", ""),
                "subject": cdata.get("subject", ""),
                "classId": cdata.get("classId", ""),
                "description": cdata.get("desc", ""),
                "minCredits": cdata.get("minCredits", 0),
                "maxCredits": cdata.get("maxCredits", 0)
            })

        # Add prerequisite edges
        edge_count = 0
        for cid, cdata in self.merged_courses.items():
            prereqs = cdata.get("prereqs", {})
            if prereqs:
                prereq_ids = self._recursive_parse_prereqs(prereqs)
                for pid in prereq_ids:
                    if pid in self.graph:
                        self.graph.add_edge(pid, cid, relationship="prerequisite")
                        edge_count += 1
                    else:
                        # Prereq points at a course outside the scraped subjects.
                        logger.warning(f"Prerequisite {pid} for {cid} not in graph")

        logger.info(f"Graph built: {self.graph.number_of_nodes()} nodes, {edge_count} edges")

    def save_data(self, prefix: str):
        """Save merged graph and courses.

        Writes three timestamped files next to the working directory:
        <prefix>_graph_<ts>.pkl (pickled DiGraph),
        <prefix>_courses_<ts>.pkl (pickled course dict), and
        <prefix>_merge_report_<ts>.txt (human-readable summary).
        """
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        gfile = f"{prefix}_graph_{ts}.pkl"
        cfile = f"{prefix}_courses_{ts}.pkl"

        with open(gfile, "wb") as gf:
            pickle.dump(self.graph, gf)
        with open(cfile, "wb") as cf:
            pickle.dump(self.merged_courses, cf)

        logger.info(f"\nData saved:")
        logger.info(f"  Graph: {gfile}")
        logger.info(f"  Courses: {cfile}")

        # Save merge report
        report_file = f"{prefix}_merge_report_{ts}.txt"
        with open(report_file, "w") as rf:
            rf.write(f"Multi-Term Scrape Report\n")
            rf.write(f"{'='*70}\n\n")
            rf.write(f"Terms scraped: {', '.join(self.term_ids)}\n")
            rf.write(f"Total unique courses: {len(self.merged_courses)}\n")
            rf.write(f"Total edges: {self.graph.number_of_edges()}\n\n")

            rf.write("Subject breakdown:\n")
            subject_counts = defaultdict(int)
            for cid in self.merged_courses:
                subject = self.merged_courses[cid].get('subject', 'UNKNOWN')
                subject_counts[subject] += 1

            for subject in sorted(subject_counts.keys()):
                rf.write(f"  {subject}: {subject_counts[subject]}\n")

        logger.info(f"  Report: {report_file}")
218
+
219
def main():
    """CLI entry point: scrape every requested term, merge the catalogs,
    build the prerequisite graph, and persist the results.

    NOTE(review): --batch-size is parsed but never forwarded to the
    scraper; get_all_courses_by_subject always uses its own default.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Multi-Term NEU Catalog Scraper")
    # Declare CLI options data-style, then register them in one pass.
    option_specs = [
        ("--terms", {"nargs": "+", "required": True, "help": "Term IDs (e.g., 202510 202520 202530)"}),
        ("--subjects", {"nargs": "+", "required": True, "help": "Subjects (e.g., CS DS STAT)"}),
        ("--prefix", {"default": "neu_merged", "help": "Output prefix"}),
        ("--batch-size", {"type": int, "default": 100, "help": "Courses per page"}),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    # Scrape -> merge -> graph -> persist.
    scraper = MultiTermScraper(term_ids=args.terms)
    scraper.scrape_all_terms(args.subjects)
    scraper.build_graph()
    scraper.save_data(args.prefix)

    logger.info("\n✅ Multi-term scraping complete!")
234
+
235
+ if __name__ == "__main__":
236
  main()