#!/usr/bin/env python3
"""
FIXED Curriculum Analyzer - Production Version

Synchronized with optimizer logic:
1.  Filters subjects to ONLY: CS, DS, CY, MATH, PHYS, ENGW.
2.  Removes IS, EECE, STAT, and other irrelevant subjects.
3.  ADDS exceptions for undergrad-accessible 5000-level courses (e.g., CS5700).
4.  REMOVES known-bad prerequisite edges (e.g., CS2500 -> CS2800) and ADDS
    missing critical ones (e.g., CS3000 -> CS3650).
"""
import pickle
import argparse
import networkx as nx
import re
from typing import Set, Dict
from collections import defaultdict

def get_course_level(cid):
    """Return the first run of digits in a course ID as an int.

    IDs without any digit yield 9999, which callers treat as "very high
    level".
    """
    digits = re.search(r'\d+', cid)
    if digits is None:
        return 9999
    return int(digits.group())

class CurriculumAnalyzer:
    
    # --- FIX 1: DEFINE LISTS THAT MATCH THE OPTIMIZER ---
    
    # Subjects the optimizer is programmed to understand.
    # ENGW/PHYS are needed only for hardcoded Year 1.
    KEEP_SUBJECTS = {"CS", "DS", "CY", "MATH", "PHYS", "ENGW"}
    
    # 5000-level courses the optimizer explicitly allows.
    UNDERGRAD_ACCESSIBLE_GRAD = {"CS5700", "CY5700", "DS5110", "CS5010"}

    def __init__(self, graph_path, courses_path):
        """Load the raw graph and course table, then merge course data into nodes.

        Args:
            graph_path: Path to a pickled graph object. This class uses its
                ``has_node``, ``nodes``, ``number_of_nodes`` and
                ``number_of_edges`` members.
            courses_path: Path to a pickled mapping of course ID -> dict of
                course attributes.

        Raises:
            SystemExit: With exit code 1 when either file cannot be opened or
                unpickled.
        """
        print("📚 Loading raw curriculum data...")
        try:
            # NOTE(security): pickle.load can execute arbitrary code embedded
            # in the file; only point this at pickles you produced yourself.
            with open(graph_path, 'rb') as f:
                self.graph = pickle.load(f)
            with open(courses_path, 'rb') as f:
                self.courses = pickle.load(f)
        except Exception as e:
            print(f"❌ ERROR: Could not load files. {e}")
            # Raise SystemExit directly instead of calling the `exit` helper:
            # that helper is injected by the `site` module for interactive
            # use, is absent under `python -S` / frozen interpreters, and
            # closes stdin as a side effect.
            raise SystemExit(1) from e

        # Merge course data into graph nodes; courses without a matching node
        # are ignored.
        for course_id, course_data in self.courses.items():
            if self.graph.has_node(course_id):
                self.graph.nodes[course_id].update(course_data)

        print(f"✅ Loaded {self.graph.number_of_nodes()} courses, {self.graph.number_of_edges()} edges")

    def pre_filter_graph(self):
        """Drop every node that is out of scope for planning.

        A node is removed when any of the following holds:
          * its ``subject`` attribute is not in ``KEEP_SUBJECTS``;
          * its ``name`` contains a word starting with lab / recitation /
            seminar / practicum / co-op;
          * its course number is >= 5000 and it is not listed in
            ``UNDERGRAD_ACCESSIBLE_GRAD``.

        Mutates ``self.graph`` in place and prints a before/after summary.
        """
        print("\n🧹 Pre-filtering graph...")

        # Anchor keywords at a word start. A plain substring test for 'lab'
        # also hits names such as "Scalable ..." or "Collaborative ...",
        # which would wrongly drop regular courses. Compiled once, outside
        # the loop.
        skip_pattern = re.compile(r'\b(?:lab|recitation|seminar|practicum|co-op)')

        nodes_to_remove = set()
        for node, data in self.graph.nodes(data=True):
            subject = data.get('subject', '')
            # `or ''` guards against a name that is stored explicitly as None.
            name = str(data.get('name') or '').lower()
            level = get_course_level(node)

            is_irrelevant_subject = subject not in self.KEEP_SUBJECTS
            is_lab_or_seminar = skip_pattern.search(name) is not None

            # Grad-level courses are dropped unless explicitly allow-listed.
            is_grad_level = level >= 5000
            is_allowed_grad = node in self.UNDERGRAD_ACCESSIBLE_GRAD

            if (is_irrelevant_subject or
                    is_lab_or_seminar or
                    (is_grad_level and not is_allowed_grad)):
                nodes_to_remove.add(node)

        # Count before removal so the summary can show both figures.
        original_count = self.graph.number_of_nodes()
        self.graph.remove_nodes_from(nodes_to_remove)

        print(f"✅ Removed {len(nodes_to_remove)} irrelevant courses (IS, EECE, etc.)")
        print(f"   Original nodes: {original_count}")
        print(f"   Remaining nodes: {self.graph.number_of_nodes()}")

    def fix_chains(self):
        """Add known prerequisite edges that are missing from the graph.

        For each (prereq, course, description) entry below, the edge
        prereq -> course is added when both nodes exist and the edge is
        absent. Entries with a missing endpoint cannot be repaired; they are
        reported as skipped instead of being silently counted as "present".

        Mutates ``self.graph`` in place; returns None.
        """
        print("\n🔗 Validating and fixing critical prerequisite chains...")

        # A tuple (not a set) so chains are processed and logged in the same
        # order on every run; set order of strings varies per process.
        critical_chains = (
            # Foundations
            ("CS1800", "CS2800", "Discrete → Logic"),
            ("CS2500", "CS2510", "Fundies 1 → Fundies 2"),
            # Core CS
            ("CS2510", "CS3500", "Fundies 2 → OOD"),
            ("CS2510", "CS3000", "Fundies 2 → Algorithms"),
            ("CS2800", "CS3000", "Logic → Algorithms"),
            # Systems
            ("CS3000", "CS3650", "Algorithms -> Systems"),
            # Core AI/ML
            ("CS3000", "CS4100", "Algorithms → AI"),
            ("CS3500", "CS4100", "OOD → AI"),
            # Core DS Path
            ("DS2000", "DS2500", "Prog w/ Data → Intermediate"),
            ("DS2500", "DS3500", "Intermediate → Advanced"),
            ("DS3500", "DS4400", "Advanced → ML1"),
            ("CS3500", "DS4400", "OOD → ML1"),
            # Math
            ("MATH1341", "MATH1342", "Calc 1 → Calc 2"),
        )

        added = 0
        skipped = []
        for prereq, course, desc in critical_chains:
            absent = [cid for cid in (prereq, course)
                      if not self.graph.has_node(cid)]
            if absent:
                # Never create nodes here; just remember what could not be checked.
                skipped.append((prereq, course, desc, absent))
                continue
            if not self.graph.has_edge(prereq, course):
                self.graph.add_edge(prereq, course)
                print(f"  🔧 FIXED: Added {prereq} → {course} ({desc})")
                added += 1

        if skipped:
            print(f"  ⚠️  Skipped {len(skipped)} chain(s) whose courses are not in the graph:")
            for prereq, course, desc, absent in skipped:
                print(f"     {prereq} → {course} ({desc}) - missing {', '.join(absent)}")
        elif added == 0:
            print("  ✅ All critical chains present")

    def remove_spurious_chains(self):
        """Remove prerequisite edges that are known to be wrong.

        Each (prereq, course) pair below is deleted from ``self.graph`` when
        the edge exists; pairs that are not present are ignored. Prints one
        line per removed edge, or a single "none found" line.
        """
        print("\n🗑️ Removing spurious prerequisite chains...")

        # A tuple (not a set) so edges are processed and logged in the same
        # order on every run; set order of strings varies per process.
        spurious_chains = (
            ("CS2500", "CS2800"),    # Fundies 1 is NOT a prereq for Logic
            ("MATH1365", "CS2800"),  # Not a real prereq
            ("EECE2160", "CS3000"),  # Irrelevant prereq
            ("EECE2560", "CS3500"),  # Irrelevant prereq
        )

        removed = 0
        for prereq, course in spurious_chains:
            if not self.graph.has_edge(prereq, course):
                continue
            self.graph.remove_edge(prereq, course)
            print(f"  ✅ REMOVED: {prereq} → {course}")
            removed += 1

        if removed == 0:
            print("  ✅ No spurious chains found")

    def calculate_and_add_complexity(self):
        """Attach a ``complexity`` attribute to every node of the graph.

        The score is 10 points per incoming edge plus 5 points per outgoing
        edge, measured on the graph as it currently stands.
        """
        print("\n🧮 Calculating complexity scores...")

        graph = self.graph
        # Work out every score first, then write them all in one call;
        # setting node attributes does not affect the degrees being read.
        scores = {
            course: {
                'complexity': (graph.in_degree(course) * 10)
                + (graph.out_degree(course) * 5)
            }
            for course in graph.nodes()
        }
        nx.set_node_attributes(graph, scores)

        print("✅ Complexity scores calculated")

    def validate_critical_courses(self) -> Dict[str, Set[str]]:
        """Report which required courses are absent from the graph.

        Returns:
            A dict mapping category name to the set of required course IDs
            not found among the graph's nodes. Categories with nothing
            missing are left out, so an empty dict means full coverage.
        """
        print("\n🎯 Validating critical course coverage...")

        # This list MUST match the optimizer's requirements
        required_courses = {
            "foundations": {"CS1800", "CS2500", "CS2510", "CS2800"},
            "core": {"CS3000", "CS3500", "CS3650", "CS3200", "CS5700"},
            "ai_ml": {"CS4100", "DS4400", "CS4120", "DS4420", "CS4180", "DS4440"},
            "systems": {"CS4730", "CS4700", "CS4400", "CS4500"},
            "security": {"CY2550", "CY3740", "CY4740", "CY4760"},
            "math": {"MATH1341", "MATH1342", "MATH2331", "MATH3081"},
        }

        # Snapshot the node IDs once; the graph is not modified below.
        present = set(self.graph.nodes())

        missing = {}
        for category, wanted in required_courses.items():
            absent = wanted - present
            if not absent:
                print(f"  ✅ {category}: All courses present")
                continue
            missing[category] = absent
            print(f"  ⚠️  {category}: Missing {absent}")

        return missing

    def save_enriched_graph(self, output_path):
        """Pickle the graph to ``output_path`` and write a text summary beside it.

        The report path is ``output_path`` with a trailing ``.pkl`` replaced
        by ``_report.txt``. When ``output_path`` does not end in ``.pkl``,
        ``_report.txt`` is appended instead, so the report can never
        overwrite the pickle that was just written.

        Args:
            output_path: Destination path (str) for the pickled graph.
        """
        print(f"\n💾 Saving cleaned graph to {output_path}...")
        with open(output_path, 'wb') as f:
            pickle.dump(self.graph, f)
        print("✅ Graph saved")

        # Only strip a trailing '.pkl'. A blanket str.replace would leave the
        # path unchanged when the extension is absent (clobbering the pickle)
        # and would also rewrite '.pkl' inside directory names.
        suffix = '.pkl'
        if output_path.endswith(suffix):
            report_base = output_path[:-len(suffix)]
        else:
            report_base = output_path
        report_path = report_base + '_report.txt'

        # Tally nodes per subject; nodes without the attribute count as UNKNOWN.
        subject_counts = defaultdict(int)
        for node in self.graph.nodes():
            subject = self.graph.nodes[node].get('subject', 'UNKNOWN')
            subject_counts[subject] += 1

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("Curriculum Graph Analysis Report\n")
            f.write("=" * 70 + "\n\n")
            f.write(f"Total courses: {self.graph.number_of_nodes()}\n")
            f.write(f"Total prerequisites: {self.graph.number_of_edges()}\n\n")

            f.write("Subject breakdown (Filtered):\n")
            for subject in sorted(subject_counts):
                f.write(f"  {subject}: {subject_counts[subject]}\n")

        print(f"✅ Report saved to {report_path}")

def main(args):
    """Run the full clean-up pipeline and save the result.

    Reads ``args.graph``, ``args.courses`` and ``args.output_graph``.
    """
    analyzer = CurriculumAnalyzer(args.graph, args.courses)

    # Order matters: filter first, then repair edges, then score the result.
    pipeline = (
        analyzer.pre_filter_graph,
        analyzer.fix_chains,
        analyzer.remove_spurious_chains,
        analyzer.calculate_and_add_complexity,
    )
    for step in pipeline:
        step()

    # A non-empty result only triggers a warning; the graph is saved anyway.
    if analyzer.validate_critical_courses():
        for line in (
            "\n⚠️  WARNING: Some critical courses are missing!",
            "   Consider re-scraping with additional terms or subjects.",
            "   Missing courses will be excluded from planning.",
        ):
            print(line)

    analyzer.save_enriched_graph(args.output_graph)
    print("\n✨ Analysis complete!")

if __name__ == "__main__":
    # Command-line entry point: two required input pickles, one optional output path.
    cli = argparse.ArgumentParser(
        description="NEU Curriculum Analyzer - Cleans and validates data",
    )
    for flag, options in (
        ('--graph', {
            'required': True,
            'help': "Path to RAW curriculum graph (e.g., neu_merged_graph_...pkl)",
        }),
        ('--courses', {
            'required': True,
            'help': "Path to RAW courses data (e.g., neu_merged_courses_...pkl)",
        }),
        ('--output-graph', {
            'default': 'neu_graph_clean.pkl',
            'help': "Output path for the new, clean graph",
        }),
    ):
        cli.add_argument(flag, **options)
    main(cli.parse_args())