"""
Evaluation script for PaDiM anomaly detection model
"""

import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
import sys
import json

sys.path.append(str(Path(__file__).parent))

import config
from src.data_loader import get_dataloader
from src.feature_extractor import FeatureExtractor, extract_embeddings
from src.padim import PaDiM
from src.visualize import plot_roc_curve, save_prediction
from PIL import Image


def _metrics_at_threshold(labels, scores, threshold):
    """Compute binary classification metrics for ``scores >= threshold``.

    Args:
        labels: 1-D array of ground-truth labels (0 = good, 1 = defect).
        scores: 1-D array of anomaly scores aligned with ``labels``.
        threshold: Scores greater than or equal to this value are
            predicted as defects.

    Returns:
        dict with float ``precision``, ``recall``, ``f1_score`` and
        ``accuracy`` plus a ``confusion_matrix`` dict of int
        ``tp``/``fp``/``fn``/``tn`` counts.
    """
    predicted_defect = scores >= threshold
    actual_defect = labels == 1

    tp = int(np.sum(predicted_defect & actual_defect))
    fp = int(np.sum(predicted_defect & ~actual_defect))
    fn = int(np.sum(~predicted_defect & actual_defect))
    tn = int(np.sum(~predicted_defect & ~actual_defect))

    # Guard every ratio against an empty denominator instead of dividing by 0.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    accuracy = (tp + tn) / len(labels)

    return {
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'accuracy': float(accuracy),
        'confusion_matrix': {
            'tp': tp, 'fp': fp,
            'fn': fn, 'tn': tn
        }
    }


def evaluate_padim(max_examples: int = 20) -> dict:
    """Evaluate the trained PaDiM model on the test data.

    Loads the model from ``config.MODEL_DIR / "padim_model.pkl"``, scores
    every image found in the ``good`` and ``config.DEFECT_TYPES``
    sub-directories of ``config.TEST_DIR``, then reports image-level ROC-AUC
    and the precision/recall/F1/accuracy obtained at the threshold that
    maximises Youden's J statistic (``tpr - fpr``).

    Side effects: writes up to ``max_examples`` prediction visualisations,
    ``roc_curve.png`` and ``evaluation_results.json`` into
    ``config.RESULTS_DIR`` (created if missing) and prints a report.

    Args:
        max_examples: Maximum number of example predictions to save, taken
            in processing order. Defaults to 20.

    Returns:
        dict with ``roc_auc``, ``optimal_threshold``, ``precision``,
        ``recall``, ``f1_score``, ``accuracy`` and ``confusion_matrix``.

    Raises:
        FileNotFoundError: If the trained model file does not exist.
        ValueError: If no test images were found, or if the test set does
            not contain both good and defective samples (ROC-AUC would be
            undefined).
    """

    print("=" * 60)
    print("AUTOMATED TABLET DEFECT DETECTION - EVALUATION")
    print("=" * 60)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load model
    print("\nLoading trained model...")
    model_path = config.MODEL_DIR / "padim_model.pkl"
    if not model_path.exists():
        raise FileNotFoundError(f"Model not found at {model_path}. Run train.py first.")

    # NOTE(review): the .pkl extension suggests pickle-based loading; if so,
    # only load model files that come from a trusted source.
    padim_model = PaDiM()
    padim_model.load(model_path)

    # Initialize feature extractor
    print("Initializing feature extractor...")
    extractor = FeatureExtractor(
        backbone=config.BACKBONE,
        layers=config.FEATURE_LAYERS
    ).to(device)
    # Inference only: make sure layers such as BatchNorm/Dropout run in eval
    # mode. Assumes FeatureExtractor is a torch.nn.Module (it exposes .to());
    # this is a no-op if the extractor already switched itself to eval mode.
    extractor.eval()

    # Every artefact below is written here, so make sure the directory exists.
    config.RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    # Evaluate on test set
    print("\nEvaluating on test set...")

    all_scores = []
    all_labels = []
    saved_examples = 0

    defect_types = ["good"] + config.DEFECT_TYPES

    for defect_type in defect_types:
        test_dir = config.TEST_DIR / defect_type

        if not test_dir.exists():
            print(f"Skipping {defect_type} (directory not found)")
            continue

        print(f"\nProcessing {defect_type}...")

        # Ground truth: 0 for good, 1 for defect
        is_defect = 1 if defect_type != "good" else 0

        # batch_size=1 so that paths[0] below is the image that was scored.
        test_loader = get_dataloader(test_dir, batch_size=1, shuffle=False)

        for images, paths, _ in tqdm(test_loader):
            images = images.to(device)

            # Extract embeddings
            with torch.no_grad():
                embeddings = extract_embeddings(extractor, images)

            # Predict anomaly
            embeddings_np = embeddings.cpu().numpy()
            anomaly_score, anomaly_map = padim_model.predict(embeddings_np)

            # Store a plain float so the score array is guaranteed to be 1-D;
            # an (N, 1) array would silently broadcast against the labels.
            all_scores.append(float(anomaly_score))
            all_labels.append(is_defect)

            # Save the first few example predictions
            if saved_examples < max_examples:
                img_path = paths[0]
                save_path = config.RESULTS_DIR / f"{defect_type}_{Path(img_path).name}"
                # Context manager closes the underlying file handle promptly.
                with Image.open(img_path) as img:
                    save_prediction(img, anomaly_score, anomaly_map, str(save_path))
                saved_examples += 1

    # Compute metrics
    all_scores = np.array(all_scores, dtype=float)
    all_labels = np.array(all_labels, dtype=int)

    # Fail early with an actionable message instead of an opaque error from
    # inside scikit-learn.
    if all_scores.size == 0:
        raise ValueError(
            f"No test images were found under {config.TEST_DIR}; nothing to evaluate."
        )
    if np.unique(all_labels).size < 2:
        raise ValueError(
            "ROC-AUC is undefined: the test set must contain both good and "
            "defective samples."
        )

    # ROC-AUC
    roc_auc = roc_auc_score(all_labels, all_scores)
    print(f"\n{'=' * 60}")
    print(f"IMAGE-LEVEL ROC-AUC: {roc_auc:.4f}")
    print(f"{'=' * 60}")

    # Find optimal threshold using Youden's J statistic
    fpr, tpr, thresholds = roc_curve(all_labels, all_scores)
    optimal_idx = int(np.argmax(tpr - fpr))
    optimal_threshold = thresholds[optimal_idx]

    print(f"\nOptimal threshold: {optimal_threshold:.4f}")

    # Compute precision and recall at optimal threshold
    metrics = _metrics_at_threshold(all_labels, all_scores, optimal_threshold)
    confusion = metrics['confusion_matrix']

    print(f"\nMetrics at optimal threshold:")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1-Score: {metrics['f1_score']:.4f}")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")

    print(f"\nConfusion Matrix:")
    print(f"  TP: {confusion['tp']}, FP: {confusion['fp']}")
    print(f"  FN: {confusion['fn']}, TN: {confusion['tn']}")

    # Plot ROC curve
    roc_path = config.RESULTS_DIR / "roc_curve.png"
    plot_roc_curve(fpr, tpr, roc_auc, str(roc_path))

    # Save results (plain Python types only, so they are JSON-serialisable)
    results = {
        'roc_auc': float(roc_auc),
        'optimal_threshold': float(optimal_threshold),
        **metrics,
    }

    results_path = config.RESULTS_DIR / "evaluation_results.json"
    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\nResults saved to {results_path}")
    print(f"Example predictions saved to {config.RESULTS_DIR}")

    return results


# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    evaluate_padim()