"""
model.py – Define, train, and evaluate all three models:
    1. Naive baseline (majority class classifier)
    2. Classical ML  (Random Forest on HOG features)
    3. Deep learning (ScribblNet CNN)

Also runs the training size sensitivity experiment and saves results/plots.

Usage:
    python scripts/model.py
"""

import json
import sys
import time
from pathlib import Path
from typing import Any

import joblib
import matplotlib
matplotlib.use("Agg")  # headless backend
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import seaborn as sns

sys.path.insert(0, str(Path(__file__).parent.parent))

from config import (
    CLASSES,
    MODELS_DIR,
    OUTPUTS_DIR,
    PROCESSED_DIR,
    NUM_CLASSES,
    RF_MAX_DEPTH,
    RF_N_ESTIMATORS,
    DEEP_BATCH_SIZE,
    DEEP_EPOCHS,
    DEEP_LR,
    DEEP_WEIGHT_DECAY,
    IMG_SIZE,
    EXPERIMENT_FRACTIONS,
    EXPERIMENT_EPOCHS,
)


# Utility

def get_device() -> torch.device:
    """Return the best available torch device (MPS > CUDA > CPU)."""
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def load_processed_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load all processed arrays from disk.

    Returns:
        X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog
    """
    X_train_raw = np.load(PROCESSED_DIR / "X_train_raw.npy")
    X_test_raw = np.load(PROCESSED_DIR / "X_test_raw.npy")
    y_train = np.load(PROCESSED_DIR / "y_train.npy")
    y_test = np.load(PROCESSED_DIR / "y_test.npy")
    X_train_hog = np.load(PROCESSED_DIR / "X_train_hog.npy")
    X_test_hog = np.load(PROCESSED_DIR / "X_test_hog.npy")
    return X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog


# 1. Naive Baseline

class MajorityClassifier:
    """Naive baseline: always predicts the most frequent class in training."""

    def __init__(self) -> None:
        self.majority_class: int = 0

    def fit(self, y: np.ndarray) -> "MajorityClassifier":
        """Fit by finding the majority class label.

        Args:
            y: 1-D array of integer class labels.

        Returns:
            self
        """
        counts = np.bincount(y)
        self.majority_class = int(np.argmax(counts))
        return self

    def predict(self, n_samples: int) -> np.ndarray:
        """Return the majority class repeated n_samples times.

        Args:
            n_samples: Number of predictions to generate.

        Returns:
            Array of length n_samples, all equal to majority_class.
        """
        return np.full(n_samples, self.majority_class, dtype=np.int64)


def train_naive(y_train: np.ndarray, y_test: np.ndarray) -> dict[str, Any]:
    """Train and evaluate the majority class baseline.

    Args:
        y_train: Training labels.
        y_test:  Test labels.

    Returns:
        Dictionary of evaluation metrics.
    """
    print(f"\nNaive Baseline")
    clf = MajorityClassifier().fit(y_train)
    preds = clf.predict(len(y_test))
    acc = accuracy_score(y_test, preds)
    print(f"  Majority class: {CLASSES[clf.majority_class]}")
    print(f"  Test accuracy:  {acc:.4f}")

    model_data = {"majority_class": clf.majority_class, "accuracy": acc}
    joblib.dump(model_data, MODELS_DIR / "naive_model.pkl")

    return {"model": "naive", "accuracy": acc}


# 2. Classical ML

def train_classical(
    X_train_hog: np.ndarray,
    X_test_hog: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
) -> dict[str, Any]:
    """Train Random Forest on HOG features and evaluate.

    Args:
        X_train_hog: Training HOG feature matrix.
        X_test_hog:  Test HOG feature matrix.
        y_train:     Training labels.
        y_test:      Test labels.

    Returns:
        Dictionary of evaluation metrics.
    """
    print(f"\nClassical ML (Random Forest on HOG)")

    # Standardise features
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_train_hog)
    X_te = scaler.transform(X_test_hog)

    clf = RandomForestClassifier(
        n_estimators=RF_N_ESTIMATORS,
        max_depth=RF_MAX_DEPTH,
        n_jobs=-1,
        random_state=42,
    )
    t0 = time.time()
    clf.fit(X_tr, y_train)
    elapsed = time.time() - t0

    preds = clf.predict(X_te)
    acc = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, target_names=CLASSES)

    print(f"  Training time:  {elapsed:.1f}s")
    print(f"  Test accuracy:  {acc:.4f}")
    print(f"\n{report}")

    joblib.dump({"clf": clf, "scaler": scaler}, MODELS_DIR / "classical_model.pkl")
    _save_confusion_matrix(y_test, preds, "classical_confusion_matrix.png")

    return {"model": "classical", "accuracy": acc, "training_time_s": elapsed}


# 3. Deep Model

class ScribblNet(nn.Module):
    """Lightweight CNN for 28×28 grayscale sketch classification.

    Architecture:
        3 × (Conv2d → BatchNorm → ReLU → MaxPool)
        Dropout → FC(1152→256) → ReLU → Dropout → FC(256→num_classes)
    """

    def __init__(self, num_classes: int = NUM_CLASSES) -> None:
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )
        # 28→14→7→3  ∴ feature map is 128×3×3 = 1152
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(128 * 3 * 3, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x: Tensor of shape (B, 1, 28, 28), values in [0, 1].

        Returns:
            Logits tensor of shape (B, num_classes).
        """
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)


def make_dataloaders(
    X_raw: np.ndarray,
    y: np.ndarray,
    X_test_raw: np.ndarray,
    y_test: np.ndarray,
    batch_size: int = DEEP_BATCH_SIZE,
    train_fraction: float = 1.0,
) -> tuple[DataLoader, DataLoader]:
    """Build PyTorch DataLoaders from raw pixel arrays.

    Pixel values are normalised to [0, 1].  Training set can be subsampled
    via train_fraction for the sensitivity experiment.

    Args:
        X_raw:          Training pixel array (N, 784), uint8.
        y:              Training labels.
        X_test_raw:     Test pixel array.
        y_test:         Test labels.
        batch_size:     Minibatch size.
        train_fraction: Fraction of training samples to use (0 < f ≤ 1).

    Returns:
        (train_loader, test_loader)
    """
    if train_fraction < 1.0:
        n = max(1, int(len(X_raw) * train_fraction))
        idx = np.random.default_rng(seed=7).permutation(len(X_raw))[:n]
        X_raw = X_raw[idx]
        y = y[idx]

    def _to_tensor(X: np.ndarray, labels: np.ndarray) -> TensorDataset:
        imgs = torch.from_numpy(X.astype(np.float32) / 255.0)
        imgs = imgs.view(-1, 1, IMG_SIZE, IMG_SIZE)
        return TensorDataset(imgs, torch.from_numpy(labels))

    train_ds = _to_tensor(X_raw, y)
    test_ds = _to_tensor(X_test_raw, y_test)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=0)
    return train_loader, test_loader


def train_one_epoch(
    model: nn.Module,
    loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    device: torch.device,
) -> float:
    """Run one training epoch and return average loss.

    Args:
        model:     ScribblNet instance.
        loader:    Training DataLoader.
        optimizer: Optimiser (Adam).
        criterion: Loss function (CrossEntropyLoss).
        device:    Torch device.

    Returns:
        Mean loss over all minibatches.
    """
    model.train()
    total_loss = 0.0
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(imgs), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def evaluate(
    model: nn.Module,
    loader: DataLoader,
    device: torch.device,
) -> tuple[float, np.ndarray]:
    """Evaluate model on a DataLoader.

    Args:
        model:  ScribblNet instance.
        loader: Evaluation DataLoader.
        device: Torch device.

    Returns:
        (accuracy, predictions_array)
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for imgs, labels in loader:
            imgs = imgs.to(device)
            preds = model(imgs).argmax(dim=1).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(labels.numpy())
    preds = np.concatenate(all_preds)
    labels = np.concatenate(all_labels)
    return accuracy_score(labels, preds), preds


def train_deep(
    X_train_raw: np.ndarray,
    X_test_raw: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
    epochs: int = DEEP_EPOCHS,
    train_fraction: float = 1.0,
    save_model: bool = True,
) -> dict[str, Any]:
    """Train ScribblNet and evaluate on test set.

    Args:
        X_train_raw:    Raw training pixel array.
        X_test_raw:     Raw test pixel array.
        y_train:        Training labels.
        y_test:         Test labels.
        epochs:         Number of training epochs.
        train_fraction: Fraction of training data to use.
        save_model:     Whether to save weights to disk.

    Returns:
        Dictionary of evaluation metrics and training history.
    """
    print(f"\nDeep Model (ScribblNet, fraction={train_fraction:.0%})")
    device = get_device()
    print(f"  Device: {device}")

    train_loader, test_loader = make_dataloaders(
        X_train_raw, y_train, X_test_raw, y_test, train_fraction=train_fraction
    )

    model = ScribblNet(num_classes=NUM_CLASSES).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=DEEP_LR, weight_decay=DEEP_WEIGHT_DECAY
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()

    history = {"loss": [], "val_acc": []}
    best_acc = 0.0

    for epoch in range(1, epochs + 1):
        loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        acc, _ = evaluate(model, test_loader, device)
        scheduler.step()
        history["loss"].append(loss)
        history["val_acc"].append(acc)
        print(f"  epoch {epoch:02d}/{epochs}  loss={loss:.4f}  val_acc={acc:.4f}")

        if acc > best_acc:
            best_acc = acc
            if save_model:
                torch.save(model.state_dict(), MODELS_DIR / "deep_model.pth")

    # Final evaluation with best weights
    if save_model:
        model.load_state_dict(torch.load(MODELS_DIR / "deep_model.pth", map_location=device))

    final_acc, final_preds = evaluate(model, test_loader, device)
    print(f"\n  Best test accuracy: {best_acc:.4f}")

    if save_model:
        report = classification_report(y_test, final_preds, target_names=CLASSES)
        print(f"\n{report}")
        _save_confusion_matrix(y_test, final_preds, "deep_confusion_matrix.png")
        _save_training_curves(history)

    return {"model": "deep", "accuracy": best_acc, "history": history}


# Experiment: Training Size Sensitivity

def run_experiment(
    X_train_raw: np.ndarray,
    X_test_raw: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
    X_train_hog: np.ndarray,
    X_test_hog: np.ndarray,
) -> None:
    """Training set size sensitivity analysis.

    Sweeps over EXPERIMENT_FRACTIONS, training both the deep model and Random
    Forest at each fraction, then plots accuracy vs number of training samples.

    Motivation: Understanding how each model scales with data volume helps
    justify architectural choices and highlights when more data is beneficial.

    Args:
        X_train_raw:  Raw training pixels.
        X_test_raw:   Raw test pixels.
        y_train:      Training labels.
        y_test:       Test labels.
        X_train_hog:  HOG training features.
        X_test_hog:   HOG test features.
    """
    print(f"\nExperiment: Training Size Sensitivity")
    deep_accs, rf_accs, n_samples = [], [], []
    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test_hog)

    for frac in EXPERIMENT_FRACTIONS:
        n = int(len(X_train_raw) * frac)
        n_samples.append(n)
        print(f"\n  Fraction={frac:.0%}  (n={n})")

        # Deep model
        result = train_deep(
            X_train_raw, X_test_raw, y_train, y_test,
            epochs=EXPERIMENT_EPOCHS, train_fraction=frac, save_model=False,
        )
        deep_accs.append(result["accuracy"])

        # Random Forest
        idx = np.random.default_rng(seed=42).permutation(len(X_train_hog))[:n]
        X_tr = scaler.fit_transform(X_train_hog[idx])
        rf = RandomForestClassifier(
            n_estimators=100, n_jobs=-1, random_state=42
        )
        rf.fit(X_tr, y_train[idx])
        rf_pred = rf.predict(X_test_scaled)
        rf_accs.append(accuracy_score(y_test, rf_pred))
        print(f"  RF acc={rf_accs[-1]:.4f}")

    # Plot
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(n_samples, deep_accs, marker="o", linestyle="solid", label="ScribblNet (CNN)", linewidth=2, markersize=7)
    ax.plot(n_samples, rf_accs, marker="s", linestyle="dashed", label="Random Forest (HOG)", linewidth=2, markersize=7)
    ax.set_xlabel("Training samples", fontsize=12)
    ax.set_ylabel("Test accuracy", fontsize=12)
    ax.set_title("Training Set Size Sensitivity", fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1)
    plt.tight_layout()
    out_path = OUTPUTS_DIR / "experiment_sensitivity.png"
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"\n  Saved experiment plot → {out_path}")

    results = {
        "fractions": EXPERIMENT_FRACTIONS,
        "n_samples": n_samples,
        "deep_accs": deep_accs,
        "rf_accs": rf_accs,
    }
    with open(OUTPUTS_DIR / "experiment_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print("  Saved experiment_results.json")


# Plotting Helpers

def _save_confusion_matrix(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    filename: str,
) -> None:
    """Save a normalised confusion matrix heatmap.

    Args:
        y_true:   Ground truth labels.
        y_pred:   Predicted labels.
        filename: Output filename (saved under OUTPUTS_DIR).
    """
    cm = confusion_matrix(y_true, y_pred, normalize="true")
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt=".2f",
        xticklabels=CLASSES,
        yticklabels=CLASSES,
        cmap="Blues",
        ax=ax,
        linewidths=0.5,
    )
    ax.set_xlabel("Predicted", fontsize=11)
    ax.set_ylabel("True", fontsize=11)
    ax.set_title(filename.replace("_", " ").replace(".png", "").title(), fontsize=13)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    fig.savefig(OUTPUTS_DIR / filename, dpi=150)
    plt.close(fig)
    print(f"  Saved {filename}")


def _save_training_curves(history: dict[str, list[float]]) -> None:
    """Save loss and validation accuracy curves for the deep model.

    Args:
        history: Dict with keys 'loss' and 'val_acc', each a list of per epoch values.
    """
    epochs = range(1, len(history["loss"]) + 1)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4))

    ax1.plot(epochs, history["loss"], color="steelblue", marker="o", linestyle="solid", markersize=5)
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Training Loss")
    ax1.set_title("ScribblNet Training Loss")
    ax1.grid(True, alpha=0.3)

    ax2.plot(epochs, history["val_acc"], color="seagreen", marker="o", linestyle="solid", markersize=5)
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Validation Accuracy")
    ax2.set_title("ScribblNet Validation Accuracy")
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    fig.savefig(OUTPUTS_DIR / "deep_training_curves.png", dpi=150)
    plt.close(fig)
    print("  Saved deep_training_curves.png")


def _save_model_comparison(results: list[dict[str, Any]]) -> None:
    """Bar chart comparing test accuracy across all three models.

    Args:
        results: List of result dicts each containing 'model' and 'accuracy'.
    """
    names = [r["model"].capitalize() for r in results]
    accs = [r["accuracy"] for r in results]

    fig, ax = plt.subplots(figsize=(7, 4))
    bars = ax.bar(names, accs, color=["#94a3b8", "#60a5fa", "#34d399"], width=0.5)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Test Accuracy")
    ax.set_title("Model Comparison")
    for bar, acc in zip(bars, accs):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.01,
            f"{acc:.3f}",
            ha="center",
            fontsize=12,
        )
    ax.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()
    fig.savefig(OUTPUTS_DIR / "model_comparison.png", dpi=150)
    plt.close(fig)
    print("  Saved model_comparison.png")


# Orchestrator

def train_all() -> None:
    """Train all three models, run the experiment, and save all artefacts."""
    X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog = (
        load_processed_data()
    )

    r_naive = train_naive(y_train, y_test)
    r_classical = train_classical(X_train_hog, X_test_hog, y_train, y_test)
    r_deep = train_deep(X_train_raw, X_test_raw, y_train, y_test)

    _save_model_comparison([r_naive, r_classical, r_deep])

    run_experiment(
        X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog
    )

    summary = {
        "naive_accuracy": r_naive["accuracy"],
        "classical_accuracy": r_classical["accuracy"],
        "deep_accuracy": r_deep["accuracy"],
    }
    with open(OUTPUTS_DIR / "results_summary.json", "w") as f:
        json.dump(summary, f, indent=2)

    print("\nTraining complete. Summary:")
    for k, v in summary.items():
        print(f"  {k}: {v:.4f}")


if __name__ == "__main__":
    train_all()