"""
SVM classifier — PRIMARY model per the report (§3.1.3 + §4.1).

Report quote:
  "the Support Vector Machine (SVM) was adopted as the core classifier"
  "classification using SVM to predict the associated MITRE ATT&CK techniques"

Inference logic (verbatim from MurshidUIPipeline.ipynb cell 16+19):
  scores  = svm_model.named_steps["clf"].decision_function(
                svm_model.named_steps["pca"].transform(X_user)
            ).reshape(-1)
  pred    = (scores >= thr_per_label).astype(int)
  margins = scores - thr_per_label
  conf    = sigmoid(margins) * 100

Original notebook file is NOT modified.
"""

from __future__ import annotations

import json
from pathlib import Path

import joblib
import numpy as np

from app.config import settings


def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Squash margins into (0, 1) with the logistic function (notebook cell 17).

    The input is clamped to [-30, 30] before exponentiation, so the
    argument handed to ``np.exp`` is always small enough not to overflow.
    """
    bounded = np.clip(x, -30, 30)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator


class SVMModel:
    """
    Wrap the trained PCA + LinearSVC pipeline with its per-label thresholds.

    Structure of the .joblib pack (from notebook):
        svm_pack["model"]                  → sklearn Pipeline (PCA + LinearSVC)
        svm_pack["thresholds_per_label"]   → np.ndarray shape (n_techniques,)

    Attributes:
        technique_names: label names read from the label-columns JSON file.
            Entry ``i`` is reported together with score ``i`` and
            threshold ``i`` in :meth:`predict`.
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        """
        Load the SVM pack and the label list from disk.

        Args:
            models_dir: directory containing the model files. When omitted
                (or falsy), ``settings.murshid_models_dir`` is used.

        Raises:
            FileNotFoundError: if the SVM pack or the label file is missing.
            ValueError: if the label file is not a JSON list, or if the
                number of thresholds differs from the number of labels.
        """
        base = Path(models_dir or settings.murshid_models_dir).resolve()

        svm_path = base / settings.svm_joblib
        labels_path = base / settings.label_columns_json

        for p in (svm_path, labels_path):
            if not p.is_file():
                raise FileNotFoundError(f"Missing model file: {p}")

        # NOTE(security): joblib.load unpickles arbitrary Python objects, so
        # the models directory must only ever contain trusted artefacts.
        svm_pack = joblib.load(svm_path)
        self._model = svm_pack["model"]  # Pipeline(PCA → LinearSVC)
        # Flatten so thresholds stored as (n, 1) or (1, n) cannot silently
        # broadcast against the (n,) score vector in predict().
        self._thresholds = np.asarray(
            svm_pack["thresholds_per_label"], dtype=np.float64
        ).reshape(-1)

        with open(labels_path, encoding="utf-8") as f:
            names = json.load(f)
        if not isinstance(names, list):
            raise ValueError(f"Label file must contain a JSON list: {labels_path}")
        self.technique_names: list[str] = names

        n = len(self.technique_names)
        if self._thresholds.shape[0] != n:
            raise ValueError(
                f"SVM thresholds length {self._thresholds.shape[0]} != {n} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """
        Run SVM inference as in the notebook (PCA → decision_function →
        per-label threshold → sigmoid(margin) confidence).

        Args:
            embedding_1d: a single embedding vector. Any array-like is
                accepted; it is reshaped to one row before being
                transformed.

        Returns:
            One dict per label, ordered by descending confidence, with keys
            technique_id, predicted, confidence_percent, score, threshold,
            margin. Ordering uses the unrounded margin, so labels whose
            rounded confidence_percent is equal are still ranked by their
            true confidence; exact margin ties keep label-file order.

        Raises:
            ValueError: if the classifier does not produce exactly one
                score per threshold.
        """
        # asarray lets callers pass plain sequences as well as ndarrays.
        X = np.asarray(embedding_1d).reshape(1, -1)

        # Apply PCA then LinearSVC decision function (notebook cell 19)
        steps = self._model.named_steps
        scores = np.asarray(
            steps["clf"].decision_function(steps["pca"].transform(X))
        ).reshape(-1)

        # Fail loudly instead of letting NumPy broadcast a mismatched vector.
        if scores.shape != self._thresholds.shape:
            raise ValueError(
                f"SVM produced {scores.shape[0]} scores for "
                f"{self._thresholds.shape[0]} thresholds"
            )

        above = scores >= self._thresholds
        margins = scores - self._thresholds
        conf = _sigmoid(margins) * 100  # calibrated confidence (%)

        # Sigmoid is monotonic, so ranking by margin ranks by confidence.
        # Sorting on the rounded percentage would collapse saturated values
        # (e.g. everything that rounds to 100.0) into arbitrary ties.
        order = np.argsort(-margins, kind="stable").tolist()

        return [
            {
                "technique_id": self.technique_names[i],
                "predicted": bool(above[i]),
                "confidence_percent": round(float(conf[i]), 2),
                "score": round(float(scores[i]), 4),
                "threshold": round(float(self._thresholds[i]), 4),
                "margin": round(float(margins[i]), 4),
            }
            for i in order
        ]