"""Bloom Classifier training pipeline.

Trains a TF-IDF + LogisticRegression model for Bloom taxonomy classification.
Target: bloom_level (6 classes: Remember, Understand, Apply, Analyze, Evaluate, Create).
Features: question_text (TF-IDF).
Primary metric: macro F1.
"""

import logging
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.preprocessing import LabelEncoder

from app.core.config import settings
from app.core.exceptions import TrainingError
from training.base_trainer import BaseTrainer, TrainingResult

logger = logging.getLogger(__name__)


class BloomClassifierTrainer(BaseTrainer):
    """TF-IDF + LogisticRegression for Bloom taxonomy classification.

    Target: bloom_level (6 classes: Remember, Understand, Apply, Analyze, Evaluate, Create)
    Features: question_text (TF-IDF)
    Primary metric: macro F1
    """

    @property
    def model_name(self) -> str:
        return "bloom_classifier"

    @property
    def model_version(self) -> str:
        return "bloom_classifier_v2_baseline_001"

    @property
    def table_name(self) -> str:
        return "training_bloom_classification"

    def train(self, train_df: pd.DataFrame, val_df: pd.DataFrame) -> dict:
        """Fit the vectorizer, label encoder and classifier on the train split.

        Steps:
        1. Fit a word 1-2 gram TF-IDF vectorizer on ``train_df["question_text"]``.
        2. Encode ``train_df["bloom_level"]`` with a ``LabelEncoder``.
        3. Fit a multinomial ``LogisticRegression`` seeded with ``self._seed``.

        ``val_df`` is accepted as part of the signature but is not read here.

        Returns:
            ``{"model": ..., "vectorizer": ..., "label_encoder": ...}`` holding
            the three fitted objects.
        """
        vectorizer_params = {
            "max_features": 8000,
            "ngram_range": (1, 2),
            "sublinear_tf": True,
        }
        vectorizer = TfidfVectorizer(**vectorizer_params)
        features = vectorizer.fit_transform(train_df["question_text"])

        encoder = LabelEncoder()
        targets = encoder.fit_transform(train_df["bloom_level"])

        # NOTE(review): newer scikit-learn releases deprecate `multi_class`;
        # confirm against the pinned version before upgrading.
        classifier_params = {
            "multi_class": "multinomial",
            "C": 1.0,
            "solver": "lbfgs",
            "max_iter": 1000,
            "random_state": self._seed,
        }
        classifier = LogisticRegression(**classifier_params)
        classifier.fit(features, targets)

        feature_count = features.shape[1]
        class_count = len(encoder.classes_)
        logger.info(
            "Bloom Classifier trained — %d features, %d classes",
            feature_count,
            class_count,
        )

        artifacts = {
            "model": classifier,
            "vectorizer": vectorizer,
            "label_encoder": encoder,
        }
        return artifacts

    def evaluate(self, artifacts: dict, df: pd.DataFrame, split_name: str) -> dict:
        """Evaluate the trained model on one data split.

        Args:
            artifacts: Mapping with the "model", "vectorizer" and
                "label_encoder" entries returned by ``train``.
            df: Split to score; its "question_text" and "bloom_level"
                columns are read.
            split_name: Label for the split; used only in the log line.

        Returns:
            Dict with:
            - "macro_f1" / "weighted_f1": floats rounded to 4 decimals.
            - "per_class": precision / recall / f1 / support per class name,
              one entry for every class known to the label encoder.
            - "confusion_matrix": nested list with one row and one column per
              encoder class, in encoder order, even when a class is absent
              from this split.
        """
        model = artifacts["model"]
        tfidf = artifacts["vectorizer"]
        le = artifacts["label_encoder"]

        X = tfidf.transform(df["question_text"])
        y_true = le.transform(df["bloom_level"])
        y_pred = model.predict(X)

        # LabelEncoder maps classes_ onto 0..n-1, so this id range names every
        # known class. Passing it explicitly keeps the report and the matrix
        # aligned with le.classes_ when a rare class is missing from both
        # y_true and y_pred; without it classification_report rejects the
        # target_names length mismatch and confusion_matrix silently shrinks.
        class_names = [str(name) for name in le.classes_]
        label_ids = list(range(len(class_names)))

        # F1 scores are averaged over the labels actually present in the
        # split, so a class that cannot be observed here does not count as 0.
        macro_f1 = float(
            f1_score(y_true, y_pred, average="macro", zero_division=0)
        )
        weighted_f1 = float(
            f1_score(y_true, y_pred, average="weighted", zero_division=0)
        )

        # Per-class metrics
        report = classification_report(
            y_true, y_pred,
            labels=label_ids,
            target_names=class_names,
            output_dict=True,
            zero_division=0,
        )
        per_class = {
            class_name: {
                "precision": round(report[class_name]["precision"], 4),
                "recall": round(report[class_name]["recall"], 4),
                "f1": round(report[class_name]["f1-score"], 4),
                "support": int(report[class_name]["support"]),
            }
            for class_name in class_names
            if class_name in report
        }

        # Confusion matrix with a fixed (n_classes x n_classes) layout.
        cm = confusion_matrix(y_true, y_pred, labels=label_ids).tolist()

        metrics = {
            "macro_f1": round(macro_f1, 4),
            "weighted_f1": round(weighted_f1, 4),
            "per_class": per_class,
            "confusion_matrix": cm,
        }

        logger.info(
            "%s metrics — macro_f1: %.4f, weighted_f1: %.4f",
            split_name, macro_f1, weighted_f1,
        )

        return metrics

    def _check_baseline(self, metrics: dict) -> None:
        """Verify that macro F1 beats the uniform-random baseline (1 / num_classes).

        Reads ``metrics["metrics"]["test"]`` and falls back to
        ``metrics["metrics"]["validation"]`` when the test entry is missing,
        ``None`` or has no per-class results. The class count is taken from
        the size of the chosen split's "per_class" mapping.

        Args:
            metrics: Full metrics document in the shape built by
                ``_build_metrics``.

        Raises:
            TrainingError: If neither split reports any classes, or if macro
                F1 is not strictly greater than the baseline (a NaN score
                also fails).
        """
        # `or {}` tolerates explicit None entries (e.g. a split that was not
        # evaluated) so the validation fallback below can actually run.
        split_metrics = metrics.get("metrics") or {}
        test_metrics = split_metrics.get("test") or {}
        macro_f1 = test_metrics.get("macro_f1", 0.0)
        num_classes = len(test_metrics.get("per_class") or {})

        # Fallback: use validation metrics if test not available
        if num_classes == 0:
            val_metrics = split_metrics.get("validation") or {}
            macro_f1 = val_metrics.get("macro_f1", 0.0)
            num_classes = len(val_metrics.get("per_class") or {})

        if num_classes == 0:
            raise TrainingError(
                "Cannot compute baseline: no classes found in metrics.",
                model_name=self.model_name,
            )

        baseline = 1.0 / num_classes
        # Written as `not (a > b)` rather than `a <= b`: every comparison
        # with NaN is False, so `<=` would let a NaN score pass the check.
        if not (macro_f1 > baseline):
            raise TrainingError(
                f"Macro F1 ({macro_f1:.4f}) does not exceed "
                f"random baseline ({baseline:.4f} = 1/{num_classes}). "
                f"Model is not better than random.",
                model_name=self.model_name,
            )

        logger.info(
            "Baseline check passed — macro F1 %.4f > baseline %.4f",
            macro_f1, baseline,
        )

    def _build_metrics(
        self,
        val_metrics: dict,
        test_metrics: dict,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ) -> dict:
        """Assemble the content of metrics.json.

        Combines model identity, the current UTC timestamp, the seed, the row
        count of each split, the supplied validation/test metrics and a fixed
        list of known limitations.
        """
        # NOTE(review): "dataset_version" is filled from the service version
        # setting — confirm that is the intended source.
        document: dict = {
            "model_name": self.model_name,
            "model_version": self.model_version,
            "dataset_version": settings.ai_service_version,
            "trained_at": datetime.now(timezone.utc).isoformat(),
            "seed": self._seed,
        }

        splits = (
            ("train", train_df),
            ("validation", val_df),
            ("test", test_df),
        )
        document["split_counts"] = {name: len(frame) for name, frame in splits}

        document["metrics"] = {
            "validation": val_metrics,
            "test": test_metrics,
        }

        document["limitations"] = [
            "Trained on synthetic data only.",
            "6 classes with imbalanced distribution — Create (~2%) and Evaluate (~4%) are rare.",
            "Macro F1 is the primary metric; per-class recall may be low for rare classes.",
            "TF-IDF features do not capture semantic similarity beyond n-gram overlap.",
        ]
        return document

    def _build_training_config(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ) -> dict:
        """Assemble the content of training_config.json.

        Records model identity, seed, per-split row counts, the
        hyperparameter values, and the feature/target column names.
        """
        # These values mirror the literals used in ``train``; keep them in sync.
        hyperparameters = {
            "tfidf_max_features": 8000,
            "ngram_range": [1, 2],
            "sublinear_tf": True,
            "logreg_C": 1.0,
            "logreg_solver": "lbfgs",
            "logreg_max_iter": 1000,
            "logreg_multi_class": "multinomial",
        }

        config: dict = {
            "model_name": self.model_name,
            "model_version": self.model_version,
            "dataset_version": settings.ai_service_version,
            "seed": self._seed,
        }

        splits = (
            ("train", train_df),
            ("validation", val_df),
            ("test", test_df),
        )
        config["split_counts"] = {name: len(frame) for name, frame in splits}
        config["hyperparameters"] = hyperparameters
        config["feature_columns"] = ["question_text"]
        config["target_column"] = "bloom_level"
        config["algorithm"] = "LogisticRegression(multinomial)"
        return config

    def _build_model_card(self, metrics: dict) -> str:
        """Generate model_card.md content."""
        val_metrics = metrics.get("metrics", {}).get("validation", {})
        test_metrics = metrics.get("metrics", {}).get("test", {})

        card = f"""# Model Card: Bloom Classifier

## Model Details

- **Model Name:** {self.model_name}
- **Model Version:** {self.model_version}
- **Algorithm:** TF-IDF + LogisticRegression (multinomial)
- **Framework:** scikit-learn
- **Trained At:** {metrics.get("trained_at", "N/A")}
- **Seed:** {self._seed}

## Intended Use

Automatically classify questions by Bloom's taxonomy cognitive level.
Used in the Bloom classification endpoint to predict one of 6 levels:
Remember, Understand, Apply, Analyze, Evaluate, Create.

## Training Data

- **Source:** training_bloom_classification.csv (synthetic dataset v2)
- **Split Counts:** train={metrics.get("split_counts", {}).get("train", "N/A")}, \
validation={metrics.get("split_counts", {}).get("validation", "N/A")}, \
test={metrics.get("split_counts", {}).get("test", "N/A")}
- **Feature:** question_text (TF-IDF vectorized, max_features=8000, ngram_range=(1,2))
- **Target:** bloom_level (6 classes)

## Metrics

### Validation Set
- Macro F1: {val_metrics.get("macro_f1", "N/A")}
- Weighted F1: {val_metrics.get("weighted_f1", "N/A")}

### Test Set
- Macro F1: {test_metrics.get("macro_f1", "N/A")}
- Weighted F1: {test_metrics.get("weighted_f1", "N/A")}

## Known Limitations

- Trained on synthetic data only — performance on real classroom questions is unknown.
- Class imbalance: Create (~2%) and Evaluate (~4%) are rare; recall on these classes may be low.
- TF-IDF features do not capture semantic similarity beyond n-gram overlap.
- Macro F1 is the primary metric; accuracy alone would mask poor performance on rare classes.

## Fallback Behavior

When the model is not loaded or confidence is below the threshold (0.55),
the system falls back to keyword heuristic classification:
define/list → Remember; explain → Understand; calculate/use → Apply;
compare/contrast → Analyze; justify → Evaluate; design → Create.
"""
        return card