"""
MindWatch — Distress Classification Model

Uses sentence-transformers embeddings + sklearn classifiers.
Designed to run on HuggingFace Spaces free tier (CPU only).
"""

import os
from typing import Tuple, Dict, List, Optional

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
)
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from utils.preprocessing import preprocess_text, extract_linguistic_features


class DistressClassifier:
    """
    Multi-signal distress classifier combining:
        1. Sentence-transformer embeddings (semantic signal)
        2. Linguistic features (psychological signal)
        3. Ensemble of LogisticRegression + HistGradientBoosting
    """

    # Filenames used by save()/load(); kept as constants so the two
    # methods cannot drift apart.
    _CLASSIFIER_FILE = "classifier.pkl"
    _SCALER_FILE = "scaler.pkl"
    _LABEL_MAP_FILE = "label_map.pkl"
    _INV_LABEL_MAP_FILE = "inv_label_map.pkl"
    _EMBEDDING_NAME_FILE = "embedding_model.pkl"

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_dir: str = "saved_models",
    ):
        """
        Args:
            embedding_model: HuggingFace model id for the sentence encoder.
            model_dir: Default directory for save()/load().
        """
        self.embedding_model_name = embedding_model
        self.model_dir = model_dir
        self.encoder: Optional[SentenceTransformer] = None  # lazy-loaded
        self.classifier = None
        self.scaler = StandardScaler()
        self.label_map: Dict[str, int] = {}
        self.inv_label_map: Dict[int, str] = {}
        self.is_trained = False

    def _load_encoder(self):
        """Lazy-load the sentence transformer (it is large and slow to init)."""
        if self.encoder is None:
            print(f"Loading embedding model: {self.embedding_model_name}")
            self.encoder = SentenceTransformer(self.embedding_model_name)

    def _get_features(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Extract combined features: embeddings + linguistic features.

        Args:
            texts: Input texts (already preprocessed).
            show_progress: Show progress bars for encoding / feature extraction.

        Returns:
            2-D array of shape (len(texts), emb_dim + n_linguistic_features).
        """
        self._load_encoder()

        # Sentence embeddings (semantic signal)
        embeddings = self.encoder.encode(
            texts, show_progress_bar=show_progress, batch_size=64
        )

        # Linguistic features (psychological signal). Relies on
        # extract_linguistic_features returning keys in a stable order.
        ling_features = []
        iterator = tqdm(texts, desc="Extracting features") if show_progress else texts
        for text in iterator:
            feats = extract_linguistic_features(text)
            ling_features.append(list(feats.values()))
        ling_array = np.array(ling_features)

        # Concatenate both signals along the feature axis
        return np.hstack([embeddings, ling_array])

    def train(
        self,
        df: pd.DataFrame,
        label_map: Dict[str, int],
        test_size: float = 0.2,
    ) -> dict:
        """
        Train the ensemble classifier.

        Args:
            df: DataFrame with 'clean_text' and 'label_id' columns
            label_map: Mapping from label names to IDs
            test_size: Fraction for test split

        Returns:
            Dictionary with evaluation metrics
        """
        self.label_map = label_map
        self.inv_label_map = {v: k for k, v in label_map.items()}

        texts = df["clean_text"].tolist()
        labels = df["label_id"].values

        print(f"\nšŸ”§ Training on {len(texts)} samples...")

        # Extract features
        features = self._get_features(texts)

        # Stratified split so class proportions are preserved
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=42, stratify=labels
        )

        # Scale features (fit on train only to avoid leakage)
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)

        # Build ensemble
        lr = LogisticRegression(
            max_iter=1000, C=1.0, class_weight="balanced", random_state=42
        )
        gb = HistGradientBoostingClassifier(
            max_iter=100, max_depth=5, learning_rate=0.1, random_state=42
        )
        self.classifier = VotingClassifier(
            estimators=[("lr", lr), ("gb", gb)],
            voting="soft",
            weights=[1, 1.5],  # slight preference for the boosted trees
        )

        print("Training ensemble classifier...")
        self.classifier.fit(X_train, y_train)
        self.is_trained = True

        # Evaluate on the held-out split
        y_pred = self.classifier.predict(X_test)
        target_names = [self.inv_label_map[i] for i in sorted(self.inv_label_map)]

        report = classification_report(
            y_test, y_pred, target_names=target_names, output_dict=True
        )
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # Cross-validation on training data (cheap LR proxy for stability)
        cv_scores = cross_val_score(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            X_train,
            y_train,
            cv=5,
            scoring="f1_weighted",
        )

        metrics = {
            "accuracy": accuracy,
            "f1_weighted": f1,
            "cv_f1_mean": cv_scores.mean(),
            "cv_f1_std": cv_scores.std(),
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "target_names": target_names,
        }

        print(f"\nšŸ“ˆ Results:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   F1 (weighted): {f1:.4f}")
        print(f"   CV F1: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"\n{classification_report(y_test, y_pred, target_names=target_names)}")

        return metrics

    def predict(self, text: str) -> Dict:
        """
        Predict distress category for a single text.

        Returns:
            Dict with 'label', 'confidence', 'probabilities',
            'linguistic_features', 'clean_text'.

        Raises:
            RuntimeError: If called before train() or load().
        """
        if not self.is_trained:
            raise RuntimeError("Model not trained. Call train() or load() first.")

        clean = preprocess_text(text)
        features = self._get_features([clean], show_progress=False)
        features_scaled = self.scaler.transform(features)

        proba = self.classifier.predict_proba(features_scaled)[0]
        # predict_proba columns are ordered by classifier.classes_, which is
        # NOT guaranteed to be 0..n-1 — map through classes_ explicitly
        # instead of assuming column index == label id.
        classes = self.classifier.classes_
        pred_id = int(classes[int(np.argmax(proba))])
        pred_label = self.inv_label_map[pred_id]

        probabilities = {
            self.inv_label_map[int(c)]: float(p) for c, p in zip(classes, proba)
        }

        # Linguistic features on the raw text, for explainability
        ling = extract_linguistic_features(text)

        return {
            "label": pred_label,
            "confidence": float(probabilities[pred_label]),
            "probabilities": probabilities,
            "linguistic_features": ling,
            "clean_text": clean,
        }

    def predict_batch(self, texts: List[str]) -> List[Dict]:
        """Predict for multiple texts."""
        return [self.predict(t) for t in texts]

    def save(self, path: Optional[str] = None):
        """Save model artifacts (classifier, scaler, label maps, encoder name) to disk."""
        save_dir = path or self.model_dir
        os.makedirs(save_dir, exist_ok=True)

        joblib.dump(self.classifier, os.path.join(save_dir, self._CLASSIFIER_FILE))
        joblib.dump(self.scaler, os.path.join(save_dir, self._SCALER_FILE))
        joblib.dump(self.label_map, os.path.join(save_dir, self._LABEL_MAP_FILE))
        joblib.dump(self.inv_label_map, os.path.join(save_dir, self._INV_LABEL_MAP_FILE))
        # Persist the encoder name so load() reconstructs the SAME embedding
        # space the classifier was trained on.
        joblib.dump(
            self.embedding_model_name, os.path.join(save_dir, self._EMBEDDING_NAME_FILE)
        )
        print(f"āœ“ Model saved to {save_dir}/")

    def load(self, path: Optional[str] = None):
        """Load model from disk."""
        load_dir = path or self.model_dir

        self.classifier = joblib.load(os.path.join(load_dir, self._CLASSIFIER_FILE))
        self.scaler = joblib.load(os.path.join(load_dir, self._SCALER_FILE))
        self.label_map = joblib.load(os.path.join(load_dir, self._LABEL_MAP_FILE))
        self.inv_label_map = joblib.load(os.path.join(load_dir, self._INV_LABEL_MAP_FILE))

        # Restore the encoder the model was trained with (backward compatible:
        # older saves lack this file and keep the constructor's default).
        name_path = os.path.join(load_dir, self._EMBEDDING_NAME_FILE)
        if os.path.exists(name_path):
            saved_name = joblib.load(name_path)
            if saved_name != self.embedding_model_name:
                self.embedding_model_name = saved_name
                self.encoder = None  # force lazy reload with the correct model

        self.is_trained = True
        print(f"āœ“ Model loaded from {load_dir}/")


if __name__ == "__main__":
    from data.dataset_loader import load_all_datasets

    df, label_map = load_all_datasets(max_emotion=3000, max_go_emotions=3000)

    model = DistressClassifier()
    metrics = model.train(df, label_map)
    model.save()

    # Smoke-test a single prediction
    result = model.predict("I feel so hopeless and tired of everything")
    print(f"\nTest prediction: {result}")