"""
MindWatch — Distress Classification Model

Uses sentence-transformers embeddings + sklearn classifiers.
Designed to run on HuggingFace Spaces free tier (CPU only).
"""

import os
from typing import Tuple, Dict, List, Optional

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
)
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from utils.preprocessing import preprocess_text, extract_linguistic_features


class DistressClassifier:
    """
    Multi-signal distress classifier combining:
        1. Sentence-transformer embeddings (semantic signal)
        2. Linguistic features (psychological signal)
        3. Ensemble of LogisticRegression + HistGradientBoosting
    """

    # Filenames used by save()/load(); kept as constants so the two
    # methods cannot drift apart.
    _CLASSIFIER_FILE = "classifier.pkl"
    _SCALER_FILE = "scaler.pkl"
    _LABEL_MAP_FILE = "label_map.pkl"
    _INV_LABEL_MAP_FILE = "inv_label_map.pkl"
    _EMBEDDING_NAME_FILE = "embedding_model.pkl"

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_dir: str = "saved_models",
    ):
        """
        Args:
            embedding_model: HuggingFace model id for the sentence encoder.
            model_dir: Default directory for save()/load().
        """
        self.embedding_model_name = embedding_model
        self.model_dir = model_dir
        self.encoder: Optional[SentenceTransformer] = None  # lazy-loaded
        self.classifier = None
        self.scaler = StandardScaler()
        self.label_map: Dict[str, int] = {}
        self.inv_label_map: Dict[int, str] = {}
        self.is_trained = False

    def _load_encoder(self):
        """Lazy-load the sentence transformer (it is large and slow to init)."""
        if self.encoder is None:
            print(f"Loading embedding model: {self.embedding_model_name}")
            self.encoder = SentenceTransformer(self.embedding_model_name)

    def _get_features(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Extract combined features: embeddings + linguistic features.

        Args:
            texts: Input texts (already preprocessed).
            show_progress: Show progress bars for encoding / feature extraction.

        Returns:
            2-D array of shape (len(texts), emb_dim + n_linguistic_features).
        """
        self._load_encoder()

        # Sentence embeddings (semantic signal)
        embeddings = self.encoder.encode(
            texts, show_progress_bar=show_progress, batch_size=64
        )

        # Linguistic features (psychological signal). Relies on
        # extract_linguistic_features returning keys in a stable order.
        ling_features = []
        iterator = tqdm(texts, desc="Extracting features") if show_progress else texts
        for text in iterator:
            feats = extract_linguistic_features(text)
            ling_features.append(list(feats.values()))
        ling_array = np.array(ling_features)

        # Concatenate both signals along the feature axis
        return np.hstack([embeddings, ling_array])

    def train(
        self,
        df: pd.DataFrame,
        label_map: Dict[str, int],
        test_size: float = 0.2,
    ) -> dict:
        """
        Train the ensemble classifier.

        Args:
            df: DataFrame with 'clean_text' and 'label_id' columns
            label_map: Mapping from label names to IDs
            test_size: Fraction for test split

        Returns:
            Dictionary with evaluation metrics
        """
        self.label_map = label_map
        self.inv_label_map = {v: k for k, v in label_map.items()}

        texts = df["clean_text"].tolist()
        labels = df["label_id"].values

        print(f"\nšŸ”§ Training on {len(texts)} samples...")

        # Extract features
        features = self._get_features(texts)

        # Stratified split so class proportions are preserved
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=42, stratify=labels
        )

        # Scale features (fit on train only to avoid leakage)
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)

        # Build ensemble
        lr = LogisticRegression(
            max_iter=1000, C=1.0, class_weight="balanced", random_state=42
        )
        gb = HistGradientBoostingClassifier(
            max_iter=100, max_depth=5, learning_rate=0.1, random_state=42
        )
        self.classifier = VotingClassifier(
            estimators=[("lr", lr), ("gb", gb)],
            voting="soft",
            weights=[1, 1.5],  # slight preference for the boosted trees
        )

        print("Training ensemble classifier...")
        self.classifier.fit(X_train, y_train)
        self.is_trained = True

        # Evaluate on the held-out split
        y_pred = self.classifier.predict(X_test)
        target_names = [self.inv_label_map[i] for i in sorted(self.inv_label_map)]

        report = classification_report(
            y_test, y_pred, target_names=target_names, output_dict=True
        )
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # Cross-validation on training data (cheap LR proxy for stability)
        cv_scores = cross_val_score(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            X_train,
            y_train,
            cv=5,
            scoring="f1_weighted",
        )

        metrics = {
            "accuracy": accuracy,
            "f1_weighted": f1,
            "cv_f1_mean": cv_scores.mean(),
            "cv_f1_std": cv_scores.std(),
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "target_names": target_names,
        }

        print(f"\nšŸ“ˆ Results:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   F1 (weighted): {f1:.4f}")
        print(f"   CV F1: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"\n{classification_report(y_test, y_pred, target_names=target_names)}")

        return metrics

    def predict(self, text: str) -> Dict:
        """
        Predict distress category for a single text.

        Returns:
            Dict with 'label', 'confidence', 'probabilities',
            'linguistic_features', 'clean_text'.

        Raises:
            RuntimeError: If called before train() or load().
        """
        if not self.is_trained:
            raise RuntimeError("Model not trained. Call train() or load() first.")

        clean = preprocess_text(text)
        features = self._get_features([clean], show_progress=False)
        features_scaled = self.scaler.transform(features)

        proba = self.classifier.predict_proba(features_scaled)[0]
        # predict_proba columns are ordered by classifier.classes_, which is
        # NOT guaranteed to be 0..n-1 — map through classes_ explicitly
        # instead of assuming column index == label id.
        classes = self.classifier.classes_
        pred_id = int(classes[int(np.argmax(proba))])
        pred_label = self.inv_label_map[pred_id]

        probabilities = {
            self.inv_label_map[int(c)]: float(p) for c, p in zip(classes, proba)
        }

        # Linguistic features on the raw text, for explainability
        ling = extract_linguistic_features(text)

        return {
            "label": pred_label,
            "confidence": float(probabilities[pred_label]),
            "probabilities": probabilities,
            "linguistic_features": ling,
            "clean_text": clean,
        }

    def predict_batch(self, texts: List[str]) -> List[Dict]:
        """Predict for multiple texts."""
        return [self.predict(t) for t in texts]

    def save(self, path: Optional[str] = None):
        """Save model artifacts (classifier, scaler, label maps, encoder name) to disk."""
        save_dir = path or self.model_dir
        os.makedirs(save_dir, exist_ok=True)

        joblib.dump(self.classifier, os.path.join(save_dir, self._CLASSIFIER_FILE))
        joblib.dump(self.scaler, os.path.join(save_dir, self._SCALER_FILE))
        joblib.dump(self.label_map, os.path.join(save_dir, self._LABEL_MAP_FILE))
        joblib.dump(self.inv_label_map, os.path.join(save_dir, self._INV_LABEL_MAP_FILE))
        # Persist the encoder name so load() reconstructs the SAME embedding
        # space the classifier was trained on.
        joblib.dump(
            self.embedding_model_name, os.path.join(save_dir, self._EMBEDDING_NAME_FILE)
        )
        print(f"āœ“ Model saved to {save_dir}/")

    def load(self, path: Optional[str] = None):
        """Load model from disk."""
        load_dir = path or self.model_dir

        self.classifier = joblib.load(os.path.join(load_dir, self._CLASSIFIER_FILE))
        self.scaler = joblib.load(os.path.join(load_dir, self._SCALER_FILE))
        self.label_map = joblib.load(os.path.join(load_dir, self._LABEL_MAP_FILE))
        self.inv_label_map = joblib.load(os.path.join(load_dir, self._INV_LABEL_MAP_FILE))

        # Restore the encoder the model was trained with (backward compatible:
        # older saves lack this file and keep the constructor's default).
        name_path = os.path.join(load_dir, self._EMBEDDING_NAME_FILE)
        if os.path.exists(name_path):
            saved_name = joblib.load(name_path)
            if saved_name != self.embedding_model_name:
                self.embedding_model_name = saved_name
                self.encoder = None  # force lazy reload with the correct model

        self.is_trained = True
        print(f"āœ“ Model loaded from {load_dir}/")


if __name__ == "__main__":
    from data.dataset_loader import load_all_datasets

    df, label_map = load_all_datasets(max_emotion=3000, max_go_emotions=3000)

    model = DistressClassifier()
    metrics = model.train(df, label_map)
    model.save()

    # Smoke-test a single prediction
    result = model.predict("I feel so hopeless and tired of everything")
    print(f"\nTest prediction: {result}")