| """ |
| MindWatch — Distress Classification Model |
| Uses sentence-transformers embeddings + sklearn classifiers. |
| Designed to run on HuggingFace Spaces free tier (CPU only). |
| """ |
|
|
| import numpy as np |
| import pandas as pd |
| import joblib |
| import os |
| from typing import Tuple, Dict, List, Optional |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier |
| from sklearn.model_selection import train_test_split, cross_val_score |
| from sklearn.metrics import ( |
| classification_report, |
| confusion_matrix, |
| accuracy_score, |
| f1_score, |
| ) |
| from sklearn.preprocessing import StandardScaler |
| from sentence_transformers import SentenceTransformer |
| from tqdm import tqdm |
|
|
| from utils.preprocessing import preprocess_text, extract_linguistic_features |
|
|
|
|
class DistressClassifier:
    """
    Multi-signal distress classifier combining:
    1. Sentence-transformer embeddings (semantic signal)
    2. Linguistic features (psychological signal)
    3. Soft-voting ensemble of LogisticRegression + HistGradientBoosting

    Typical usage:
        clf = DistressClassifier()
        clf.train(df, label_map)
        clf.save()
        result = clf.predict("some text")
    """

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_dir: str = "saved_models",
    ):
        """
        Args:
            embedding_model: HuggingFace model ID used for sentence embeddings.
            model_dir: Default directory for save()/load().
        """
        self.embedding_model_name = embedding_model
        self.model_dir = model_dir
        # Encoder is lazy-loaded so construction stays cheap — important on
        # CPU-only free-tier hosts where model download/load is slow.
        self.encoder: Optional[SentenceTransformer] = None
        self.classifier = None
        self.scaler = StandardScaler()
        self.label_map: Dict[str, int] = {}
        self.inv_label_map: Dict[int, str] = {}
        self.is_trained = False

    def _load_encoder(self) -> None:
        """Lazy-load the sentence transformer on first use."""
        if self.encoder is None:
            print(f"Loading embedding model: {self.embedding_model_name}")
            self.encoder = SentenceTransformer(self.embedding_model_name)

    def _get_features(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Extract combined features: sentence embedding concatenated with
        hand-crafted linguistic features, per text.

        Args:
            texts: Input texts (assumed already preprocessed).
            show_progress: Show progress bars for encoding/feature extraction.

        Returns:
            2D array of shape (len(texts), embed_dim + n_linguistic_features).
        """
        self._load_encoder()

        embeddings = self.encoder.encode(
            texts, show_progress_bar=show_progress, batch_size=64
        )

        # NOTE: feature column order relies on extract_linguistic_features
        # returning its keys in a stable order (dict insertion order, 3.7+).
        iterator = tqdm(texts, desc="Extracting features") if show_progress else texts
        ling_array = np.array(
            [list(extract_linguistic_features(text).values()) for text in iterator]
        )

        return np.hstack([embeddings, ling_array])

    def train(
        self,
        df: pd.DataFrame,
        label_map: Dict[str, int],
        test_size: float = 0.2,
    ) -> dict:
        """
        Train the ensemble classifier on a labeled DataFrame.

        Args:
            df: DataFrame with 'clean_text' and 'label_id' columns
            label_map: Mapping from label names to IDs
            test_size: Fraction held out for the evaluation split

        Returns:
            Dictionary with evaluation metrics (accuracy, weighted F1,
            cross-validation F1, per-class report, confusion matrix).
        """
        self.label_map = label_map
        self.inv_label_map = {v: k for k, v in label_map.items()}

        texts = df["clean_text"].tolist()
        labels = df["label_id"].values

        print(f"\n🔧 Training on {len(texts)} samples...")

        features = self._get_features(texts)

        # Stratify so rare classes appear in both splits.
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=42, stratify=labels
        )

        # Fit the scaler on train only to avoid leakage into the test split.
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)

        # class_weight="balanced" compensates for label imbalance in the LR;
        # HistGradientBoosting handles it reasonably via its loss.
        lr = LogisticRegression(
            max_iter=1000, C=1.0, class_weight="balanced", random_state=42
        )
        gb = HistGradientBoostingClassifier(
            max_iter=100, max_depth=5, learning_rate=0.1, random_state=42
        )

        # Soft voting averages predicted probabilities; the boosted model is
        # weighted slightly higher than the linear one.
        self.classifier = VotingClassifier(
            estimators=[("lr", lr), ("gb", gb)],
            voting="soft",
            weights=[1, 1.5],
        )

        print("Training ensemble classifier...")
        self.classifier.fit(X_train, y_train)
        self.is_trained = True

        y_pred = self.classifier.predict(X_test)
        target_names = [self.inv_label_map[i] for i in sorted(self.inv_label_map)]

        report = classification_report(
            y_test, y_pred, target_names=target_names, output_dict=True
        )
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # CV uses a plain LogisticRegression as a cheap proxy — re-fitting
        # the full ensemble 5x would be slow on CPU-only hosts.
        cv_scores = cross_val_score(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            X_train, y_train, cv=5, scoring="f1_weighted",
        )

        metrics = {
            "accuracy": accuracy,
            "f1_weighted": f1,
            "cv_f1_mean": cv_scores.mean(),
            "cv_f1_std": cv_scores.std(),
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "target_names": target_names,
        }

        print(f"\n📈 Results:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  F1 (weighted): {f1:.4f}")
        print(f"  CV F1: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"\n{classification_report(y_test, y_pred, target_names=target_names)}")

        return metrics

    def predict(self, text: str) -> Dict:
        """
        Predict distress category for a single text.

        Args:
            text: Raw input text (preprocessing is applied internally).

        Returns:
            Dict with 'label', 'confidence', 'probabilities',
            'linguistic_features', 'clean_text'.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise RuntimeError("Model not trained. Call train() or load() first.")

        clean = preprocess_text(text)
        features = self._get_features([clean], show_progress=False)
        features_scaled = self.scaler.transform(features)

        proba = self.classifier.predict_proba(features_scaled)[0]
        # BUGFIX: predict_proba columns are ordered by classifier.classes_,
        # which equals 0..n-1 only when every label ID appeared in training
        # and IDs are contiguous. Map through classes_ instead of assuming
        # column index == label ID.
        classes = self.classifier.classes_
        best = int(np.argmax(proba))
        pred_label = self.inv_label_map[int(classes[best])]

        probabilities = {
            self.inv_label_map[int(c)]: float(p) for c, p in zip(classes, proba)
        }

        # Linguistic features are reported from the *raw* text so callers see
        # signals computed before preprocessing altered the input.
        ling = extract_linguistic_features(text)

        return {
            "label": pred_label,
            "confidence": float(proba[best]),
            "probabilities": probabilities,
            "linguistic_features": ling,
            "clean_text": clean,
        }

    def predict_batch(self, texts: List[str]) -> List[Dict]:
        """Predict for multiple texts (sequentially; see predict())."""
        return [self.predict(t) for t in texts]

    def save(self, path: Optional[str] = None):
        """Persist classifier, scaler, label maps and encoder name to disk."""
        save_dir = path or self.model_dir
        os.makedirs(save_dir, exist_ok=True)

        joblib.dump(self.classifier, os.path.join(save_dir, "classifier.pkl"))
        joblib.dump(self.scaler, os.path.join(save_dir, "scaler.pkl"))
        joblib.dump(self.label_map, os.path.join(save_dir, "label_map.pkl"))
        joblib.dump(self.inv_label_map, os.path.join(save_dir, "inv_label_map.pkl"))
        # Record which embedding model produced the features so load() can
        # restore a dimensionally-compatible encoder.
        joblib.dump(
            self.embedding_model_name,
            os.path.join(save_dir, "embedding_model.pkl"),
        )
        print(f"✓ Model saved to {save_dir}/")

    def load(self, path: Optional[str] = None):
        """
        Load model from disk.

        Backward-compatible: saves produced before embedding_model.pkl was
        written still load; the current encoder name is kept in that case.
        """
        load_dir = path or self.model_dir
        self.classifier = joblib.load(os.path.join(load_dir, "classifier.pkl"))
        self.scaler = joblib.load(os.path.join(load_dir, "scaler.pkl"))
        self.label_map = joblib.load(os.path.join(load_dir, "label_map.pkl"))
        self.inv_label_map = joblib.load(os.path.join(load_dir, "inv_label_map.pkl"))
        meta_path = os.path.join(load_dir, "embedding_model.pkl")
        if os.path.exists(meta_path):
            self.embedding_model_name = joblib.load(meta_path)
            # Force re-load with the recorded model name on next encode.
            self.encoder = None
        self.is_trained = True
        print(f"✓ Model loaded from {load_dir}/")
|
|
|
|
if __name__ == "__main__":
    from data.dataset_loader import load_all_datasets

    # Smoke-test pipeline: load data, train, persist, run one prediction.
    dataset_df, labels = load_all_datasets(max_emotion=3000, max_go_emotions=3000)

    clf = DistressClassifier()
    train_metrics = clf.train(dataset_df, labels)
    clf.save()

    result = clf.predict("I feel so hopeless and tired of everything")
    print(f"\nTest prediction: {result}")
|
|