# mindwatch/models/classifier.py
# Last commit f66fec4: "Fix OSError in NLTK check and use HistGradientBoostingClassifier"
"""
MindWatch — Distress Classification Model
Uses sentence-transformers embeddings + sklearn classifiers.
Designed to run on HuggingFace Spaces free tier (CPU only).
"""
import numpy as np
import pandas as pd
import joblib
import os
from typing import Tuple, Dict, List, Optional
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
classification_report,
confusion_matrix,
accuracy_score,
f1_score,
)
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from utils.preprocessing import preprocess_text, extract_linguistic_features
class DistressClassifier:
    """
    Multi-signal distress classifier combining:

    1. Sentence-transformer embeddings (semantic signal)
    2. Linguistic features (psychological signal)
    3. Soft-voting ensemble of LogisticRegression + HistGradientBoosting
    """

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_dir: str = "saved_models",
    ):
        """
        Args:
            embedding_model: HuggingFace model id for the sentence encoder.
            model_dir: Default directory used by save() / load().
        """
        self.embedding_model_name = embedding_model
        self.model_dir = model_dir
        # Encoder is lazy-loaded on first use — it triggers a heavy download.
        self.encoder: Optional[SentenceTransformer] = None
        self.classifier = None
        self.scaler = StandardScaler()
        self.label_map: Dict[str, int] = {}
        self.inv_label_map: Dict[int, str] = {}
        self.is_trained = False

    def _load_encoder(self):
        """Lazy-load the sentence transformer (no-op if already loaded)."""
        if self.encoder is None:
            print(f"Loading embedding model: {self.embedding_model_name}")
            self.encoder = SentenceTransformer(self.embedding_model_name)

    def _get_features(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Extract combined features: embeddings + linguistic features.

        Args:
            texts: Preprocessed input texts.
            show_progress: Show progress bars for encoding / feature extraction.

        Returns:
            2-D float array of shape (len(texts), embed_dim + n_linguistic_features).
        """
        self._load_encoder()
        # Sentence embeddings (semantic signal)
        embeddings = self.encoder.encode(
            texts, show_progress_bar=show_progress, batch_size=64
        )
        # Linguistic features (psychological signal).
        # NOTE: relies on extract_linguistic_features returning its keys in a
        # stable order across calls (dicts preserve insertion order in 3.7+).
        iterator = tqdm(texts, desc="Extracting features") if show_progress else texts
        ling_array = np.array(
            [list(extract_linguistic_features(text).values()) for text in iterator]
        )
        return np.hstack([embeddings, ling_array])

    def train(
        self,
        df: pd.DataFrame,
        label_map: Dict[str, int],
        test_size: float = 0.2,
    ) -> dict:
        """
        Train the ensemble classifier.

        Args:
            df: DataFrame with 'clean_text' and 'label_id' columns.
            label_map: Mapping from label names to integer IDs.
            test_size: Fraction held out for the test split.

        Returns:
            Dictionary with evaluation metrics (accuracy, weighted F1,
            cross-validation F1, per-class report, confusion matrix).
        """
        self.label_map = label_map
        self.inv_label_map = {v: k for k, v in label_map.items()}
        texts = df["clean_text"].tolist()
        labels = df["label_id"].values
        print(f"\n🔧 Training on {len(texts)} samples...")
        # Extract features once for the whole corpus.
        features = self._get_features(texts)
        # Stratified split keeps the class balance in both halves.
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=42, stratify=labels
        )
        # Scale on train only; apply the same transform to test.
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        # Build ensemble: linear baseline + gradient boosting, soft voting
        # with a mild weight bias toward the boosted trees.
        lr = LogisticRegression(
            max_iter=1000, C=1.0, class_weight="balanced", random_state=42
        )
        gb = HistGradientBoostingClassifier(
            max_iter=100, max_depth=5, learning_rate=0.1, random_state=42
        )
        self.classifier = VotingClassifier(
            estimators=[("lr", lr), ("gb", gb)],
            voting="soft",
            weights=[1, 1.5],
        )
        print("Training ensemble classifier...")
        self.classifier.fit(X_train, y_train)
        self.is_trained = True
        # Evaluate on the held-out split.
        y_pred = self.classifier.predict(X_test)
        target_names = [self.inv_label_map[i] for i in sorted(self.inv_label_map)]
        report = classification_report(
            y_test, y_pred, target_names=target_names, output_dict=True
        )
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        # Cross-validation of a linear baseline for a stability estimate.
        # NOTE(review): X_train was already scaled on the full training split,
        # so these CV folds have minor scaler leakage — acceptable here since
        # the score is informational only.
        cv_scores = cross_val_score(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            X_train, y_train, cv=5, scoring="f1_weighted",
        )
        metrics = {
            "accuracy": accuracy,
            "f1_weighted": f1,
            "cv_f1_mean": cv_scores.mean(),
            "cv_f1_std": cv_scores.std(),
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "target_names": target_names,
        }
        print(f"\n📈 Results:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   F1 (weighted): {f1:.4f}")
        print(f"   CV F1: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"\n{classification_report(y_test, y_pred, target_names=target_names)}")
        return metrics

    def predict(self, text: str) -> Dict:
        """
        Predict distress category for a single text.

        Args:
            text: Raw input text (preprocessed internally).

        Returns:
            Dict with 'label', 'confidence', 'probabilities',
            'linguistic_features', and 'clean_text'.

        Raises:
            RuntimeError: If neither train() nor load() has been called.
        """
        if not self.is_trained:
            raise RuntimeError("Model not trained. Call train() or load() first.")
        clean = preprocess_text(text)
        features = self._get_features([clean], show_progress=False)
        features_scaled = self.scaler.transform(features)
        proba = self.classifier.predict_proba(features_scaled)[0]
        # predict_proba columns follow classifier.classes_, which is only
        # coincidentally 0..n-1 (a label id absent from training would shift
        # the columns) — map through classes_ instead of assuming identity.
        classes = self.classifier.classes_
        best = int(np.argmax(proba))
        pred_label = self.inv_label_map[int(classes[best])]
        probabilities = {
            self.inv_label_map[int(c)]: float(p) for c, p in zip(classes, proba)
        }
        # Linguistic features on the raw text, for explainability.
        ling = extract_linguistic_features(text)
        return {
            "label": pred_label,
            "confidence": float(proba[best]),
            "probabilities": probabilities,
            "linguistic_features": ling,
            "clean_text": clean,
        }

    def predict_batch(self, texts: List[str]) -> List[Dict]:
        """Predict for multiple texts (sequentially; see predict())."""
        return [self.predict(t) for t in texts]

    def save(self, path: Optional[str] = None):
        """Save classifier, scaler, label maps, and encoder name to disk."""
        save_dir = path or self.model_dir
        os.makedirs(save_dir, exist_ok=True)
        joblib.dump(self.classifier, os.path.join(save_dir, "classifier.pkl"))
        joblib.dump(self.scaler, os.path.join(save_dir, "scaler.pkl"))
        joblib.dump(self.label_map, os.path.join(save_dir, "label_map.pkl"))
        joblib.dump(self.inv_label_map, os.path.join(save_dir, "inv_label_map.pkl"))
        # Persist the encoder name so load() rebuilds matching features even
        # when the model was trained with a non-default embedding model.
        joblib.dump(
            self.embedding_model_name, os.path.join(save_dir, "encoder_name.pkl")
        )
        print(f"✓ Model saved to {save_dir}/")

    def load(self, path: Optional[str] = None):
        """Load a previously saved model from disk."""
        load_dir = path or self.model_dir
        self.classifier = joblib.load(os.path.join(load_dir, "classifier.pkl"))
        self.scaler = joblib.load(os.path.join(load_dir, "scaler.pkl"))
        self.label_map = joblib.load(os.path.join(load_dir, "label_map.pkl"))
        self.inv_label_map = joblib.load(os.path.join(load_dir, "inv_label_map.pkl"))
        # Older save() versions did not write encoder_name.pkl; fall back to
        # the constructor default in that case (backward compatible).
        encoder_name_path = os.path.join(load_dir, "encoder_name.pkl")
        if os.path.exists(encoder_name_path):
            self.embedding_model_name = joblib.load(encoder_name_path)
            self.encoder = None  # force re-load with the persisted name
        self.is_trained = True
        print(f"✓ Model loaded from {load_dir}/")
if __name__ == "__main__":
    from data.dataset_loader import load_all_datasets

    # Smoke run: train on a capped sample, persist, then sanity-check a prediction.
    dataset, labels = load_all_datasets(max_emotion=3000, max_go_emotions=3000)
    clf = DistressClassifier()
    clf.train(dataset, labels)
    clf.save()
    result = clf.predict("I feel so hopeless and tired of everything")
    print(f"\nTest prediction: {result}")