# mindwatch/models/classifier.py
# Last commit f66fec4: "Fix OSError in NLTK check and use HistGradientBoostingClassifier"
"""
MindWatch — Distress Classification Model
Uses sentence-transformers embeddings + sklearn classifiers.
Designed to run on HuggingFace Spaces free tier (CPU only).
"""
import numpy as np
import pandas as pd
import joblib
import os
from typing import Tuple, Dict, List, Optional
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
classification_report,
confusion_matrix,
accuracy_score,
f1_score,
)
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from utils.preprocessing import preprocess_text, extract_linguistic_features
class DistressClassifier:
    """
    Multi-signal distress classifier combining:

    1. Sentence-transformer embeddings (semantic signal)
    2. Linguistic features (psychological signal)
    3. Soft-voting ensemble of LogisticRegression + HistGradientBoosting
    """

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        model_dir: str = "saved_models",
    ):
        """
        Args:
            embedding_model: HuggingFace model id for the sentence encoder.
            model_dir: Default directory used by save() / load().
        """
        self.embedding_model_name = embedding_model
        self.model_dir = model_dir
        # Encoder is lazy-loaded on first use — it triggers a heavy download.
        self.encoder: Optional[SentenceTransformer] = None
        self.classifier = None
        self.scaler = StandardScaler()
        self.label_map: Dict[str, int] = {}
        self.inv_label_map: Dict[int, str] = {}
        self.is_trained = False

    def _load_encoder(self):
        """Lazy-load the sentence transformer (no-op if already loaded)."""
        if self.encoder is None:
            print(f"Loading embedding model: {self.embedding_model_name}")
            self.encoder = SentenceTransformer(self.embedding_model_name)

    def _get_features(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Extract combined features: embeddings + linguistic features.

        Args:
            texts: Preprocessed input texts.
            show_progress: Show progress bars for encoding / feature extraction.

        Returns:
            2-D float array of shape (len(texts), embed_dim + n_linguistic_features).
        """
        self._load_encoder()
        # Sentence embeddings (semantic signal)
        embeddings = self.encoder.encode(
            texts, show_progress_bar=show_progress, batch_size=64
        )
        # Linguistic features (psychological signal).
        # NOTE: relies on extract_linguistic_features returning its keys in a
        # stable order across calls (dicts preserve insertion order in 3.7+).
        iterator = tqdm(texts, desc="Extracting features") if show_progress else texts
        ling_array = np.array(
            [list(extract_linguistic_features(text).values()) for text in iterator]
        )
        return np.hstack([embeddings, ling_array])

    def train(
        self,
        df: pd.DataFrame,
        label_map: Dict[str, int],
        test_size: float = 0.2,
    ) -> dict:
        """
        Train the ensemble classifier.

        Args:
            df: DataFrame with 'clean_text' and 'label_id' columns.
            label_map: Mapping from label names to integer IDs.
            test_size: Fraction held out for the test split.

        Returns:
            Dictionary with evaluation metrics (accuracy, weighted F1,
            cross-validation F1, per-class report, confusion matrix).
        """
        self.label_map = label_map
        self.inv_label_map = {v: k for k, v in label_map.items()}
        texts = df["clean_text"].tolist()
        labels = df["label_id"].values
        print(f"\n🔧 Training on {len(texts)} samples...")
        # Extract features once for the whole corpus.
        features = self._get_features(texts)
        # Stratified split keeps the class balance in both halves.
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=42, stratify=labels
        )
        # Scale on train only; apply the same transform to test.
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        # Build ensemble: linear baseline + gradient boosting, soft voting
        # with a mild weight bias toward the boosted trees.
        lr = LogisticRegression(
            max_iter=1000, C=1.0, class_weight="balanced", random_state=42
        )
        gb = HistGradientBoostingClassifier(
            max_iter=100, max_depth=5, learning_rate=0.1, random_state=42
        )
        self.classifier = VotingClassifier(
            estimators=[("lr", lr), ("gb", gb)],
            voting="soft",
            weights=[1, 1.5],
        )
        print("Training ensemble classifier...")
        self.classifier.fit(X_train, y_train)
        self.is_trained = True
        # Evaluate on the held-out split.
        y_pred = self.classifier.predict(X_test)
        target_names = [self.inv_label_map[i] for i in sorted(self.inv_label_map)]
        report = classification_report(
            y_test, y_pred, target_names=target_names, output_dict=True
        )
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        # Cross-validation of a linear baseline for a stability estimate.
        # NOTE(review): X_train was already scaled on the full training split,
        # so these CV folds have minor scaler leakage — acceptable here since
        # the score is informational only.
        cv_scores = cross_val_score(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            X_train, y_train, cv=5, scoring="f1_weighted",
        )
        metrics = {
            "accuracy": accuracy,
            "f1_weighted": f1,
            "cv_f1_mean": cv_scores.mean(),
            "cv_f1_std": cv_scores.std(),
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "target_names": target_names,
        }
        print(f"\n📈 Results:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   F1 (weighted): {f1:.4f}")
        print(f"   CV F1: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"\n{classification_report(y_test, y_pred, target_names=target_names)}")
        return metrics

    def predict(self, text: str) -> Dict:
        """
        Predict distress category for a single text.

        Args:
            text: Raw input text (preprocessed internally).

        Returns:
            Dict with 'label', 'confidence', 'probabilities',
            'linguistic_features', and 'clean_text'.

        Raises:
            RuntimeError: If neither train() nor load() has been called.
        """
        if not self.is_trained:
            raise RuntimeError("Model not trained. Call train() or load() first.")
        clean = preprocess_text(text)
        features = self._get_features([clean], show_progress=False)
        features_scaled = self.scaler.transform(features)
        proba = self.classifier.predict_proba(features_scaled)[0]
        # predict_proba columns follow classifier.classes_, which is only
        # coincidentally 0..n-1 (a label id absent from training would shift
        # the columns) — map through classes_ instead of assuming identity.
        classes = self.classifier.classes_
        best = int(np.argmax(proba))
        pred_label = self.inv_label_map[int(classes[best])]
        probabilities = {
            self.inv_label_map[int(c)]: float(p) for c, p in zip(classes, proba)
        }
        # Linguistic features on the raw text, for explainability.
        ling = extract_linguistic_features(text)
        return {
            "label": pred_label,
            "confidence": float(proba[best]),
            "probabilities": probabilities,
            "linguistic_features": ling,
            "clean_text": clean,
        }

    def predict_batch(self, texts: List[str]) -> List[Dict]:
        """Predict for multiple texts (sequentially; see predict())."""
        return [self.predict(t) for t in texts]

    def save(self, path: Optional[str] = None):
        """Save classifier, scaler, label maps, and encoder name to disk."""
        save_dir = path or self.model_dir
        os.makedirs(save_dir, exist_ok=True)
        joblib.dump(self.classifier, os.path.join(save_dir, "classifier.pkl"))
        joblib.dump(self.scaler, os.path.join(save_dir, "scaler.pkl"))
        joblib.dump(self.label_map, os.path.join(save_dir, "label_map.pkl"))
        joblib.dump(self.inv_label_map, os.path.join(save_dir, "inv_label_map.pkl"))
        # Persist the encoder name so load() rebuilds matching features even
        # when the model was trained with a non-default embedding model.
        joblib.dump(
            self.embedding_model_name, os.path.join(save_dir, "encoder_name.pkl")
        )
        print(f"✓ Model saved to {save_dir}/")

    def load(self, path: Optional[str] = None):
        """Load a previously saved model from disk."""
        load_dir = path or self.model_dir
        self.classifier = joblib.load(os.path.join(load_dir, "classifier.pkl"))
        self.scaler = joblib.load(os.path.join(load_dir, "scaler.pkl"))
        self.label_map = joblib.load(os.path.join(load_dir, "label_map.pkl"))
        self.inv_label_map = joblib.load(os.path.join(load_dir, "inv_label_map.pkl"))
        # Older save() versions did not write encoder_name.pkl; fall back to
        # the constructor default in that case (backward compatible).
        encoder_name_path = os.path.join(load_dir, "encoder_name.pkl")
        if os.path.exists(encoder_name_path):
            self.embedding_model_name = joblib.load(encoder_name_path)
            self.encoder = None  # force re-load with the persisted name
        self.is_trained = True
        print(f"✓ Model loaded from {load_dir}/")
if __name__ == "__main__":
    from data.dataset_loader import load_all_datasets

    # Smoke run: train on a capped sample, persist, then sanity-check a prediction.
    dataset, labels = load_all_datasets(max_emotion=3000, max_go_emotions=3000)
    clf = DistressClassifier()
    clf.train(dataset, labels)
    clf.save()
    result = clf.predict("I feel so hopeless and tired of everything")
    print(f"\nTest prediction: {result}")