Spaces:

MatanKriel
/

social-assistent

Sleeping

social-assistent / model-prep.py

Matan Kriel

updated clustering metric in model test

2f9170f 9 days ago

9.75 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import warnings
	import os
	import torch
	import time
	import pickle
	from datasets import load_dataset
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics import silhouette_score, mean_squared_error, r2_score
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.preprocessing import LabelEncoder
	from xgboost import XGBRegressor
	from sklearn.linear_model import LinearRegression
	from dotenv import load_dotenv

	# Load environment variables
	load_dotenv()

	# Configuration
	# NOTE: this silences every warning category for the whole process.
	warnings.filterwarnings('ignore')
	pd.set_option('display.max_columns', None)
	device = "mps" if torch.backends.mps.is_available() else "cpu"

	print(f"🚀 Optimization: Running on {device.upper()} device")

	# Output directory for the benchmark / comparison plots saved below.
	# exist_ok=True replaces the exists()-then-makedirs() pair, which could
	# raise FileExistsError if the directory appeared between the two calls.
	os.makedirs('project_plots', exist_ok=True)

	# ---------------------------------------------------------
	# 1. LOAD DATA
	# ---------------------------------------------------------
	def load_data():
	    """Load the project dataset from the Hugging Face Hub into a DataFrame.

	    When the data has a 'views' column, a 'log_views' column holding
	    ``log1p(views)`` is added.

	    Returns:
	        The loaded pandas DataFrame, or an empty DataFrame when anything
	        goes wrong while loading (the error is printed, not raised), so
	        callers should check ``df.empty``.
	    """
	    print("\n[1/4] Loading Dataset from Hugging Face...")
	    try:
	        # 1. Source: Exact dataset name requested
	        dataset = load_dataset("MatanKriel/social-assitent-synthetic-data")

	        # Called without `split`, load_dataset hands back a dict of splits,
	        # and that container has no to_pandas() of its own. Prefer 'train',
	        # otherwise fall back to whichever split comes first.
	        if isinstance(dataset, dict):
	            split_name = 'train' if 'train' in dataset else next(iter(dataset))
	            df = dataset[split_name].to_pandas()
	        else:
	            df = dataset.to_pandas()

	        print(f" -> ✅ Loaded {len(df)} rows.")

	        # --- LOG TRANSFORMATION ---
	        if 'views' in df.columns:
	            df['log_views'] = np.log1p(df['views'])
	            print(" -> 📉 Applied log1p transformation to 'views'.")

	        return df
	    except Exception as e:
	        # Deliberate best-effort boundary: report the problem and let the
	        # caller decide what to do with an empty frame.
	        print(f" ❌ Error loading data: {e}")
	        return pd.DataFrame()

	# ---------------------------------------------------------
	# 2. EMBEDDING BENCHMARK
	# ---------------------------------------------------------
	def _benchmark_labels(df, sample_df):
	    """Build one silhouette label per row of *sample_df* without touching *df*.

	    Returns an array of labels, or None when there is no 'category' column.
	    """
	    if 'category' not in sample_df.columns:
	        print(" -> ⚠️ No categories found. Skipping quality metric.")
	        return None

	    if 'viral_class' in sample_df.columns:
	        viral_class = sample_df['viral_class'].astype(str)
	    elif 'views' in sample_df.columns:
	        # The High/Low cut-off is the 75th percentile of the FULL dataset,
	        # but the derived class only lives on the sample, so the caller's
	        # DataFrame does not gain a leftover 'viral_class' column.
	        threshold = df['views'].quantile(0.75)
	        viral_class = pd.Series(
	            np.where(sample_df['views'] > threshold, 'High', 'Low'),
	            index=sample_df.index,
	        )
	        print(" -> ℹ️ Created temporary 'viral_class' (High/Low) for benchmarking.")
	    else:
	        print(" -> ⚠️ 'viral_class' missing. Falling back to 'category' only.")
	        return sample_df['category'].values

	    print(" -> 🏷️ Using Composite Labels (Category + Viral Class) for metrics.")
	    # Composite Label Formula, e.g. "Fitness_High"
	    return (sample_df['category'].astype(str) + "_" + viral_class).values


	def _plot_embedding_benchmark(results):
	    """Save side-by-side bar charts of encoding time and silhouette score."""
	    res_df = pd.DataFrame(results)
	    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
	    sns.barplot(data=res_df, x='Model', y='Time (s)', ax=axes[0], palette='Blues_d')
	    axes[0].set_title('Encoding Speed (Lower is Better)')
	    sns.barplot(data=res_df, x='Model', y='Silhouette Score', ax=axes[1], palette='Greens_d')
	    axes[1].set_title('Clustering Quality (Higher is Better)')
	    plt.tight_layout()
	    # Make sure the target directory is there even if this function is
	    # used without the module-level setup having run.
	    os.makedirs('project_plots', exist_ok=True)
	    plt.savefig('project_plots/embedding_benchmark.png')
	    plt.close()


	def benchmark_and_select_model(df):
	    """Benchmark candidate embedding models and return the best one's name.

	    Up to 3000 rows are sampled (fixed seed), their 'description' text is
	    encoded with every candidate model, and the embeddings are scored with
	    the silhouette score against "category_viralclass" labels (or
	    'category' alone when no viral class can be derived). Encoding time and
	    score are printed and plotted to
	    'project_plots/embedding_benchmark.png'.

	    Args:
	        df: DataFrame with a 'description' column and, ideally, 'category'
	            plus 'viral_class' or 'views'. It is not modified.

	    Returns:
	        The full name of the model with the highest silhouette score. The
	        first candidate is returned when no model could be scored.
	    """
	    print("\n[2/4] Benchmarking Embedding Models...")

	    models = [
	        "sentence-transformers/all-MiniLM-L6-v2",
	        "sentence-transformers/all-mpnet-base-v2",
	        "BAAI/bge-small-en-v1.5",
	    ]
	    best_model_name = models[0]  # Default

	    # Sample for speed
	    sample_df = df.sample(min(len(df), 3000), random_state=42)
	    sample_texts = sample_df['description'].fillna("").tolist()
	    sample_labels = _benchmark_labels(df, sample_df)

	    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1.
	    # Bail out before loading any model when the labels cannot be scored;
	    # otherwise every model would be downloaded and run only to fail.
	    if sample_labels is None:
	        n_labels = 0
	    else:
	        n_labels = pd.Series(sample_labels).nunique(dropna=False)
	    if not 2 <= n_labels <= len(sample_df) - 1:
	        print(f" -> ⚠️ Cannot compute a silhouette score with {n_labels} distinct "
	              f"label(s) over {len(sample_df)} sample(s). Using default model.")
	        print(f"🏆 Winner: {best_model_name}")
	        return best_model_name

	    print(f"{'Model':<40} | {'Time (s)':<10} | {'Silhouette':<10}")
	    print("-" * 65)

	    results = []
	    best_score = -2  # below the silhouette minimum of -1

	    for model_name in models:
	        try:
	            st_model = SentenceTransformer(model_name, device=device)
	            start_t = time.time()
	            embeddings = st_model.encode(sample_texts, convert_to_numpy=True, show_progress_bar=False)
	            time_taken = time.time() - start_t
	            score = silhouette_score(embeddings, sample_labels)
	        except Exception as e:
	            # Best effort: one model failing to load or encode must not
	            # abort the benchmark of the remaining candidates.
	            print(f"❌ Error with {model_name}: {e}")
	            continue

	        results.append({
	            "Model": model_name.split('/')[-1],
	            "Time (s)": time_taken,
	            "Silhouette Score": score,
	        })

	        print(f"{model_name:<40} | {time_taken:.2f} | {score:.4f}")

	        if score > best_score:
	            best_score = score
	            best_model_name = model_name

	    print("-" * 65)
	    if not results:
	        print(" -> ⚠️ No model could be benchmarked. Using default model.")
	    print(f"🏆 Winner: {best_model_name}")

	    # Plotting Benchmark
	    if results:
	        _plot_embedding_benchmark(results)

	    return best_model_name

	# ---------------------------------------------------------
	# 3. GENERATE KNOWLEDGE BASE (EMBEDDINGS)
	# ---------------------------------------------------------
	def generate_embeddings(df, model_name):
	    """Embed every row's 'description' with the chosen model.

	    Missing descriptions are encoded as the empty string. The vectors are
	    written to a new 'embedding' column (one array per row) on *df* itself,
	    and the same DataFrame is returned.
	    """
	    print(f"\n[3/4] Generating Embeddings with Winner ({model_name})...")

	    encoder = SentenceTransformer(model_name, device=device)
	    texts = df['description'].fillna("").tolist()
	    vectors = encoder.encode(
	        texts,
	        convert_to_numpy=True,
	        show_progress_bar=True,
	    )
	    # One array per row, so the column holds a vector in each cell.
	    df['embedding'] = list(vectors)
	    return df

	# ---------------------------------------------------------
	# 4. TRAIN REGRESSION MODEL
	# ---------------------------------------------------------
	def _build_feature_matrix(df):
	    """Return (X, meta_cols): embeddings stacked with metadata features.

	    Numeric columns are coerced to numbers (bad/missing -> 0) and
	    categorical columns are label-encoded (missing -> 'Unknown'). All work
	    happens in a separate frame, so *df* is left untouched.
	    """
	    X_text = np.stack(df['embedding'].values)

	    # Age is NUMERIC
	    num_cols = ['duration', 'hour_of_day', 'followers', 'age']
	    cat_cols = ['category', 'gender', 'day_of_week']

	    meta = pd.DataFrame(index=df.index)
	    for c in num_cols:
	        if c in df.columns:
	            meta[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)
	    for c in cat_cols:
	        if c in df.columns:
	            # NOTE(review): the fitted encoders are not saved with the
	            # model, so inference code has to rebuild the same
	            # string -> integer mapping on its own.
	            values = df[c].fillna('Unknown').astype(str)
	            meta[c + '_encoded'] = LabelEncoder().fit_transform(values)

	    final_meta_cols = list(meta.columns)
	    X = np.hstack((X_text, meta.to_numpy(dtype=float)))
	    return X, final_meta_cols


	def _plot_regression_comparison(results):
	    """Save side-by-side bar charts of RMSE and R² for each regressor."""
	    res_df = pd.DataFrame(results)
	    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
	    sns.barplot(data=res_df, x='Model', y='RMSE', ax=axes[0], palette='Reds_d')
	    axes[0].set_title('Prediction Error (RMSE) - Lower is Better')
	    sns.barplot(data=res_df, x='Model', y='R2', ax=axes[1], palette='Greens_d')
	    axes[1].set_title('Explained Variance (R²) - Higher is Better')
	    plt.tight_layout()
	    os.makedirs('project_plots', exist_ok=True)
	    plt.savefig('project_plots/regression_comparison.png')
	    plt.close()


	def train_regressor(df):
	    """Train several regressors on log-views and keep the most accurate one.

	    Features are the text embeddings plus whichever of the numeric
	    ('duration', 'hour_of_day', 'followers', 'age') and categorical
	    ('category', 'gender', 'day_of_week') columns exist in *df*. The target
	    is 'log_views'. Models are compared on a held-out 20% of the rows; RMSE
	    is reported in real view counts, R² on the log scale. The model with the
	    lowest RMSE is pickled to 'viral_model.pkl', and a comparison plot is
	    written to 'project_plots/regression_comparison.png'.

	    Args:
	        df: DataFrame with an 'embedding' column and a 'log_views' (or
	            'views') column. It is not modified.

	    Returns:
	        The fitted best model.

	    Raises:
	        KeyError: if neither 'log_views' nor 'views' is present.
	        ValueError: if there are fewer than 2 rows to split.
	    """
	    print("\n[4/4] Training View Prediction Model...")

	    print(" -> Defining strict feature sets...")
	    X, final_meta_cols = _build_feature_matrix(df)
	    print(f" -> Final Features: Embeddings + {final_meta_cols}")

	    # Log-Target Logic
	    if 'log_views' in df.columns:
	        y = df['log_views'].values
	    elif 'views' in df.columns:
	        y = np.log1p(df['views']).values
	    else:
	        raise KeyError("train_regressor needs a 'log_views' or 'views' column as target")

	    n_rows = len(df)
	    if n_rows < 2:
	        raise ValueError("Need at least 2 rows to build a train/test split")

	    # Split 80/20 on a seeded shuffle: a plain head/tail slice would make
	    # the test set depend on whatever order the dataset happens to be in.
	    order = np.random.default_rng(42).permutation(n_rows)
	    split = int(n_rows * 0.8)
	    train_idx, test_idx = order[:split], order[split:]
	    X_train, X_test = X[train_idx], X[test_idx]
	    y_train, y_test = y[train_idx], y[test_idx]

	    # Models
	    models = {
	        "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42),
	        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1, random_state=42),
	        "LinearReg": LinearRegression(),
	    }

	    best_model = None
	    best_rmse = float('inf')
	    results = []

	    # INVERSE TRANSFORM of the targets: Log -> Real (same for every model)
	    y_real = np.expm1(y_test)

	    print(f"{'Model':<15} | {'RMSE (Views)':<15} | {'R²':<10}")
	    print("-" * 45)

	    for name, model in models.items():
	        model.fit(X_train, y_train)
	        preds_log = model.predict(X_test)

	        # INVERSE TRANSFORM: Log -> Real, clipping negatives to zero
	        preds_real = np.maximum(np.expm1(preds_log), 0)

	        rmse = np.sqrt(mean_squared_error(y_real, preds_real))
	        # R² is measured on the log scale the models were trained on.
	        r2 = r2_score(y_test, preds_log)

	        results.append({"Model": name, "RMSE": rmse, "R2": r2})
	        print(f"{name:<15} | {rmse:,.0f} | {r2:.3f}")

	        if rmse < best_rmse:
	            best_rmse = rmse
	            best_model = model

	    print("-" * 45)
	    print(f"🏆 Best Regressor: {type(best_model).__name__}")

	    # Plotting Model Comparison
	    if results:
	        _plot_regression_comparison(results)

	    # Save Model
	    with open("viral_model.pkl", "wb") as f:
	        pickle.dump(best_model, f)
	    print(" -> ✅ Model saved to 'viral_model.pkl'")

	    return best_model

	if __name__ == "__main__":
	    # Pipeline: load -> pick embedding model -> embed -> train regressor.
	    # Nothing runs past the load step when no data came back.
	    frame = load_data()
	    has_data = not frame.empty
	    if has_data:
	        winner = benchmark_and_select_model(frame)
	        frame = generate_embeddings(frame, winner)
	        train_regressor(frame)