import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import torch
import time
import pickle
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import silhouette_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configuration
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"🚀 Optimization: Running on {device.upper()} device")

# exist_ok avoids the check-then-create race of `if not exists: makedirs`
os.makedirs('project_plots', exist_ok=True)

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
def load_data():
    """Download the synthetic social-media dataset from Hugging Face.

    Returns:
        pd.DataFrame: the 'train' split (or the raw dataset when no
        splits exist), with an extra 'log_views' column (log1p of
        'views') when a 'views' column is present. Returns an empty
        DataFrame on any failure so callers can check ``df.empty``.
    """
    print("\n[1/4] Loading Dataset from Hugging Face...")
    try:
        # NOTE(review): 'assitent' looks like a typo, but it is the
        # literal dataset id on the Hub — do not "correct" it.
        dataset = load_dataset("MatanKriel/social-assitent-synthetic-data")
        if 'train' in dataset:
            df = dataset['train'].to_pandas()
        else:
            df = dataset.to_pandas()
        print(f" -> ✅ Loaded {len(df)} rows.")

        # --- LOG TRANSFORMATION ---
        # View counts are heavy-tailed; log1p compresses the range and
        # is defined at views == 0.
        if 'views' in df.columns:
            df['log_views'] = np.log1p(df['views'])
            print(" -> 📉 Applied log1p transformation to 'views'.")
        return df
    except Exception as e:
        # Best-effort loader: report and hand back an empty frame.
        print(f" ❌ Error loading data: {e}")
        return pd.DataFrame()

# ---------------------------------------------------------
# 2.
# EMBEDDING BENCHMARK
# ---------------------------------------------------------
def benchmark_and_select_model(df):
    """Benchmark candidate embedding models and return the best model id.

    Encodes a sample (<= 3000 rows) of 'description' texts with each
    candidate, measures encoding time and silhouette score over
    composite "Category_ViralClass" labels, writes a comparison plot to
    project_plots/embedding_benchmark.png, and returns the model name
    with the highest silhouette (first candidate when none scored).

    Side effect: may add a 'viral_class' column (High/Low by the 75th
    percentile of 'views') to ``df``.
    """
    print("\n[2/4] Benchmarking Embedding Models...")
    models = [
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/all-mpnet-base-v2",
        "BAAI/bge-small-en-v1.5"
    ]
    results = []

    # Create Composite Labels for Silhouette Score
    # Goal: Use "Category_ViralClass" (e.g., "Fitness_High") to measure separation

    # 1. Ensure viral_class exists for benchmarking (top quartile = 'High')
    if 'viral_class' not in df.columns and 'views' in df.columns:
        threshold = df['views'].quantile(0.75)
        df['viral_class'] = np.where(df['views'] > threshold, 'High', 'Low')
        print(f" -> ℹ️ Created temporary 'viral_class' (High/Low) for benchmarking.")

    # 2. Report which labelling scheme will be used
    if 'category' in df.columns and 'viral_class' in df.columns:
        print(" -> 🏷️ Using Composite Labels (Category + Viral Class) for metrics.")
    elif 'category' in df.columns:
        print(" -> ⚠️ 'viral_class' missing. Falling back to 'category' only.")
    else:
        print(" -> ⚠️ No categories found. Skipping quality metric.")

    # Sample for speed (using the updated df which might have viral_class)
    sample_df = df.sample(min(len(df), 3000), random_state=42)
    sample_texts = sample_df['description'].fillna("").tolist()

    if 'category' in sample_df.columns and 'viral_class' in sample_df.columns:
        # Composite Label Formula
        sample_labels = (sample_df['category'].astype(str) + "_" +
                         sample_df['viral_class'].astype(str)).values
    elif 'category' in sample_df.columns:
        sample_labels = sample_df['category'].values
    else:
        sample_labels = np.zeros(len(sample_df))

    # BUG FIX: silhouette_score raises ValueError unless there are at
    # least 2 distinct labels. With the all-zero fallback above, every
    # model previously hit the except-branch instead of actually
    # "skipping the quality metric". Guard once here instead.
    can_score = len(np.unique(sample_labels)) > 1

    print(f"{'Model':<40} | {'Time (s)':<10} | {'Silhouette':<10}")
    print("-" * 65)

    best_score = -2  # silhouette lives in [-1, 1]; -2 guarantees first update
    best_model_name = models[0]  # Default

    for model_name in models:
        try:
            st_model = SentenceTransformer(model_name, device=device)
            start_t = time.time()
            embeddings = st_model.encode(sample_texts, convert_to_numpy=True,
                                         show_progress_bar=False)
            time_taken = time.time() - start_t

            score = silhouette_score(embeddings, sample_labels) if can_score else float('nan')

            results.append({
                "Model": model_name.split('/')[-1],
                "Time (s)": time_taken,
                "Silhouette Score": score
            })
            print(f"{model_name:<40} | {time_taken:.2f} | {score:.4f}")

            if can_score and score > best_score:
                best_score = score
                best_model_name = model_name
        except Exception as e:
            # Best-effort benchmark: a failing model is reported and skipped.
            print(f"❌ Error with {model_name}: {e}")

    print("-" * 65)
    print(f"🏆 Winner: {best_model_name}")

    # Plotting Benchmark
    if results:
        res_df = pd.DataFrame(results)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.barplot(data=res_df, x='Model', y='Time (s)', ax=axes[0], palette='Blues_d')
        axes[0].set_title('Encoding Speed (Lower is Better)')
        sns.barplot(data=res_df, x='Model', y='Silhouette Score', ax=axes[1], palette='Greens_d')
        axes[1].set_title('Clustering Quality (Higher is Better)')
        plt.tight_layout()
        plt.savefig('project_plots/embedding_benchmark.png')
        plt.close()

    return best_model_name

# ---------------------------------------------------------
# 3.
# GENERATE KNOWLEDGE BASE (EMBEDDINGS)
# ---------------------------------------------------------
def generate_embeddings(df, model_name):
    """Encode every 'description' with the winning model.

    Stores one vector per row in a new 'embedding' column and returns
    the (mutated) DataFrame.
    """
    print(f"\n[3/4] Generating Embeddings with Winner ({model_name})...")
    st_model = SentenceTransformer(model_name, device=device)
    embeddings = st_model.encode(df['description'].fillna("").tolist(),
                                 convert_to_numpy=True, show_progress_bar=True)
    df['embedding'] = list(embeddings)
    return df

# ---------------------------------------------------------
# 4. TRAIN REGRESSION MODEL
# ---------------------------------------------------------
def train_regressor(df):
    """Train and compare view-count regressors on embeddings + metadata.

    Expects an 'embedding' column (from generate_embeddings). Builds the
    feature matrix from embeddings plus numeric/label-encoded metadata
    columns, fits RandomForest / XGBoost / LinearRegression on the
    log1p target, reports RMSE on the real scale and R² on the log
    scale, saves a comparison plot and pickles the best (lowest-RMSE)
    model to 'viral_model.pkl'.

    Side effects: mutates ``df`` (coerces/fills numerics, fills
    categorical NaNs, adds *_encoded and possibly 'log_views' columns).

    Returns:
        The best fitted estimator.
    """
    print("\n[4/4] Training View Prediction Model...")

    X_text = np.stack(df['embedding'].values)

    print(" -> Defining strict feature sets...")
    # Age is NUMERIC
    num_cols = ['duration', 'hour_of_day', 'followers', 'age']
    cat_cols = ['category', 'gender', 'day_of_week']

    # Verify cols exist
    real_num = [c for c in num_cols if c in df.columns]
    real_cat = [c for c in cat_cols if c in df.columns]

    # Fill missing for numerics
    for c in real_num:
        # Ensure it is float/int
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)

    # Process Categoricals
    # NOTE(review): encoders are fit on the full frame before the split.
    # Harmless for plain label-encoding, but worth knowing in a leakage audit.
    cat_encoded_names = []
    for c in real_cat:
        df[c] = df[c].fillna('Unknown')
        le = LabelEncoder()
        new_col = c + '_encoded'
        df[new_col] = le.fit_transform(df[c].astype(str))
        cat_encoded_names.append(new_col)

    final_meta_cols = real_num + cat_encoded_names
    print(f" -> Final Features: Embeddings + {final_meta_cols}")
    X_meta = df[final_meta_cols].values

    # Combine
    X = np.hstack((X_text, X_meta))

    # Log-Target Logic
    # ROBUSTNESS FIX: previously a hard KeyError when load_data() did not
    # create 'log_views'; derive it here if possible.
    if 'log_views' not in df.columns and 'views' in df.columns:
        df['log_views'] = np.log1p(df['views'])
    y = df['log_views'].values

    # Split (sequential 80/20; assumes rows are not ordered by target —
    # TODO(review): confirm, otherwise shuffle before splitting)
    split = int(len(df) * 0.8)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # Models — seeded so "best model" selection is reproducible run-to-run
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10,
                                              n_jobs=-1, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1,
                                n_jobs=-1, random_state=42),
        "LinearReg": LinearRegression()
    }

    best_model = None
    best_rmse = float('inf')
    results = []

    print(f"{'Model':<15} | {'RMSE (Views)':<15} | {'R²':<10}")
    print("-" * 45)

    for name, model in models.items():
        model.fit(X_train, y_train)
        preds_log = model.predict(X_test)

        # INVERSE TRANSFORM: Log -> Real
        preds_real = np.expm1(preds_log)
        y_real = np.expm1(y_test)

        # Clip negatives (expm1 of a very negative prediction can be < 0)
        preds_real = np.maximum(preds_real, 0)

        rmse = np.sqrt(mean_squared_error(y_real, preds_real))
        r2 = r2_score(y_test, preds_log)  # R² deliberately on the log scale

        results.append({"Model": name, "RMSE": rmse, "R2": r2})
        print(f"{name:<15} | {rmse:,.0f} | {r2:.3f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

    print("-" * 45)
    print(f"🏆 Best Regressor: {type(best_model).__name__}")

    # Plotting Model Comparison
    if results:
        res_df = pd.DataFrame(results)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.barplot(data=res_df, x='Model', y='RMSE', ax=axes[0], palette='Reds_d')
        axes[0].set_title('Prediction Error (RMSE) - Lower is Better')
        sns.barplot(data=res_df, x='Model', y='R2', ax=axes[1], palette='Greens_d')
        axes[1].set_title('Explained Variance (R²) - Higher is Better')
        plt.tight_layout()
        plt.savefig('project_plots/regression_comparison.png')
        plt.close()

    # Save Model
    with open("viral_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
    print(" -> ✅ Model saved to 'viral_model.pkl'")

    return best_model


if __name__ == "__main__":
    df = load_data()
    if not df.empty:
        best_emb = benchmark_and_select_model(df)
        df = generate_embeddings(df, best_emb)
        train_regressor(df)