Hugging Face Space (status: Sleeping) — scraped page chrome; the script source follows.
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import warnings | |
| import os | |
| import torch | |
| import time | |
| import pickle | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics import silhouette_score, mean_squared_error, r2_score | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.preprocessing import LabelEncoder | |
| from xgboost import XGBRegressor | |
| from sklearn.linear_model import LinearRegression | |
| from dotenv import load_dotenv | |
# Load environment variables (e.g. API tokens) from a local .env file, if present.
load_dotenv()

# Global configuration: silence warnings and show every DataFrame column when printing.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Prefer the Apple-Silicon GPU (MPS backend) when available; otherwise use CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"π Optimization: Running on {device.upper()} device")

# Output directory for all generated plots. exist_ok avoids the
# check-then-create race of the original os.path.exists() guard.
os.makedirs('project_plots', exist_ok=True)
| # --------------------------------------------------------- | |
| # 1. LOAD DATA | |
| # --------------------------------------------------------- | |
def load_data():
    """Download the synthetic social-media dataset from Hugging Face.

    Returns:
        pandas.DataFrame with an added 'log_views' column (log1p of 'views',
        which is heavily right-skewed), or an empty DataFrame on any failure.
    """
    print(f"\n[1/4] Loading Dataset from Hugging Face...")
    try:
        dataset = load_dataset("MatanKriel/social-assitent-synthetic-data")
        if 'train' in dataset:
            df = dataset['train'].to_pandas()
        else:
            # A DatasetDict has no .to_pandas() itself (the original call here
            # would raise AttributeError) — fall back to the first split.
            first_split = next(iter(dataset))
            df = dataset[first_split].to_pandas()
        print(f" -> β Loaded {len(df)} rows.")
        # Log-transform the target: view counts span orders of magnitude.
        if 'views' in df.columns:
            df['log_views'] = np.log1p(df['views'])
            print(" -> π Applied log1p transformation to 'views'.")
        return df
    except Exception as e:
        print(f" β Error loading data: {e}")
        return pd.DataFrame()
| # --------------------------------------------------------- | |
| # 2. EMBEDDING BENCHMARK | |
| # --------------------------------------------------------- | |
def benchmark_and_select_model(df):
    """Benchmark candidate sentence-embedding models on a sample of descriptions.

    Each model is timed on encoding speed and scored on clustering quality
    (silhouette score against composite category/viral-class labels).

    Args:
        df: DataFrame with a 'description' column; 'category', 'viral_class'
            and 'views' are used when present.

    Returns:
        str: name of the model with the highest silhouette score (or the
        first candidate if no model could be scored).

    NOTE: mutates `df` in place by adding a temporary 'viral_class' column
    when one does not already exist.
    """
    print("\n[2/4] Benchmarking Embedding Models...")
    models = [
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/all-mpnet-base-v2",
        "BAAI/bge-small-en-v1.5"
    ]
    results = []

    # Derive a High/Low virality label (top quartile of views) if missing,
    # so the silhouette metric can measure class separation.
    if 'viral_class' not in df.columns and 'views' in df.columns:
        threshold = df['views'].quantile(0.75)
        df['viral_class'] = np.where(df['views'] > threshold, 'High', 'Low')
        print(f" -> βΉοΈ Created temporary 'viral_class' (High/Low) for benchmarking.")

    if 'category' in df.columns and 'viral_class' in df.columns:
        print(" -> π·οΈ Using Composite Labels (Category + Viral Class) for metrics.")
    elif 'category' in df.columns:
        print(" -> β οΈ 'viral_class' missing. Falling back to 'category' only.")
    else:
        print(" -> β οΈ No categories found. Skipping quality metric.")

    # Benchmark on a sample (max 3000 rows) for speed; fixed seed for stability.
    sample_df = df.sample(min(len(df), 3000), random_state=42)
    sample_texts = sample_df['description'].fillna("").tolist()
    if 'category' in sample_df.columns and 'viral_class' in sample_df.columns:
        # Composite label, e.g. "Fitness_High".
        sample_labels = (sample_df['category'].astype(str) + "_"
                         + sample_df['viral_class'].astype(str)).values
    elif 'category' in sample_df.columns:
        sample_labels = sample_df['category'].values
    else:
        sample_labels = np.zeros(len(sample_df))

    # FIX: silhouette_score raises ValueError with fewer than 2 distinct labels
    # (the all-zeros fallback above); previously that exception was caught by
    # the loop's except clause and aborted the scoring of EVERY model.
    can_score = len(np.unique(sample_labels)) > 1

    print(f"{'Model':<40} | {'Time (s)':<10} | {'Silhouette':<10}")
    print("-" * 65)
    best_score = -2
    best_model_name = models[0]  # Default winner if every model errors out.
    for model_name in models:
        try:
            st_model = SentenceTransformer(model_name, device=device)
            start_t = time.time()
            embeddings = st_model.encode(sample_texts, convert_to_numpy=True, show_progress_bar=False)
            time_taken = time.time() - start_t
            # NaN when unscorable: still records timing, never beats best_score.
            score = silhouette_score(embeddings, sample_labels) if can_score else float('nan')
            results.append({
                "Model": model_name.split('/')[-1],
                "Time (s)": time_taken,
                "Silhouette Score": score
            })
            print(f"{model_name:<40} | {time_taken:.2f} | {score:.4f}")
            if score > best_score:
                best_score = score
                best_model_name = model_name
        except Exception as e:
            print(f"β Error with {model_name}: {e}")
    print("-" * 65)
    print(f"π Winner: {best_model_name}")

    # Persist a side-by-side speed/quality comparison plot.
    if results:
        res_df = pd.DataFrame(results)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.barplot(data=res_df, x='Model', y='Time (s)', ax=axes[0], palette='Blues_d')
        axes[0].set_title('Encoding Speed (Lower is Better)')
        sns.barplot(data=res_df, x='Model', y='Silhouette Score', ax=axes[1], palette='Greens_d')
        axes[1].set_title('Clustering Quality (Higher is Better)')
        plt.tight_layout()
        plt.savefig('project_plots/embedding_benchmark.png')
        plt.close()
    return best_model_name
| # --------------------------------------------------------- | |
| # 3. GENERATE KNOWLEDGE BASE (EMBEDDINGS) | |
| # --------------------------------------------------------- | |
def generate_embeddings(df, model_name):
    """Encode every description with the winning model and attach the
    resulting vectors to the frame as a new 'embedding' column.

    Returns the same (mutated) DataFrame.
    """
    print(f"\n[3/4] Generating Embeddings with Winner ({model_name})...")
    encoder = SentenceTransformer(model_name, device=device)
    texts = df['description'].fillna("").tolist()
    vectors = encoder.encode(texts,
                             convert_to_numpy=True,
                             show_progress_bar=True)
    df['embedding'] = list(vectors)
    return df
| # --------------------------------------------------------- | |
| # 4. TRAIN REGRESSION MODEL | |
| # --------------------------------------------------------- | |
def train_regressor(df):
    """Train and compare view-count regressors on embeddings + metadata.

    Features are the text embeddings concatenated with numeric columns
    (duration, hour_of_day, followers, age) and label-encoded categoricals
    (category, gender, day_of_week). The target is 'log_views'. RMSE is
    reported in real view units (after the expm1 inverse transform), while
    R² is reported on the log scale — the two metrics are on different scales.

    Saves the best model (lowest RMSE) to 'viral_model.pkl' and returns it.

    NOTE: mutates `df` in place (numeric coercion and new *_encoded columns).
    NOTE(review): only the model is pickled — the LabelEncoders needed to
    reproduce the categorical features at inference time are not saved.
    """
    print("\n[4/4] Training View Prediction Model...")
    X_text = np.stack(df['embedding'].values)
    print(" -> Defining strict feature sets...")
    num_cols = ['duration', 'hour_of_day', 'followers', 'age']  # age is numeric
    cat_cols = ['category', 'gender', 'day_of_week']
    # Only keep columns that actually exist in this dataset.
    real_num = [c for c in num_cols if c in df.columns]
    real_cat = [c for c in cat_cols if c in df.columns]
    # Coerce numerics (unparseable values -> NaN -> 0).
    for c in real_num:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)
    # Label-encode categoricals into new *_encoded columns.
    cat_encoded_names = []
    for c in real_cat:
        df[c] = df[c].fillna('Unknown')
        le = LabelEncoder()
        new_col = c + '_encoded'
        df[new_col] = le.fit_transform(df[c].astype(str))
        cat_encoded_names.append(new_col)
    final_meta_cols = real_num + cat_encoded_names
    print(f" -> Final Features: Embeddings + {final_meta_cols}")
    X_meta = df[final_meta_cols].values
    # Combine text and metadata features.
    X = np.hstack((X_text, X_meta))
    y = df['log_views'].values
    # Simple 80/20 holdout split with no shuffle — assumes rows carry no
    # temporal / sorted ordering (TODO confirm against the dataset).
    split = int(len(df) * 0.8)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]
    # FIX: seed the tree models so results are reproducible run-to-run
    # (the original left RandomForest and XGBoost unseeded).
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1, random_state=42),
        "LinearReg": LinearRegression()
    }
    best_model = None
    best_rmse = float('inf')
    results = []
    print(f"{'Model':<15} | {'RMSE (Views)':<15} | {'RΒ²':<10}")
    print("-" * 45)
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds_log = model.predict(X_test)
        # Invert the log1p transform back to real view counts.
        preds_real = np.expm1(preds_log)
        y_real = np.expm1(y_test)
        preds_real = np.maximum(preds_real, 0)  # clip negative predictions
        rmse = np.sqrt(mean_squared_error(y_real, preds_real))
        r2 = r2_score(y_test, preds_log)  # R² on the log scale
        results.append({"Model": name, "RMSE": rmse, "R2": r2})
        print(f"{name:<15} | {rmse:,.0f} | {r2:.3f}")
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
    print("-" * 45)
    print(f"π Best Regressor: {type(best_model).__name__}")
    # Side-by-side error / explained-variance comparison plot.
    if results:
        res_df = pd.DataFrame(results)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.barplot(data=res_df, x='Model', y='RMSE', ax=axes[0], palette='Reds_d')
        axes[0].set_title('Prediction Error (RMSE) - Lower is Better')
        sns.barplot(data=res_df, x='Model', y='R2', ax=axes[1], palette='Greens_d')
        axes[1].set_title('Explained Variance (RΒ²) - Higher is Better')
        plt.tight_layout()
        plt.savefig('project_plots/regression_comparison.png')
        plt.close()
    # Persist the winner for later inference.
    with open("viral_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
    print(" -> β Model saved to 'viral_model.pkl'")
    return best_model
if __name__ == "__main__":
    # End-to-end pipeline: load data, pick the best embedding model,
    # embed all descriptions, then train the view-count regressor.
    df = load_data()
    if not df.empty:
        winner = benchmark_and_select_model(df)
        df = generate_embeddings(df, winner)
        train_regressor(df)