import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import torch
import time
import pickle
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import silhouette_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configuration
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"🚀 Optimization: Running on {device.upper()} device")

# exist_ok avoids the check-then-create race of `if not exists: makedirs`
os.makedirs('project_plots', exist_ok=True)

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
def load_data():
    """Download the synthetic social-media dataset from Hugging Face.

    Returns:
        pd.DataFrame: the 'train' split (or the raw dataset when no
        splits exist), with an extra 'log_views' column (log1p of
        'views') when a 'views' column is present. Returns an empty
        DataFrame on any failure so callers can check ``df.empty``.
    """
    print("\n[1/4] Loading Dataset from Hugging Face...")
    try:
        # NOTE(review): 'assitent' looks like a typo, but it is the
        # literal dataset id on the Hub — do not "correct" it.
        dataset = load_dataset("MatanKriel/social-assitent-synthetic-data")
        if 'train' in dataset:
            df = dataset['train'].to_pandas()
        else:
            df = dataset.to_pandas()
        print(f" -> ✅ Loaded {len(df)} rows.")

        # --- LOG TRANSFORMATION ---
        # View counts are heavy-tailed; log1p compresses the range and
        # is defined at views == 0.
        if 'views' in df.columns:
            df['log_views'] = np.log1p(df['views'])
            print(" -> 📉 Applied log1p transformation to 'views'.")
        return df
    except Exception as e:
        # Best-effort loader: report and hand back an empty frame.
        print(f" ❌ Error loading data: {e}")
        return pd.DataFrame()

# ---------------------------------------------------------
# 2.
# EMBEDDING BENCHMARK
# ---------------------------------------------------------
def benchmark_and_select_model(df):
    """Benchmark candidate embedding models and return the best model id.

    Encodes a sample (<= 3000 rows) of 'description' texts with each
    candidate, measures encoding time and silhouette score over
    composite "Category_ViralClass" labels, writes a comparison plot to
    project_plots/embedding_benchmark.png, and returns the model name
    with the highest silhouette (first candidate when none scored).

    Side effect: may add a 'viral_class' column (High/Low by the 75th
    percentile of 'views') to ``df``.
    """
    print("\n[2/4] Benchmarking Embedding Models...")
    models = [
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/all-mpnet-base-v2",
        "BAAI/bge-small-en-v1.5"
    ]
    results = []

    # Create Composite Labels for Silhouette Score
    # Goal: Use "Category_ViralClass" (e.g., "Fitness_High") to measure separation

    # 1. Ensure viral_class exists for benchmarking (top quartile = 'High')
    if 'viral_class' not in df.columns and 'views' in df.columns:
        threshold = df['views'].quantile(0.75)
        df['viral_class'] = np.where(df['views'] > threshold, 'High', 'Low')
        print(f" -> ℹ️ Created temporary 'viral_class' (High/Low) for benchmarking.")

    # 2. Report which labelling scheme will be used
    if 'category' in df.columns and 'viral_class' in df.columns:
        print(" -> 🏷️ Using Composite Labels (Category + Viral Class) for metrics.")
    elif 'category' in df.columns:
        print(" -> ⚠️ 'viral_class' missing. Falling back to 'category' only.")
    else:
        print(" -> ⚠️ No categories found. Skipping quality metric.")

    # Sample for speed (using the updated df which might have viral_class)
    sample_df = df.sample(min(len(df), 3000), random_state=42)
    sample_texts = sample_df['description'].fillna("").tolist()

    if 'category' in sample_df.columns and 'viral_class' in sample_df.columns:
        # Composite Label Formula
        sample_labels = (sample_df['category'].astype(str) + "_" +
                         sample_df['viral_class'].astype(str)).values
    elif 'category' in sample_df.columns:
        sample_labels = sample_df['category'].values
    else:
        sample_labels = np.zeros(len(sample_df))

    # BUG FIX: silhouette_score raises ValueError unless there are at
    # least 2 distinct labels. With the all-zero fallback above, every
    # model previously hit the except-branch instead of actually
    # "skipping the quality metric". Guard once here instead.
    can_score = len(np.unique(sample_labels)) > 1

    print(f"{'Model':<40} | {'Time (s)':<10} | {'Silhouette':<10}")
    print("-" * 65)

    best_score = -2  # silhouette lives in [-1, 1]; -2 guarantees first update
    best_model_name = models[0]  # Default

    for model_name in models:
        try:
            st_model = SentenceTransformer(model_name, device=device)
            start_t = time.time()
            embeddings = st_model.encode(sample_texts, convert_to_numpy=True,
                                         show_progress_bar=False)
            time_taken = time.time() - start_t

            score = silhouette_score(embeddings, sample_labels) if can_score else float('nan')

            results.append({
                "Model": model_name.split('/')[-1],
                "Time (s)": time_taken,
                "Silhouette Score": score
            })
            print(f"{model_name:<40} | {time_taken:.2f} | {score:.4f}")

            if can_score and score > best_score:
                best_score = score
                best_model_name = model_name
        except Exception as e:
            # Best-effort benchmark: a failing model is reported and skipped.
            print(f"❌ Error with {model_name}: {e}")

    print("-" * 65)
    print(f"🏆 Winner: {best_model_name}")

    # Plotting Benchmark
    if results:
        res_df = pd.DataFrame(results)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.barplot(data=res_df, x='Model', y='Time (s)', ax=axes[0], palette='Blues_d')
        axes[0].set_title('Encoding Speed (Lower is Better)')
        sns.barplot(data=res_df, x='Model', y='Silhouette Score', ax=axes[1], palette='Greens_d')
        axes[1].set_title('Clustering Quality (Higher is Better)')
        plt.tight_layout()
        plt.savefig('project_plots/embedding_benchmark.png')
        plt.close()

    return best_model_name

# ---------------------------------------------------------
# 3.
# GENERATE KNOWLEDGE BASE (EMBEDDINGS)
# ---------------------------------------------------------
def generate_embeddings(df, model_name):
    """Encode every 'description' with the winning model.

    Stores one vector per row in a new 'embedding' column and returns
    the (mutated) DataFrame.
    """
    print(f"\n[3/4] Generating Embeddings with Winner ({model_name})...")
    st_model = SentenceTransformer(model_name, device=device)
    embeddings = st_model.encode(df['description'].fillna("").tolist(),
                                 convert_to_numpy=True, show_progress_bar=True)
    df['embedding'] = list(embeddings)
    return df

# ---------------------------------------------------------
# 4. TRAIN REGRESSION MODEL
# ---------------------------------------------------------
def train_regressor(df):
    """Train and compare view-count regressors on embeddings + metadata.

    Expects an 'embedding' column (from generate_embeddings). Builds the
    feature matrix from embeddings plus numeric/label-encoded metadata
    columns, fits RandomForest / XGBoost / LinearRegression on the
    log1p target, reports RMSE on the real scale and R² on the log
    scale, saves a comparison plot and pickles the best (lowest-RMSE)
    model to 'viral_model.pkl'.

    Side effects: mutates ``df`` (coerces/fills numerics, fills
    categorical NaNs, adds *_encoded and possibly 'log_views' columns).

    Returns:
        The best fitted estimator.
    """
    print("\n[4/4] Training View Prediction Model...")

    X_text = np.stack(df['embedding'].values)

    print(" -> Defining strict feature sets...")
    # Age is NUMERIC
    num_cols = ['duration', 'hour_of_day', 'followers', 'age']
    cat_cols = ['category', 'gender', 'day_of_week']

    # Verify cols exist
    real_num = [c for c in num_cols if c in df.columns]
    real_cat = [c for c in cat_cols if c in df.columns]

    # Fill missing for numerics
    for c in real_num:
        # Ensure it is float/int
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)

    # Process Categoricals
    # NOTE(review): encoders are fit on the full frame before the split.
    # Harmless for plain label-encoding, but worth knowing in a leakage audit.
    cat_encoded_names = []
    for c in real_cat:
        df[c] = df[c].fillna('Unknown')
        le = LabelEncoder()
        new_col = c + '_encoded'
        df[new_col] = le.fit_transform(df[c].astype(str))
        cat_encoded_names.append(new_col)

    final_meta_cols = real_num + cat_encoded_names
    print(f" -> Final Features: Embeddings + {final_meta_cols}")
    X_meta = df[final_meta_cols].values

    # Combine
    X = np.hstack((X_text, X_meta))

    # Log-Target Logic
    # ROBUSTNESS FIX: previously a hard KeyError when load_data() did not
    # create 'log_views'; derive it here if possible.
    if 'log_views' not in df.columns and 'views' in df.columns:
        df['log_views'] = np.log1p(df['views'])
    y = df['log_views'].values

    # Split (sequential 80/20; assumes rows are not ordered by target —
    # TODO(review): confirm, otherwise shuffle before splitting)
    split = int(len(df) * 0.8)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # Models — seeded so "best model" selection is reproducible run-to-run
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10,
                                              n_jobs=-1, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1,
                                n_jobs=-1, random_state=42),
        "LinearReg": LinearRegression()
    }

    best_model = None
    best_rmse = float('inf')
    results = []

    print(f"{'Model':<15} | {'RMSE (Views)':<15} | {'R²':<10}")
    print("-" * 45)

    for name, model in models.items():
        model.fit(X_train, y_train)
        preds_log = model.predict(X_test)

        # INVERSE TRANSFORM: Log -> Real
        preds_real = np.expm1(preds_log)
        y_real = np.expm1(y_test)

        # Clip negatives (expm1 of a very negative prediction can be < 0)
        preds_real = np.maximum(preds_real, 0)

        rmse = np.sqrt(mean_squared_error(y_real, preds_real))
        r2 = r2_score(y_test, preds_log)  # R² deliberately on the log scale

        results.append({"Model": name, "RMSE": rmse, "R2": r2})
        print(f"{name:<15} | {rmse:,.0f} | {r2:.3f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

    print("-" * 45)
    print(f"🏆 Best Regressor: {type(best_model).__name__}")

    # Plotting Model Comparison
    if results:
        res_df = pd.DataFrame(results)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.barplot(data=res_df, x='Model', y='RMSE', ax=axes[0], palette='Reds_d')
        axes[0].set_title('Prediction Error (RMSE) - Lower is Better')
        sns.barplot(data=res_df, x='Model', y='R2', ax=axes[1], palette='Greens_d')
        axes[1].set_title('Explained Variance (R²) - Higher is Better')
        plt.tight_layout()
        plt.savefig('project_plots/regression_comparison.png')
        plt.close()

    # Save Model
    with open("viral_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
    print(" -> ✅ Model saved to 'viral_model.pkl'")

    return best_model


if __name__ == "__main__":
    df = load_data()
    if not df.empty:
        best_emb = benchmark_and_select_model(df)
        df = generate_embeddings(df, best_emb)
        train_regressor(df)