# File: ai-service/training/train_performance_predictor.py
"""Train XGBoost regression models predicting post 'likes' and 'comments'.

Reads data/performance_training_data.csv under the project root and writes
two fitted sklearn pipelines to the models/ directory via joblib.
"""

import os
import sys

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Add the project root directory to the import path.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(ROOT_DIR)


def train_performance_models():
    """
    Loads the performance data CSV and trains two separate models:
    one for predicting 'likes' and one for predicting 'comments'.

    Side effects:
        Saves fitted pipelines to models/likes_predictor_v1.joblib and
        models/comments_predictor_v1.joblib under ROOT_DIR.
        Prints progress/error messages; returns early (None) on bad input.
    """
    print("--- Starting Performance Predictor Model Training ---")
    data_path = os.path.join(ROOT_DIR, 'data', 'performance_training_data.csv')
    try:
        df = pd.read_csv(data_path)
        print(f"Loaded {len(df)} rows from {data_path}")
        if df.empty:
            print("āš ļø CSV file is empty. Aborting model training.")
            return
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"šŸ”“ ERROR: Data file not found or is empty at {data_path}")
        return

    # Features (X): the columns we predict likes/comments from.
    features = ['follower_count', 'caption_length', 'campaign_niche', 'content_format']

    # Fail fast (in the script's print-and-return style) rather than crash
    # with a KeyError if the CSV schema is wrong.
    required_columns = features + ['likes', 'comments']
    missing_columns = [c for c in required_columns if c not in df.columns]
    if missing_columns:
        print(f"šŸ”“ ERROR: CSV is missing required columns: {missing_columns}")
        return

    X = df[features]

    # Targets (y)
    y_likes = df['likes']
    y_comments = df['comments']

    # Preprocessing pipeline: one-hot encode the categorical features,
    # pass numeric features through unchanged. handle_unknown='ignore'
    # keeps prediction-time inputs with unseen categories from crashing.
    categorical_features = ['campaign_niche', 'content_format']
    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
        remainder='passthrough'
    )

    # Ensure the output directory exists before joblib.dump, which does not
    # create intermediate directories and would raise FileNotFoundError.
    models_dir = os.path.join(ROOT_DIR, 'models')
    os.makedirs(models_dir, exist_ok=True)

    # ---- Model #1: Likes Predictor ----
    print("\n--- Training Likes Predictor Model ---")
    likes_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))
    ])
    likes_pipeline.fit(X, y_likes)

    likes_model_path = os.path.join(models_dir, 'likes_predictor_v1.joblib')
    joblib.dump(likes_pipeline, likes_model_path)
    print(f"āœ… Likes Predictor model saved to: {likes_model_path}")

    # ---- Model #2: Comments Predictor ----
    print("\n--- Training Comments Predictor Model ---")
    comments_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.05, random_state=42))
    ])
    comments_pipeline.fit(X, y_comments)

    comments_model_path = os.path.join(models_dir, 'comments_predictor_v1.joblib')
    joblib.dump(comments_pipeline, comments_model_path)
    print(f"āœ… Comments Predictor model saved to: {comments_model_path}")

    print("\nšŸŽ‰ All performance models trained and saved successfully!")


if __name__ == '__main__':
    train_performance_models()