import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# --- CONFIGURATION ---
# Both paths are built relative to the directory containing this file.
_SCRIPT_DIR = os.path.dirname(__file__)

# Input CSV with the market-trend records.
DATA_FILE = os.path.join(
    _SCRIPT_DIR, '..', 'data', 'thunderbird_market_trends.csv'
)

# Output location for the serialized model artifact.
MODEL_OUTPUT_FILE = os.path.join(
    _SCRIPT_DIR, '..', 'models', 'thunderbird_market_predictor_v1.joblib'
)

def train_model():
    """Train the Thunderbird market predictor and save it to disk.

    Reads the CSV at ``DATA_FILE``, derives a ``month_of_year`` feature from
    the ``month`` column, and fits a pipeline (one-hot encoding of ``niche``
    followed by a random forest regressor) that predicts
    ``successful_campaigns`` from ``niche``, ``trend_score`` and
    ``month_of_year``. The fitted pipeline is written to
    ``MODEL_OUTPUT_FILE`` with joblib.

    When the dataset has enough rows, the pipeline is first fitted on a
    training split and scored on a held-out split, so that one reported error
    is measured on rows the model has not seen. The saved pipeline is always
    refitted on the full dataset afterwards.

    Returns:
        None. Progress is reported on stdout. If the data file is missing,
        an error message is printed and the function returns without
        training or saving anything.
    """
    print("--- Starting Thunderbird Market Predictor Training ---")

    # 1. Load Data
    # Keep the try body limited to the call that can raise FileNotFoundError.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"❌ ERROR: Training data not found at {DATA_FILE}. Run the export script first.")
        return
    print(f"✅ Data loaded successfully. Shape: {df.shape}")

    # 2. Preprocessing & Feature Engineering
    df['month'] = pd.to_datetime(df['month'])
    df['month_of_year'] = df['month'].dt.month

    X = df[['niche', 'trend_score', 'month_of_year']]
    y = df['successful_campaigns']

    # 3. Create a preprocessing pipeline for categorical features
    categorical_features = ['niche']
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' lets prediction cope with niches that
            # were not present in the rows used for fitting.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'  # Keep other columns (trend_score, month_of_year)
    )

    # 4. Define the model
    model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=2)

    # 5. Create the full pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])

    # 6. Evaluate on a held-out split.
    # Error measured on the training rows is optimistic, so score the model
    # on unseen rows when there are enough of them to split.
    min_rows_for_holdout = 10
    if len(X) >= min_rows_for_holdout:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        pipeline.fit(X_train, y_train)
        holdout_mse = mean_squared_error(y_test, pipeline.predict(X_test))
        print(f"   - Model Evaluation (MSE on held-out 20% split): {holdout_mse:.2f}")
    else:
        print(f"   - Skipping held-out evaluation: fewer than {min_rows_for_holdout} rows.")

    # 7. Train the final model on the full dataset.
    # Refitting replaces the state learned on the split above, so the saved
    # pipeline is fitted on every available row.
    print("🚀 Training the model...")
    pipeline.fit(X, y)
    print("✅ Model training complete.")

    # 8. Report the fit on the training data (optimistic; kept for reference)
    predictions = pipeline.predict(X)
    mse = mean_squared_error(y, predictions)
    print(f"   - Model Evaluation (MSE on training data): {mse:.2f}")

    # 9. Save the entire pipeline (preprocessor + model)
    # Create the target directory first so a missing folder does not discard
    # the training work at the very last step.
    output_dir = os.path.dirname(MODEL_OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(pipeline, MODEL_OUTPUT_FILE)
    print(f"\n✅ Success! Trained model saved to: {MODEL_OUTPUT_FILE}")

# Run training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    train_model()