File size: 2,800 Bytes
0914e96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File: ai-service/training/train_earning_optimizer.py (FINAL SIMPLIFIED VERSION)

import pandas as pd
import xgboost as xgb
import joblib
import os
import sys
from sklearn.preprocessing import OneHotEncoder

# Absolute path of the directory one level above the folder that contains
# this script. It is appended to sys.path so packages located there become
# importable (no such import is visible in this file).
ROOT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)
sys.path.append(ROOT_DIR)

def _min_max_normalize(values):
    """Rescale *values* linearly onto the range [0, 1].

    A series with at most one distinct value has zero spread, so every
    entry is mapped to the neutral midpoint 0.5 instead of dividing by zero.
    """
    if values.nunique() <= 1:
        return pd.Series(0.5, index=values.index)
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


def _add_smart_performance_score(df):
    """Return a cleaned copy of *df* with the training target attached.

    Adds the columns ``roi`` (``payment_amount`` per follower),
    ``norm_engagement`` and ``norm_roi`` (min-max normalised) and
    ``smart_performance_score`` (0.6 * engagement + 0.4 * ROI).

    Rows whose engagement rate or ROI is missing or infinite are dropped,
    and the index of the result is reset to 0..n-1. The input frame is not
    modified.
    """
    scored = df.copy()
    scored['roi'] = scored['payment_amount'] / scored['follower_count']

    # A follower count of 0 makes the division yield inf (or NaN for 0/0).
    # An infinite ROI would make the min-max range infinite and turn the
    # normalised values, and therefore the target, into NaN. Treat such
    # rows like rows with missing data and discard them.
    score_inputs = ['engagement_rate', 'roi']
    infinities = [float('inf'), float('-inf')]
    scored[score_inputs] = scored[score_inputs].replace(infinities, float('nan'))
    scored = scored.dropna(subset=score_inputs).reset_index(drop=True)

    scored['norm_engagement'] = _min_max_normalize(scored['engagement_rate'])
    scored['norm_roi'] = _min_max_normalize(scored['roi'])
    scored['smart_performance_score'] = (
        0.6 * scored['norm_engagement'] + 0.4 * scored['norm_roi']
    )
    return scored


def train_earning_optimizer():
    """Train the earning-optimizer regressor and persist it with its encoder.

    Reads ``data/earnings_training_data.csv`` under ``ROOT_DIR``, derives the
    'smart performance score' target, one-hot encodes ``campaign_niche`` and
    ``content_format``, appends ``follower_count`` as the only numerical
    feature, fits an ``XGBRegressor`` and writes both the fitted encoder and
    the model into ``models/`` under ``ROOT_DIR`` (created if missing).

    The function prints its progress and returns ``None``. It returns early,
    without writing anything, when the CSV is missing, empty, or contains no
    row from which a finite score can be computed.

    Raises:
        KeyError: if one of the columns used above is absent from the CSV.
    """
    print("--- Starting Earning Optimizer Model Training (Simplified) ---")

    data_path = os.path.join(ROOT_DIR, 'data', 'earnings_training_data.csv')
    # Keep the try body limited to the one call that can raise these errors.
    try:
        df = pd.read_csv(data_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"🔴 ERROR: Data file not found or is empty at {data_path}")
        return
    if df.empty:
        print("⚠️ CSV file is empty. Aborting.")
        return

    print("Creating 'Smart Performance Score'...")
    total_rows = len(df)
    df = _add_smart_performance_score(df)
    dropped_rows = total_rows - len(df)
    if dropped_rows:
        print(f"⚠️ Dropped {dropped_rows} row(s) with missing or non-finite engagement/ROI values.")
    if df.empty:
        print("⚠️ No usable rows left after cleaning. Aborting.")
        return

    print("Preparing data MANUALLY without Pipeline...")

    # 1. One-hot encode the categorical features. Categories unseen during
    #    fitting are encoded as all-zero rows at transform time.
    categorical_features = ['campaign_niche', 'content_format']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_cats = encoder.fit_transform(df[categorical_features])
    encoded_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(categorical_features),
    )

    # 2. Combine with the numerical features. Both frames carry a fresh
    #    0..n-1 index, so the column-wise concat lines rows up positionally.
    numerical_features = df[['follower_count']].reset_index(drop=True)
    X_final = pd.concat([encoded_df, numerical_features], axis=1)
    y = df['smart_performance_score']

    # Make sure the output directory exists before the (slower) training
    # step, so a fresh checkout does not fail only at save time.
    models_dir = os.path.join(ROOT_DIR, 'models')
    os.makedirs(models_dir, exist_ok=True)

    # Train the model directly on the prepared matrix (no sklearn Pipeline).
    print("Training the XGBoost model...")
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    model.fit(X_final, y)
    print("--- Model training complete! ---")

    # Persist the encoder and the model together, only after training has
    # succeeded, so a failed run cannot leave a new encoder next to a stale
    # model that was fitted with different columns.
    joblib.dump(encoder, os.path.join(models_dir, 'earnings_encoder.joblib'))
    print("--- Encoder saved successfully! ---")

    model_path = os.path.join(models_dir, 'earnings_model.joblib')
    joblib.dump(model, model_path)
    print(f"--- SIMPLE Model saved successfully to {model_path} ---")

# Run the training only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    train_earning_optimizer()