File size: 2,331 Bytes
01c71d2
 
8927482
01c71d2
8927482
01c71d2
8927482
 
 
01c71d2
 
8927482
01c71d2
 
 
8927482
 
01c71d2
 
8927482
 
01c71d2
8927482
01c71d2
 
8927482
 
 
01c71d2
8927482
01c71d2
 
8927482
 
 
 
 
 
 
 
01c71d2
8927482
 
01c71d2
8927482
 
 
01c71d2
8927482
 
 
 
 
 
 
 
 
01c71d2
8927482
 
 
01c71d2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# --- CONFIGURATION ---
# Both paths are built relative to the directory that contains this script:
# the training CSV is read from ../data and the fitted model goes to ../models.
_SCRIPT_DIR = os.path.dirname(__file__)
DATA_FILE = os.path.join(
    _SCRIPT_DIR, '..', 'data', 'thunderbird_market_trends.csv'
)
MODEL_OUTPUT_FILE = os.path.join(
    _SCRIPT_DIR, '..', 'models', 'thunderbird_market_predictor_v1.joblib'
)

def train_model():
    """Train the Thunderbird market predictor and save it to disk.

    Reads the CSV at ``DATA_FILE``, derives a ``month_of_year`` feature from
    the ``month`` column, and fits a pipeline (one-hot encoding of ``niche``
    followed by a random-forest regressor) that predicts
    ``successful_campaigns`` from ``niche``, ``trend_score`` and
    ``month_of_year``. The in-sample MSE is printed and the fitted pipeline
    is written to ``MODEL_OUTPUT_FILE`` with joblib.

    Returns:
        None. If the training CSV does not exist, an error message is printed
        and the function returns without training or writing anything.
    """
    print("--- Starting Thunderbird Market Predictor Training ---")

    # 1. Load Data
    # Only the read itself is guarded, so unrelated errors are not masked.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"❌ ERROR: Training data not found at {DATA_FILE}. Run the export script first.")
        return
    print(f"✅ Data loaded successfully. Shape: {df.shape}")

    # 2. Preprocessing & Feature Engineering
    df['month'] = pd.to_datetime(df['month'])
    df['month_of_year'] = df['month'].dt.month

    X = df[['niche', 'trend_score', 'month_of_year']]
    y = df['successful_campaigns']

    # 3. Create a preprocessing pipeline for categorical features
    categorical_features = ['niche']
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' keeps prediction from failing on a
            # niche value that was not present at fit time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'  # Keep other columns (trend_score, month_of_year)
    )

    # 4. Define the model
    model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=2)

    # 5. Create the full pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])

    # 6. Train the model
    print("🚀 Training the model...")
    pipeline.fit(X, y)
    print("✅ Model training complete.")

    # 7. Evaluate the model (optional)
    # NOTE: this scores the same rows the model was fitted on, so the value
    # is an optimistic, in-sample figure rather than a generalization estimate.
    predictions = pipeline.predict(X)
    mse = mean_squared_error(y, predictions)
    print(f"   - Model Evaluation (MSE on training data): {mse:.2f}")

    # 8. Save the entire pipeline (preprocessor + model)
    # Create the output directory first: joblib.dump does not create missing
    # parent directories and would otherwise fail after training has finished.
    output_dir = os.path.dirname(MODEL_OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(pipeline, MODEL_OUTPUT_FILE)
    print(f"\n✅ Success! Trained model saved to: {MODEL_OUTPUT_FILE}")

# Script entry point: run training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    train_model()