import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- CONFIGURATION ---
# Both paths are resolved relative to this script's directory, not the
# current working directory.
DATA_FILE = os.path.join(os.path.dirname(__file__), '..', 'data', 'thunderbird_market_trends.csv')
MODEL_OUTPUT_FILE = os.path.join(os.path.dirname(__file__), '..', 'models', 'thunderbird_market_predictor_v1.joblib')


def train_model():
    """Train the market predictor pipeline and save it with joblib.

    Reads the CSV at ``DATA_FILE``, derives a ``month_of_year`` feature from
    the ``month`` column, and fits a pipeline (one-hot encoding of ``niche``
    followed by a random-forest regressor) that predicts
    ``successful_campaigns`` from ``niche``, ``trend_score`` and
    ``month_of_year``. The mean squared error on the training rows is printed
    and the fitted pipeline is written to ``MODEL_OUTPUT_FILE``.

    Returns:
        None. When the training CSV does not exist, an error message is
        printed and the function returns without training or saving.
    """
    # NOTE(review): the status glyphs at the start of several messages below
    # look mis-encoded (mojibake) - confirm the file's encoding.
    print("--- Starting Thunderbird Market Predictor Training ---")

    # 1. Load Data
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"āŒ ERROR: Training data not found at {DATA_FILE}. Run the export script first.")
        return
    else:
        print(f"āœ… Data loaded successfully. Shape: {df.shape}")

    # 2. Preprocessing & Feature Engineering
    df['month'] = pd.to_datetime(df['month'])
    df['month_of_year'] = df['month'].dt.month

    X = df[['niche', 'trend_score', 'month_of_year']]
    y = df['successful_campaigns']

    # 3. Create a preprocessing pipeline for categorical features
    categorical_features = ['niche']
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        remainder='passthrough',  # Keep other columns (trend_score, month_of_year)
    )

    # 4. Define the model
    model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=2)

    # 5. Create the full pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

    # 6. Train the model
    print("šŸš€ Training the model...")
    pipeline.fit(X, y)
    print("āœ… Model training complete.")

    # 7. Evaluate the model (optional)
    # This score is computed on the same rows the model was fitted on, so it
    # is a sanity check rather than an estimate of error on unseen data.
    predictions = pipeline.predict(X)
    mse = mean_squared_error(y, predictions)
    print(f" - Model Evaluation (MSE on training data): {mse:.2f}")

    # 8. Save the entire pipeline (preprocessor + model)
    # Create the output directory first so a fresh checkout without a
    # models/ folder does not fail after training has already completed.
    os.makedirs(os.path.dirname(MODEL_OUTPUT_FILE), exist_ok=True)
    joblib.dump(pipeline, MODEL_OUTPUT_FILE)
    print(f"\nāœ… Success! Trained model saved to: {MODEL_OUTPUT_FILE}")


if __name__ == "__main__":
    train_model()