reachify-ai-service / training /train_thunderbird_market_predictor.py
amitbhatt6075's picture
feat: Trained model with real Google Trends data
8927482
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
# --- CONFIGURATION ---
DATA_FILE = os.path.join(os.path.dirname(__file__), '..', 'data', 'thunderbird_market_trends.csv')
MODEL_OUTPUT_FILE = os.path.join(os.path.dirname(__file__), '..', 'models', 'thunderbird_market_predictor_v1.joblib')
def train_model():
print("--- Starting Thunderbird Market Predictor Training ---")
# 1. Load Data
try:
df = pd.read_csv(DATA_FILE)
print(f"βœ… Data loaded successfully. Shape: {df.shape}")
except FileNotFoundError:
print(f"❌ ERROR: Training data not found at {DATA_FILE}. Run the export script first.")
return
# 2. Preprocessing & Feature Engineering
df['month'] = pd.to_datetime(df['month'])
df['month_of_year'] = df['month'].dt.month
X = df[['niche', 'trend_score', 'month_of_year']]
y = df['successful_campaigns']
# 3. Create a preprocessing pipeline for categorical features
categorical_features = ['niche']
preprocessor = ColumnTransformer(
transformers=[
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
],
remainder='passthrough' # Keep other columns (trend_score, month_of_year)
)
# 4. Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=2)
# 5. Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('regressor', model)])
# 6. Train the model
print("πŸš€ Training the model...")
pipeline.fit(X, y)
print("βœ… Model training complete.")
# 7. Evaluate the model (optional)
predictions = pipeline.predict(X)
mse = mean_squared_error(y, predictions)
print(f" - Model Evaluation (MSE on training data): {mse:.2f}")
# 8. Save the entire pipeline (preprocessor + model)
joblib.dump(pipeline, MODEL_OUTPUT_FILE)
print(f"\nβœ… Success! Trained model saved to: {MODEL_OUTPUT_FILE}")
if __name__ == "__main__":
train_model()