Spaces:

Egeekle
/

Investment_Assistant

Build error

File size: 4,526 Bytes

7a658e1

"""
Model training script with MLflow tracking
Trains strategy recommendation models
"""
import os
import sys
import pandas as pd
import numpy as np
import pickle
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import yaml

def load_params():
    """Load the pipeline parameters from ``params.yaml``.

    The file is resolved relative to the current working directory and
    parsed with ``yaml.safe_load``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.

    Raises:
        FileNotFoundError: If ``params.yaml`` does not exist in the working
            directory.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so the same params file parses identically on every machine.
    with open("params.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def prepare_features(df: pd.DataFrame):
    """Build the feature matrix used for model training.

    Selects the five indicator columns the models are trained on and
    replaces missing values with 0. A new DataFrame is returned; ``df``
    itself is left unchanged.
    """
    wanted_columns = ("sma_10", "sma_20", "rsi", "volatility", "price_position")
    feature_frame = df[list(wanted_columns)]
    return feature_frame.fillna(value=0)

def create_labels(df: pd.DataFrame, strategy_type: str = "TOP"):
    """Derive binary labels from the rule of the requested strategy.

    ``"TOP"`` flags rows whose price position is above 70 and whose RSI lies
    strictly between 50 and 70. Any other ``strategy_type`` is handled as the
    BOTTOM strategy, which flags rows whose price position and RSI are both
    below 30.

    Returns an integer Series aligned with ``df``: 1 where the rule holds,
    0 otherwise.
    """
    position = df["price_position"]
    rsi = df["rsi"]

    if strategy_type != "TOP":
        # BOTTOM strategy: low in the price range and RSI under 30
        signal = position.lt(30) & rsi.lt(30)
        return signal.astype(int)

    # TOP strategy: high in the price range with RSI inside the open 50-70 band
    signal = position.gt(70) & rsi.gt(50) & rsi.lt(70)
    return signal.astype(int)

def main():
    """Train, evaluate, and persist one random-forest model per strategy.

    Steps:
      1. Load hyperparameters and MLflow settings via ``load_params``.
      2. Read ``data/processed/indicators.parquet`` and drop rows missing
         ``rsi``, ``sma_10`` or ``sma_20``.
      3. For each of the TOP and BOTTOM strategies, derive labels, fit a
         ``RandomForestClassifier`` on an 80/20 split, and log parameters,
         metrics and the fitted model to MLflow.
      4. Pickle each model under ``models/`` and write
         ``models/model_metadata.json`` and ``metrics/model_metrics.json``.

    A strategy with fewer than 10 positive labels is skipped. That check runs
    before the MLflow run is opened, so a skipped strategy does not leave an
    empty run behind in the tracking store.
    """
    params = load_params()
    model_params = params["model"]["params"]
    random_state = params["model"]["random_state"]

    # Load data; rows without the core indicators cannot be labelled reliably.
    df = pd.read_parquet("data/processed/indicators.parquet")
    df = df.dropna(subset=["rsi", "sma_10", "sma_20"])

    # Prepare features (shared by both strategies; only the labels differ).
    X = prepare_features(df)

    # Create output directories
    os.makedirs("models", exist_ok=True)
    os.makedirs("metrics", exist_ok=True)

    # MLflow setup
    mlflow_cfg = params["mlops"]["mlflow"]
    mlflow.set_tracking_uri(mlflow_cfg["tracking_uri"])
    mlflow.set_experiment(mlflow_cfg["experiment_name"])

    results = {}

    # Train models for both strategies
    for strategy_type in ["TOP", "BOTTOM"]:
        y = create_labels(df, strategy_type)

        # Guard before starting the run: skipping inside the ``with`` block
        # would still register an empty run in MLflow.
        if y.sum() < 10:  # Need minimum samples
            print(f"Not enough samples for {strategy_type} strategy")
            continue

        with mlflow.start_run(run_name=f"{strategy_type}_strategy"):
            # NOTE(review): positives can be rare here; consider passing
            # ``stratify=y`` so the test split is guaranteed to contain some.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_state
            )

            # Train model
            model = RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                random_state=random_state,
            )
            model.fit(X_train, y_train)

            # Evaluate once; the same plain-float dict feeds MLflow and JSON.
            # zero_division=0 keeps the scores defined when a class is never
            # predicted.
            y_pred = model.predict(X_test)
            metrics = {
                "accuracy": float(accuracy_score(y_test, y_pred)),
                "precision": float(precision_score(y_test, y_pred, zero_division=0)),
                "recall": float(recall_score(y_test, y_pred, zero_division=0)),
                "f1_score": float(f1_score(y_test, y_pred, zero_division=0)),
            }

            # Log to MLflow
            mlflow.log_params(model_params)
            mlflow.log_param("strategy_type", strategy_type)
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(model, f"{strategy_type.lower()}_model")

            # Save model locally as well, next to the metadata file.
            model_path = f"models/{strategy_type.lower()}_strategy_model.pkl"
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

            results[strategy_type] = metrics

            print(
                f"{strategy_type} Strategy - "
                f"Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
            )

    # Save metadata (only strategies that were actually trained are listed).
    metadata = {
        "models": list(results.keys()),
        "metrics": results,
        "training_date": pd.Timestamp.now().isoformat(),
    }
    with open("models/model_metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    # Save metrics for DVC
    with open("metrics/model_metrics.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print("Training complete!")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()