File size: 4,526 Bytes
7a658e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Model training script with MLflow tracking
Trains strategy recommendation models
"""
import os
import sys
import pandas as pd
import numpy as np
import pickle
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import yaml

def load_params() -> dict:
    """Load the pipeline configuration from ``params.yaml``.

    The file is looked up relative to the current working directory and
    parsed with ``yaml.safe_load``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document; the rest of
        this script indexes it as a nested mapping.

    Raises:
        FileNotFoundError: if ``params.yaml`` is not in the working directory.
    """
    # Pin the encoding: without it, open() uses the platform locale, which
    # mis-decodes non-ASCII YAML on systems whose default is not UTF-8.
    with open("params.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def prepare_features(df: pd.DataFrame):
    """Build the model input matrix from the indicator columns.

    Selects the five feature columns (``sma_10``, ``sma_20``, ``rsi``,
    ``volatility``, ``price_position``) in that order and replaces any
    missing values with 0. The input frame is not modified.

    Args:
        df: Frame containing at least the five feature columns.

    Returns:
        A new DataFrame holding only the feature columns, with no NaNs.

    Raises:
        KeyError: if any feature column is absent from ``df``.
    """
    feature_columns = ["sma_10", "sma_20", "rsi", "volatility", "price_position"]
    selected = df[feature_columns]
    return selected.fillna(0)

def create_labels(df: pd.DataFrame, strategy_type: str = "TOP"):
    """Derive binary buy labels from the rule for the given strategy.

    Rules (all comparisons are strict):
      * ``"TOP"``: ``price_position`` above 70 and ``rsi`` between 50 and 70.
      * anything else (treated as BOTTOM): ``price_position`` below 30 and
        ``rsi`` below 30.

    Args:
        df: Frame with ``price_position`` and ``rsi`` columns.
        strategy_type: ``"TOP"`` selects the TOP rule; every other value
            selects the BOTTOM rule.

    Returns:
        An integer Series aligned with ``df``: 1 where the rule holds, else 0.
    """
    position = df["price_position"]
    rsi = df["rsi"]

    if strategy_type != "TOP":
        # BOTTOM strategy: low in the price range and an RSI below 30
        bottom_signal = (position < 30) & (rsi < 30)
        return bottom_signal.astype(int)

    # TOP strategy: high in the price range with RSI strictly inside (50, 70)
    rsi_in_band = (rsi > 50) & (rsi < 70)
    top_signal = (position > 70) & rsi_in_band
    return top_signal.astype(int)

def main():
    """Train, evaluate, log and persist one model per strategy.

    Steps:
      1. Read hyper-parameters and MLflow settings via ``load_params``.
      2. Load ``data/processed/indicators.parquet`` and drop rows that lack
         ``rsi``, ``sma_10`` or ``sma_20``.
      3. For each of ``"TOP"`` and ``"BOTTOM"``: build rule-based labels,
         fit a ``RandomForestClassifier`` on an 80/20 split, and log the
         parameters, metrics and model to MLflow.
      4. Pickle each model under ``models/`` and write the metric summaries
         to ``models/model_metadata.json`` and ``metrics/model_metrics.json``.

    A strategy with fewer than 10 positive labels is skipped. That check runs
    before the MLflow run is opened, so a skipped strategy does not leave an
    empty run in the experiment.
    """
    params = load_params()
    model_params = params["model"]["params"]

    # Load data
    df = pd.read_parquet("data/processed/indicators.parquet")
    df = df.dropna(subset=["rsi", "sma_10", "sma_20"])

    # Prepare features (shared by both strategies; only the labels differ)
    X = prepare_features(df)

    # Create output directories
    os.makedirs("models", exist_ok=True)
    os.makedirs("metrics", exist_ok=True)

    # MLflow setup
    mlflow_cfg = params["mlops"]["mlflow"]
    mlflow.set_tracking_uri(mlflow_cfg["tracking_uri"])
    mlflow.set_experiment(mlflow_cfg["experiment_name"])

    results = {}

    # Train models for both strategies
    for strategy_type in ["TOP", "BOTTOM"]:
        # Build the labels and decide whether to train *before* opening an
        # MLflow run; otherwise every skipped strategy records an empty run.
        y = create_labels(df, strategy_type)

        if y.sum() < 10:  # Need minimum number of positive samples
            print(f"Not enough samples for {strategy_type} strategy")
            continue

        with mlflow.start_run(run_name=f"{strategy_type}_strategy"):
            random_state = params["model"]["random_state"]

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_state
            )

            # Train model
            model = RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                random_state=random_state,
            )
            model.fit(X_train, y_train)

            # Evaluate; zero_division=0 reports 0 instead of warning when
            # a metric's denominator is zero on the test split.
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            # Log to MLflow
            mlflow.log_params(model_params)
            mlflow.log_param("strategy_type", strategy_type)
            mlflow.log_metrics({
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1
            })
            mlflow.sklearn.log_model(model, f"{strategy_type.lower()}_model")

            # Save model locally as well
            model_path = f"models/{strategy_type.lower()}_strategy_model.pkl"
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

            # Cast to plain floats so the values are JSON-serialisable
            results[strategy_type] = {
                "accuracy": float(accuracy),
                "precision": float(precision),
                "recall": float(recall),
                "f1_score": float(f1)
            }

            print(f"{strategy_type} Strategy - Accuracy: {accuracy:.3f}, F1: {f1:.3f}")

    # Save metadata (lists only the strategies that were actually trained)
    metadata = {
        "models": list(results.keys()),
        "metrics": results,
        "training_date": pd.Timestamp.now().isoformat()
    }
    with open("models/model_metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Save metrics for DVC
    with open("metrics/model_metrics.json", "w") as f:
        json.dump(results, f, indent=2)

    print("Training complete!")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()