File size: 4,664 Bytes
5f10e37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression # Example model
from sklearn.metrics import accuracy_score, classification_report

# (Placeholder for more sophisticated feature engineering)
def create_features(df: pd.DataFrame, target_lag: int = 1) -> pd.DataFrame:
    """
    Creates basic features for a financial time series.

    Expects a 'Close' column and adds:
    - 'returns': simple percentage change of 'Close'
    - 'ma5' / 'ma20': 5- and 20-period rolling means of 'Close'
    - 'target': 1 if 'Close' target_lag rows ahead is higher, else 0

    Rows without enough history for the rolling windows, and the final
    target_lag rows (whose future close is unknown), are dropped. The
    original implementation compared against a NaN-shifted series, which
    silently mislabeled those last rows as target == 0 (`NaN > x` is
    False); keeping them NaN lets dropna() remove them instead.

    Args:
        df: Input frame with at least a 'Close' column. Not modified.
        target_lag: How many rows ahead the target looks; must be >= 1.

    Returns:
        A new DataFrame with the feature columns and an integer 'target'
        column, NaN rows removed.

    Raises:
        ValueError: If target_lag < 1.
    """
    if target_lag < 1:
        raise ValueError(f"target_lag must be >= 1, got {target_lag}")

    df_copy = df.copy()
    df_copy['returns'] = df_copy['Close'].pct_change()

    # Keep the label NaN where the future close is unknown so that those
    # rows are dropped below rather than mislabeled as 0.
    future_close = df_copy['Close'].shift(-target_lag)
    df_copy['target'] = (future_close > df_copy['Close']).where(future_close.notna())

    # Add more features: e.g., moving averages, RSI, MACD
    df_copy['ma5'] = df_copy['Close'].rolling(window=5).mean()
    df_copy['ma20'] = df_copy['Close'].rolling(window=20).mean()

    df_copy = df_copy.dropna()
    # Restore the integer dtype callers expect for the binary label.
    df_copy['target'] = df_copy['target'].astype(int)
    return df_copy

def preprocess_data_for_supervised(
    df: pd.DataFrame,
    features_list: list | None = None,
    target_col: str = 'target',
    test_size: float = 0.2,
    random_state: int = 42
):
    """
    Prepares data for supervised learning.

    - Selects feature columns and the target column.
    - Splits chronologically into train/test (shuffle=False: this is time
      series data, so shuffling would leak future information).
    - Scales features with a StandardScaler fit on the training split only.

    Args:
        df: Frame containing the feature columns and target_col.
        features_list: Feature column names; defaults to
            ['returns', 'ma5', 'ma20']. The default is built per call —
            the original signature used a mutable list default, a classic
            Python pitfall.
        target_col: Name of the target column.
        test_size: Fraction of rows reserved for the test split.
        random_state: Kept for interface compatibility; with shuffle=False
            the split is deterministic regardless of this value.

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler)

    Raises:
        KeyError: If any requested column is missing from df.
    """
    if features_list is None:
        features_list = ['returns', 'ma5', 'ma20']

    # Fail early with a readable message instead of a bare pandas KeyError.
    missing = [c for c in list(features_list) + [target_col] if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns in DataFrame: {missing}")

    X = df[features_list]
    y = df[target_col]

    # Time series data, so no shuffle: test set is the chronological tail.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False
    )

    # Fit on train only to avoid leaking test-set statistics into scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

def train_supervised_model(X_train, y_train, model_type='logistic_regression', model_params=None):
    """
    Trains a supervised learning model.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        model_type: Currently only 'logistic_regression' is supported.
        model_params: Optional dict of estimator keyword arguments.
            Caller-supplied values take precedence over the built-in
            defaults (random_state=42, max_iter=1000). The original code
            passed both **model_params and hard-coded kwargs, which raised
            "got multiple values for keyword argument" whenever the caller
            set random_state or max_iter themselves.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If model_type is not supported.
    """
    # Copy so the caller's dict is never mutated by setdefault below.
    params = dict(model_params) if model_params else {}

    if model_type == 'logistic_regression':
        params.setdefault('random_state', 42)
        params.setdefault('max_iter', 1000)  # sklearn's default 100 often fails to converge
        model = LogisticRegression(**params)
    # Extension point: add other estimators here, e.g.
    #   elif model_type == 'random_forest':
    #       from sklearn.ensemble import RandomForestClassifier
    #       params.setdefault('random_state', 42)
    #       model = RandomForestClassifier(**params)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    model.fit(X_train, y_train)
    return model

def evaluate_supervised_model(model, X_test, y_test):
    """
    Evaluates a trained supervised model on held-out data.

    Prints the accuracy and a full classification report, and also returns
    them (together with the raw predictions) for programmatic use.

    Args:
        model: A fitted estimator exposing .predict().
        X_test: Test feature matrix.
        y_test: True labels for X_test.

    Returns:
        (accuracy, report, predictions)
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    clf_report = classification_report(y_test, y_pred)

    print(f"Model Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(clf_report)

    return acc, clf_report, y_pred

if __name__ == '__main__':
    # Demo of the full pipeline. In production, replace the synthetic frame
    # below with output from the data-fetching agent, e.g.:
    # from agents.financial_data_agent import fetch_historical_ohlcv
    # raw_data = fetch_historical_ohlcv("AAPL", period="1y", interval="1d")

    # Synthetic OHLCV frame for demonstration.
    n_days = 200
    business_days = pd.date_range(start='2023-01-01', periods=n_days, freq='B')
    prices = np.random.rand(n_days, 5) * 100 + 100
    raw_data = pd.DataFrame(
        prices,
        index=business_days,
        columns=['Open', 'High', 'Low', 'Close', 'Volume'],
    )
    # Inject a sinusoidal trend so the up/down target is not pure noise.
    raw_data['Close'] += np.sin(np.linspace(0, 10, n_days)) * 10

    if raw_data.empty:
        print("Raw data is empty. Check data fetching.")
    else:
        featured_data = create_features(raw_data)
        if featured_data.empty:
            print("Featured data is empty. Check feature creation.")
        else:
            X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data_for_supervised(
                featured_data,
                features_list=['returns', 'ma5', 'ma20'],  # must match create_features output
            )

            print("Training supervised model...")
            trained_model = train_supervised_model(X_train_scaled, y_train)
            print("Model trained.")

            print("\nEvaluating model...")
            evaluate_supervised_model(trained_model, X_test_scaled, y_test)