# DeepFin / supervised_financial_model.py
# Amós e Souza Fernandes
# Upload 120 files
# 5f10e37 verified
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression # Example model
from sklearn.metrics import accuracy_score, classification_report
# (Placeholder for more sophisticated feature engineering)
def create_features(df: pd.DataFrame, target_lag: int = 1) -> pd.DataFrame:
    """
    Build basic features and a binary direction target from a price series.

    The input frame is not modified; all work happens on a copy.

    Added columns:
    - ``returns``: percentage change of ``Close`` versus the previous row.
    - ``target``: 1 if ``Close`` ``target_lag`` rows ahead is strictly higher
      than the current ``Close``, 0 otherwise.
    - ``ma5`` / ``ma20``: rolling means of ``Close`` over 5 and 20 rows.

    Args:
        df: Frame containing at least a ``Close`` column.
        target_lag: Number of rows to look ahead when building the target.

    Returns:
        A new frame with the added columns, restricted to rows where every
        column is known. Rows whose future close does not exist (the last
        ``target_lag`` rows) are dropped instead of being labelled 0.

    Raises:
        KeyError: If ``df`` has no ``Close`` column.
    """
    df_copy = df.copy()
    close = df_copy['Close']

    df_copy['returns'] = close.pct_change()

    # Comparing against a missing future close evaluates to False, which would
    # silently become a "price went down" label. Mark those rows as unknown so
    # that dropna() below removes them together with the rolling warm-up rows.
    future_close = close.shift(-target_lag)
    direction = (future_close > close).astype(float)
    df_copy['target'] = direction.where(future_close.notna())

    # Add more features here: e.g., RSI, MACD
    df_copy['ma5'] = close.rolling(window=5).mean()
    df_copy['ma20'] = close.rolling(window=20).mean()

    # The target was held as float only so it could carry NaN; restore the
    # integer label dtype once incomplete rows are gone.
    return df_copy.dropna().astype({'target': int})
def preprocess_data_for_supervised(
    df: pd.DataFrame,
    features_list: "list | None" = None,
    target_col: str = 'target',
    test_size: float = 0.2,
    random_state: int = 42
):
    """
    Prepare a featured frame for supervised learning.

    Steps:
    - Select the feature columns and the target column.
    - Split chronologically into train and test sets (no shuffling).
    - Standardise the features with a scaler fitted on the training part only.

    Args:
        df: Frame containing the feature columns and ``target_col``.
        features_list: Names of the feature columns. A single name or any
            sequence of names is accepted. Defaults to
            ``['returns', 'ma5', 'ma20']``.
        target_col: Name of the label column.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``. It controls
            shuffling, which is disabled here.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``.

    Raises:
        KeyError: If a requested column is missing from ``df``.
    """
    # Resolve the default inside the call so no list object is shared between
    # invocations (mutable default argument pitfall).
    if features_list is None:
        features_list = ['returns', 'ma5', 'ma20']
    elif isinstance(features_list, str):
        features_list = [features_list]

    # list(...) guarantees a 2-D frame even when a tuple of names is passed;
    # pandas would otherwise treat a tuple as one single column key.
    X = df[list(features_list)]
    y = df[target_col]

    # Time series data, so no shuffle: the test set is the most recent slice.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=False,
    )

    # Fit on the training data only so test statistics do not leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
def train_supervised_model(X_train, y_train, model_type='logistic_regression', model_params=None):
    """
    Train a supervised learning model.

    Args:
        X_train: Training features, passed to the estimator's ``fit``.
        y_train: Training labels, passed to the estimator's ``fit``.
        model_type: Which estimator to build. Only ``'logistic_regression'``
            is currently supported.
        model_params: Optional keyword arguments for the estimator
            constructor. They take precedence over the built-in defaults
            (``random_state=42``, ``max_iter=1000``).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    if model_type == 'logistic_regression':
        # Merge defaults with caller overrides instead of passing both as
        # keywords: a duplicated keyword such as ``max_iter`` would raise
        # TypeError. A new dict is built so the caller's dict is not mutated.
        params = {'random_state': 42, 'max_iter': 1000}
        if model_params:
            params.update(model_params)
        model = LogisticRegression(**params)
    # Add other model types here (e.g., SVM, RandomForest, GradientBoosting)
    # as further ``elif model_type == ...`` branches.
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    model.fit(X_train, y_train)
    return model
def evaluate_supervised_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data and print a summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True labels for the test features.

    Returns:
        Tuple ``(accuracy, report, predictions)``: the accuracy score, the
        text classification report and the predicted labels.
    """
    y_pred = model.predict(X_test)

    # Compute both metrics before printing anything.
    score = accuracy_score(y_test, y_pred)
    summary = classification_report(y_test, y_pred)

    print(f"Model Accuracy: {score:.4f}")
    print("Classification Report:")
    print(summary)

    return score, summary, y_pred
if __name__ == '__main__':
    # Usage example for the functions above; wire it to your data fetching
    # agent for real data.

    # 1. Fetch data (using the financial_data_agent you modified), e.g.:
    #      from agents.financial_data_agent import fetch_historical_ohlcv
    #      raw_data = fetch_historical_ohlcv("AAPL", period="1y", interval="1d")
    # For demonstration, a dummy DataFrame of random values is built instead.
    n_rows = 200
    ohlcv_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    business_days = pd.date_range(start='2023-01-01', periods=n_rows, freq='B')
    random_values = np.random.rand(n_rows, len(ohlcv_columns)) * 100 + 100
    raw_data = pd.DataFrame(random_values, index=business_days, columns=ohlcv_columns)

    # Overlay a sine wave on the close so it is not pure noise.
    sine_wave = np.sin(np.linspace(0, 10, n_rows)) * 10
    raw_data['Close'] = raw_data['Close'] + sine_wave

    if raw_data.empty:
        print("Raw data is empty. Check data fetching.")
    else:
        # 2. Create features
        featured_data = create_features(raw_data)

        if featured_data.empty:
            print("Featured data is empty. Check feature creation.")
        else:
            # 3. Preprocess data (the listed features must exist in the frame)
            X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data_for_supervised(
                featured_data,
                features_list=['returns', 'ma5', 'ma20'],
            )

            # 4. Train model
            print("Training supervised model...")
            trained_model = train_supervised_model(X_train_scaled, y_train)
            print("Model trained.")

            # 5. Evaluate model
            print("\nEvaluating model...")
            evaluate_supervised_model(trained_model, X_test_scaled, y_test)