|
|
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
|
|
def create_features(df: pd.DataFrame, target_lag: int = 1) -> pd.DataFrame:
    """
    Build basic features and a binary direction target for a price series.

    The following columns are added to a copy of ``df`` (the input frame is
    not modified):

    - ``returns``: one-row percentage change of ``Close``.
    - ``target``: 1 if ``Close`` is strictly higher ``target_lag`` rows ahead,
      otherwise 0.
    - ``ma5`` / ``ma20``: 5-row and 20-row rolling means of ``Close``.

    Rows containing NaN in any column are then dropped. This removes the
    rolling-window warm-up rows and the trailing rows whose future close is
    unknown.

    Args:
        df: Frame with a ``Close`` column; rows are assumed to be in
            chronological order.
        target_lag: Number of rows to look ahead for the target; must be >= 1.

    Returns:
        A new DataFrame with the added columns and no NaN rows.

    Raises:
        ValueError: If ``target_lag`` is less than 1.
    """
    if target_lag < 1:
        raise ValueError(f"target_lag must be >= 1, got {target_lag}")

    df_copy = df.copy()
    close = df_copy['Close']

    df_copy['returns'] = close.pct_change()

    # A comparison against NaN is False, which would silently label the last
    # `target_lag` rows as "down". Keep the target as NaN wherever the future
    # close is unknown so that dropna() below removes those rows instead.
    future_close = close.shift(-target_lag)
    df_copy['target'] = (future_close > close).astype(float).where(future_close.notna())

    df_copy['ma5'] = close.rolling(window=5).mean()
    df_copy['ma20'] = close.rolling(window=20).mean()

    # The target can only become an integer column once the NaN rows are gone.
    return df_copy.dropna().astype({'target': int})
|
|
|
|
|
def preprocess_data_for_supervised(
    df: pd.DataFrame,
    features_list: Optional[list] = None,
    target_col: str = 'target',
    test_size: float = 0.2,
    random_state: int = 42,
):
    """
    Prepare feature and target arrays for supervised learning.

    Steps:

    - Select the feature columns and the target column from ``df``.
    - Split into train and test sets without shuffling, so the test set is
      the trailing ``test_size`` share of the rows.
    - Fit a ``StandardScaler`` on the training features only and apply it to
      both sets, so no test-set statistics leak into training.

    Args:
        df: Frame containing the feature columns and ``target_col``.
        features_list: Names of the feature columns. Defaults to
            ``['returns', 'ma5', 'ma20']`` when ``None``.
        target_col: Name of the target column.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``; it has no
            effect on the split because ``shuffle=False``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``.
        The scaled features are the scaler's outputs, the targets are slices
        of ``df[target_col]``, and ``scaler`` is the fitted ``StandardScaler``.
    """
    # A None sentinel replaces the former mutable list default, which was a
    # single object shared by every call.
    if features_list is None:
        features_list = ['returns', 'ma5', 'ma20']

    X = df[list(features_list)]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
|
|
|
|
|
def train_supervised_model(X_train, y_train, model_type='logistic_regression', model_params=None):
    """
    Train a supervised learning model.

    Args:
        X_train: Training features, passed to the model's ``fit``.
        y_train: Training targets, passed to the model's ``fit``.
        model_type: Which model to build. Only ``'logistic_regression'`` is
            supported.
        model_params: Optional keyword arguments for the model constructor.
            They override the defaults ``random_state=42`` and
            ``max_iter=1000``.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    if model_type != 'logistic_regression':
        raise ValueError(f"Unsupported model type: {model_type}")

    # Merge defaults with the caller's params instead of passing both as
    # keywords: a caller-supplied random_state or max_iter used to raise
    # "TypeError: got multiple values for keyword argument".
    params = {'random_state': 42, 'max_iter': 1000, **(model_params or {})}
    model = LogisticRegression(**params)

    model.fit(X_train, y_train)
    return model
|
|
|
|
|
def evaluate_supervised_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data and print a summary.

    Calls ``model.predict(X_test)``, compares the result with ``y_test``
    using ``accuracy_score`` and ``classification_report``, and prints both.

    Returns:
        Tuple ``(accuracy, report, predictions)``.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Model Accuracy: {accuracy:.4f}")
    # One call with a newline separator prints the header and the report on
    # consecutive lines, the same as two separate print calls.
    print("Classification Report:", report_text, sep="\n")

    return accuracy, report_text, y_pred
|
|
|
|
|
def main() -> None:
    """
    Run the feature -> preprocess -> train -> evaluate pipeline on synthetic data.

    The synthetic frame has 200 business-day rows of uniform random values in
    [100, 200) for each OHLCV column, with a sine wave added to ``Close``.
    """
    n_rows = 200
    # A seeded generator keeps the demo reproducible from run to run, in line
    # with the fixed random_state used for the model.
    rng = np.random.default_rng(42)

    dates = pd.date_range(start='2023-01-01', periods=n_rows, freq='B')
    data = rng.random((n_rows, 5)) * 100 + 100
    raw_data = pd.DataFrame(data, index=dates, columns=['Open', 'High', 'Low', 'Close', 'Volume'])
    raw_data['Close'] = raw_data['Close'] + np.sin(np.linspace(0, 10, n_rows)) * 10

    if raw_data.empty:
        print("Raw data is empty. Check data fetching.")
        return

    featured_data = create_features(raw_data)
    if featured_data.empty:
        print("Featured data is empty. Check feature creation.")
        return

    X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data_for_supervised(
        featured_data,
        features_list=['returns', 'ma5', 'ma20'],
    )

    print("Training supervised model...")
    trained_model = train_supervised_model(X_train_scaled, y_train)
    print("Model trained.")

    print("\nEvaluating model...")
    evaluate_supervised_model(trained_model, X_test_scaled, y_test)


if __name__ == '__main__':
    main()