"""Supervised-learning pipeline for financial time series.

Feature engineering, chronological train/test preparation, model training,
and evaluation built around scikit-learn.  The ``__main__`` block shows the
intended end-to-end usage with synthetic OHLCV data.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression  # Example model
from sklearn.metrics import accuracy_score, classification_report

# Default feature columns produced by create_features().
DEFAULT_FEATURES = ('returns', 'ma5', 'ma20')


# (Placeholder for more sophisticated feature engineering)
def create_features(df: pd.DataFrame, target_lag: int = 1) -> pd.DataFrame:
    """Create basic features and a binary target for a price series.

    Features: daily percentage returns plus 5- and 20-period moving
    averages of ``Close``.  Target: 1 if the close ``target_lag`` rows
    ahead is higher than the current close, else 0.

    Args:
        df: OHLCV data containing at least a ``Close`` column.
        target_lag: How many rows ahead to look when building the target.

    Returns:
        A copy of ``df`` with ``returns``, ``ma5``, ``ma20`` and ``target``
        columns.  Rows whose features or target are undefined (warm-up
        rows for the rolling windows, and the final ``target_lag`` rows
        whose future close is unknown) are dropped.
    """
    df_copy = df.copy()
    df_copy['returns'] = df_copy['Close'].pct_change()
    df_copy['ma5'] = df_copy['Close'].rolling(window=5).mean()
    df_copy['ma20'] = df_copy['Close'].rolling(window=20).mean()

    # Bug fix: comparing shift(-target_lag) directly gives False (-> 0)
    # wherever the future close is NaN, so the last ``target_lag`` rows
    # silently got a bogus target of 0.  Mark those rows NaN instead so
    # dropna() removes them, then cast back to int.
    future_close = df_copy['Close'].shift(-target_lag)
    df_copy['target'] = np.where(
        future_close.notna(),
        (future_close > df_copy['Close']).astype(float),
        np.nan,
    )

    df_copy = df_copy.dropna()
    df_copy['target'] = df_copy['target'].astype(int)
    return df_copy


def preprocess_data_for_supervised(
    df: pd.DataFrame,
    features_list=None,
    target_col: str = 'target',
    test_size: float = 0.2,
    random_state: int = 42,
):
    """Prepare featured data for supervised learning.

    Selects features and target, splits chronologically into train/test
    sets, and standard-scales the features.

    Args:
        df: Output of :func:`create_features` (or equivalent).
        features_list: Feature column names; defaults to
            ``DEFAULT_FEATURES``.  (A ``None`` sentinel replaces the
            original mutable-list default argument.)
        target_col: Name of the binary target column.
        test_size: Fraction of rows reserved for the test set.
        random_state: Kept for interface compatibility; has no effect
            because the split is unshuffled.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``
        where ``scaler`` is the fitted :class:`StandardScaler`.
    """
    if features_list is None:
        features_list = list(DEFAULT_FEATURES)

    X = df[list(features_list)]
    y = df[target_col]

    # Chronological split: shuffling time-series data would leak future
    # information into the training set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False
    )

    # Fit the scaler on training data only to avoid look-ahead leakage.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler


def train_supervised_model(X_train, y_train, model_type='logistic_regression', model_params=None):
    """Train a supervised classification model.

    Args:
        X_train: Scaled feature matrix.
        y_train: Binary target vector aligned with ``X_train``.
        model_type: Currently only ``'logistic_regression'``.
        model_params: Extra keyword arguments forwarded to the model
            constructor (``None`` means no extras).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    if model_params is None:
        model_params = {}

    if model_type == 'logistic_regression':
        # max_iter raised from the sklearn default so the solver converges
        # on poorly-conditioned inputs.
        model = LogisticRegression(**model_params, random_state=42, max_iter=1000)
    # Add other model types here (e.g., SVM, RandomForest, GradientBoosting)
    # elif model_type == 'random_forest':
    #     from sklearn.ensemble import RandomForestClassifier
    #     model = RandomForestClassifier(**model_params, random_state=42)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    model.fit(X_train, y_train)
    return model


def evaluate_supervised_model(model, X_test, y_test):
    """Evaluate a trained classifier on held-out data.

    Prints accuracy and the full classification report.

    Args:
        model: A fitted estimator with a ``predict`` method.
        X_test: Scaled test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, report, predictions)``.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    print(f"Model Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    return accuracy, report, predictions


if __name__ == '__main__':
    # This is an example of how to use the functions.
    # You'll need to integrate this with your data fetching agent.

    # 1. Fetch data (using the financial_data_agent you modified)
    # from agents.financial_data_agent import fetch_historical_ohlcv
    # raw_data = fetch_historical_ohlcv("AAPL", period="1y", interval="1d")

    # For demonstration, creating a dummy DataFrame:
    dates = pd.date_range(start='2023-01-01', periods=200, freq='B')
    data = np.random.rand(200, 5) * 100 + 100
    raw_data = pd.DataFrame(
        data, index=dates, columns=['Open', 'High', 'Low', 'Close', 'Volume']
    )
    raw_data['Close'] = raw_data['Close'] + np.sin(np.linspace(0, 10, 200)) * 10  # Add some trend

    if not raw_data.empty:
        # 2. Create features
        featured_data = create_features(raw_data)

        if not featured_data.empty:
            # 3. Preprocess data
            X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data_for_supervised(
                featured_data,
                features_list=['returns', 'ma5', 'ma20'],  # Ensure these features exist
            )

            # 4. Train model
            print("Training supervised model...")
            trained_model = train_supervised_model(X_train_scaled, y_train)
            print("Model trained.")

            # 5. Evaluate model
            print("\nEvaluating model...")
            evaluate_supervised_model(trained_model, X_test_scaled, y_test)
        else:
            print("Featured data is empty. Check feature creation.")
    else:
        print("Raw data is empty. Check data fetching.")