Spaces:
Running
Running
| """ | |
| Project: Optimizing Stock Trading Strategy With Reinforcement Learning | |
| Authors: Amey Thakur & Mega Satish | |
| Reference: https://github.com/Amey-Thakur/OPTIMIZING-STOCK-TRADING-STRATEGY-WITH-REINFORCEMENT-LEARNING | |
| License: MIT | |
| Description: | |
| This script implements the training phase of the Reinforcement Learning agent (Q-Learning). | |
| It preprocesses historical stock data, defines the market environment as a set of states | |
| based on Moving Average crossovers, and iteratively updates a Q-Table to learn optimal | |
| trading actions (Buy, Sell, Hold) that maximize portfolio returns. | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import pickle as pkl | |
| import os | |
| # ========================================== | |
| # 1. Data Preprocessing | |
| # ========================================== | |
def data_prep(data, name):
    """
    Preprocesses the stock data for a specific company.

    Args:
        data (pd.DataFrame): The complete dataset containing all stocks. Must
            provide the columns 'Name', 'high', 'low', 'volume' and 'close'.
        name (str): The ticker symbol of the stock to filter (e.g., 'AAPL').

    Returns:
        tuple: (train_df, test_df) - the first 80% of the rows and the
        remaining 20% (the latter re-indexed from 0).

    Methodology:
        - Filters data by stock name and drops rows with missing values.
        - Removes the 'high', 'low', 'volume' and 'Name' columns.
        - Adds '5day_MA' (5-row rolling mean of 'close') and '1day_MA'
          (1-row rolling mean, i.e. the close itself).
        - The comparison of these two columns is the signal used for
          state determination during training.
    """
    # Work on an explicit copy so the in-place operations below never touch
    # (or warn about) the caller's DataFrame.
    df = data.loc[data['Name'] == name].copy()
    df.dropna(inplace=True)
    df.drop(['high', 'low', 'volume', 'Name'], axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Calculating Moving Averages used for State Definition
    df['5day_MA'] = df['close'].rolling(5).mean()
    df['1day_MA'] = df['close'].rolling(1).mean()

    # Only the warm-up rows (the first four) have no 5-day mean yet; zero just
    # those. A label slice such as .loc[:4] is inclusive and would also wipe
    # the first valid average at row 4.
    df['5day_MA'] = df['5day_MA'].fillna(0)

    # Splitting into Train (80%) and Test (20%) sets
    split_idx = int(len(df) * 0.8)
    train_df = df.iloc[:split_idx].copy()
    test_df = df.iloc[split_idx:].reset_index(drop=True)
    return train_df, test_df
| # ========================================== | |
| # 2. Environment & State Definitions | |
| # ========================================== | |
def get_state(long_ma, short_ma, t):
    """
    Discretizes continuous market data into a finite set of states.

    The state is a tuple (Trend_Signal, Holding_Status):

    1. Trend_Signal:
        - 1: short_ma > long_ma (Bullish/Uptrend)
        - 0: otherwise (Bearish/Downtrend; an exact tie is treated as
          non-bullish)
    2. Holding_Status:
        - 1: t == 1 (holding cash and able to buy)
        - 0: any other value of t (holding stock / unable to buy)

    Args:
        long_ma: Value of the longer moving average.
        short_ma: Value of the shorter moving average.
        t: Holding flag as produced by trade_t.

    Returns:
        tuple: (trend, holding_status), usable directly as an index into the
        first two axes of the Q-table.
    """
    trend = 1 if short_ma > long_ma else 0
    # Always carry the real holding flag through. Previously a tie between
    # the two averages fell back to (0, 1), reporting "cash" even when the
    # agent was holding stock.
    holding_status = 1 if t == 1 else 0
    return (trend, holding_status)
def trade_t(num_of_stocks, port_value, current_price):
    """
    Derives the binary holding flag used to build the environment state.

    Args:
        num_of_stocks: Number of shares currently held.
        port_value: Funds available to the agent.
        current_price: Price of one share right now.

    Returns:
        int: 1 when no shares are held and port_value is strictly greater
        than current_price (the agent can buy); 0 when shares are held or
        when the agent cannot afford a share.
    """
    # Any open position maps to 0, regardless of the remaining funds.
    holds_position = num_of_stocks > 0
    if holds_position:
        return 0

    # Out of the market: 1 only if one share is affordable.
    can_afford = port_value > current_price
    return 1 if can_afford else 0
| # ========================================== | |
| # 3. Q-Learning Agent Logic | |
| # ========================================== | |
def next_act(state, qtable, epsilon, action_space=3):
    """
    Chooses an action with an epsilon-greedy rule.

    Args:
        state (tuple): Current state; used to index the Q-table.
        qtable (np.array): Q-table holding the action-values per state.
        epsilon (float): Probability of picking a random action.
        action_space (int): Number of available actions (default 3).

    Returns:
        int: Index of the chosen action.
            0: Buy
            1: Sell
            2: Hold
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Exploitation: highest-valued action recorded for this state
        return np.argmax(qtable[state])
    # Exploration: uniformly random action
    return np.random.randint(action_space)
def get_reward(state, action, current_close, past_close, buy_history):
    """
    Calculates the immediate reward for a given state-action pair.

    Args:
        state (tuple): (trend, holding_status); holding_status 0 means
            holding stock, 1 means holding cash.
        action (int): 0 = Buy, 1 = Sell, 2 = Hold.
        current_close: Closing price of the current step.
        past_close: Closing price of the previous step.
        buy_history: Price at which the current position was bought.

    Returns:
        The reward:
        - Holding stock: Buy -> -1000 (illegal), Sell -> realized PnL
          (current_close - buy_history), Hold -> daily price change
          (current_close - past_close).
        - Holding cash: Buy -> 0, Sell -> -1000 (illegal), Hold ->
          past_close - current_close, so sitting out a falling market is
          rewarded and sitting out a rally is charged as opportunity cost.
        - Any unrecognized state or action -> 0.
    """
    holding_stock = state in ((0, 0), (1, 0))
    holding_cash = state in ((0, 1), (1, 1))

    if holding_stock:
        if action == 0:  # Try to Buy again
            return -1000  # Heavy Penalty for illegal move
        if action == 1:  # Sell
            return current_close - buy_history  # Realized PnL
        if action == 2:  # Hold
            return current_close - past_close  # Unrealized daily change
    elif holding_cash:
        if action == 0:  # Buy
            return 0  # Neutral reward for entering position
        if action == 1:  # Try to Sell again
            return -1000  # Heavy Penalty for illegal move
        if action == 2:  # Hold (Wait)
            # Sign is inverted relative to the in-position case: staying in
            # cash gains when the price drops (capital preserved) and loses
            # when it rises (missed move).
            return past_close - current_close
    return 0
| # ========================================== | |
| # 4. Main Training Loop | |
| # ========================================== | |
def train_model():
    """
    Trains a tabular Q-Learning agent on AAPL and pickles the Q-table.

    Steps:
        1. Create a zero-initialized Q-table of shape (2, 2, 3):
           trend state x holding state x action (Buy, Sell, Hold).
        2. Read 'all_stocks_5yr.csv' and keep the training split that
           data_prep returns for 'AAPL'.
        3. Run 100 episodes over the training rows with an epsilon-greedy
           policy, updating the Q-table after every step.
        4. Write the Q-table to 'model.pkl'.

    Returns:
        None. Prints an error and returns early when the CSV file is missing.
    """
    print("Initializing Training Process...")

    # 4.1 Initialize Q-Table
    # Dimensions: 2 (Trend States) x 2 (Holding States) x 3 (Actions)
    env_rows = 2
    env_cols = 2
    n_action = 3
    q_table = np.zeros((env_rows, env_cols, n_action))

    # 4.2 Load Data
    try:
        stocks = pd.read_csv('all_stocks_5yr.csv')
    except FileNotFoundError:
        print("Error: 'all_stocks_5yr.csv' not found.")
        return
    # We train primarily on AAPL as the representative asset for this strategy
    stocks_train, _ = data_prep(stocks, 'AAPL')

    # Pull the columns out once; row-by-row .iloc lookups inside the loop are
    # needlessly slow and return the same values.
    long_mas = stocks_train['5day_MA'].to_numpy()
    short_mas = stocks_train['1day_MA'].to_numpy()
    closes = stocks_train['close'].to_numpy()
    n_steps = len(closes)

    # 4.3 Hyperparameters
    episodes = 100  # Number of times to iterate over the dataset
    epsilon = 1.0   # Initial Exploration Rate (100% random)
    alpha = 0.05    # Learning Rate (Impact of new information)
    gamma = 0.15    # Discount Factor (Importance of future rewards)

    print(f"Starting Training for {episodes} episodes...")
    for i in range(episodes):
        # Reset Episode Variables
        cash = 1000
        num_stocks = 0
        buy_history = 0

        # The last row has no successor to bootstrap from, so the walk stops
        # one step before the end of the series.
        for dt in range(n_steps - 1):
            close_price = closes[dt]
            past_close = closes[dt - 1] if dt > 0 else close_price

            # Determine Current State
            t = trade_t(num_stocks, cash, close_price)
            state = get_state(long_mas[dt], short_mas[dt], t)

            # Select Action and compute its reward
            action = next_act(state, q_table, epsilon)
            r = get_reward(state, action, close_price, past_close, buy_history)

            # Execute Action. Only legal trades touch the portfolio: a
            # rejected Buy/Sell still receives its reward (penalty) above but
            # must not create extra or negative share counts.
            if action == 0 and t == 1:  # Buy (in cash and affordable)
                num_stocks += 1
                buy_history = close_price
                cash = np.round(cash - close_price, 1)
            elif action == 1 and num_stocks > 0:  # Sell an open position
                num_stocks -= 1
                cash = np.round(cash + close_price, 1)
            # Hold (action 2) leaves both cash and position unchanged.

            # Observe Next State using the holding flag *after* the action,
            # priced at the next step.
            next_t = trade_t(num_stocks, cash, closes[dt + 1])
            next_state = get_state(long_mas[dt + 1], short_mas[dt + 1], next_t)

            # Update Q-Value using Bellman Equation
            # Q(s,a) = (1-alpha) * Q(s,a) + alpha * (reward + gamma * max(Q(s', a')))
            td_target = r + gamma * np.max(q_table[next_state])
            q_table[state][action] = (1. - alpha) * q_table[state][action] + alpha * td_target

        # Decay Epsilon once per episode, never going to 0.15 or below
        if (epsilon - 0.01) > 0.15:
            epsilon -= 0.01

        if (i + 1) % 10 == 0:
            print(f"Episode {i+1}/{episodes} complete. Epsilon: {epsilon:.2f}")

    print("Training Complete.")

    # 4.4 Save the Trained Model
    with open('model.pkl', 'wb') as f:
        pkl.dump(q_table, f)
    print("Model saved to 'model.pkl'.")


if __name__ == "__main__":
    train_model()