"""
Project: Optimizing Stock Trading Strategy With Reinforcement Learning
Authors: Amey Thakur & Mega Satish
Reference: https://github.com/Amey-Thakur/OPTIMIZING-STOCK-TRADING-STRATEGY-WITH-REINFORCEMENT-LEARNING
License: MIT
Description:
This script implements the training phase of the Reinforcement Learning agent (Q-Learning).
It preprocesses historical stock data, defines the market environment as a set of states
based on Moving Average crossovers, and iteratively updates a Q-Table to learn optimal
trading actions (Buy, Sell, Hold) that maximize portfolio returns.
"""
import pandas as pd
import numpy as np
import pickle as pkl
import os
# ==========================================
# 1. Data Preprocessing
# ==========================================
def data_prep(data, name):
    """
    Preprocesses the stock data for a specific company.

    Args:
        data (pd.DataFrame): The complete dataset containing all stocks. The code
            reads the 'Name' and 'close' columns and drops 'high', 'low' and 'volume'.
        name (str): The ticker symbol of the stock to filter (e.g., 'AAPL').

    Returns:
        tuple: (train_df, test_df) - the first 80% of the rows and the remaining 20%.
        The test frame is re-indexed from 0.

    Methodology:
        - Filters data by stock name and drops rows containing missing values.
        - Computes Technical Indicators: 5-day and 1-day Moving Averages (MA).
        - 5-day MA represents the short-term trend baseline.
        - 1-day MA represents the immediate price action.
        - The interaction between these two MAs serves as the primary signal
          for state determination.
    """
    # Take an explicit copy so the in-place clean-up below never touches (or warns
    # about touching) the caller's DataFrame.
    df = data[data['Name'] == name].copy()
    df.dropna(inplace=True)
    df.drop(['high', 'low', 'volume', 'Name'], axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Calculating Moving Averages used for State Definition
    df['5day_MA'] = df['close'].rolling(5).mean()
    df['1day_MA'] = df['close'].rolling(1).mean()

    # Only the rows without a full 5-day window are NaN (indices 0-3). Fill exactly
    # those with 0; a label slice such as `.loc[:4]` is end-inclusive and would also
    # wipe out the first valid average at index 4.
    df['5day_MA'] = df['5day_MA'].fillna(0)

    # Splitting into Train (80%) and Test (20%) sets
    split_idx = int(len(df) * 0.8)
    train_df = df[:split_idx]
    test_df = df[split_idx:].reset_index(drop=True)
    return train_df, test_df
# ==========================================
# 2. Environment & State Definitions
# ==========================================
def get_state(long_ma, short_ma, t):
    """
    Discretizes continuous market data into a finite set of states.

    The state space is defined by a tuple (Trend_Signal, Holding_Status).

    1. Trend_Signal:
        - 1: short_ma > long_ma (Bullish/Uptrend)
        - 0: otherwise (Bearish/Downtrend; equal or NaN averages also map here)
    2. Holding_Status (t):
        - 0: Currently holding stock
        - 1: Currently holding cash (no stock)

    Args:
        long_ma: Value of the long (5-day) moving average.
        short_ma: Value of the short (1-day) moving average.
        t: Holding flag; only the value 1 is treated as "holding cash".

    Returns:
        tuple: (trend, holding_status) representing the current environment state.
    """
    trend = 1 if short_ma > long_ma else 0
    # Always carry the real holding flag through. Falling back to a fixed
    # "holding cash" state when the averages tie would misreport an agent
    # that actually holds stock.
    holding_status = 1 if t == 1 else 0
    return (trend, holding_status)
def trade_t(num_of_stocks, port_value, current_price):
    """
    Reports the binary holding flag used as the second component of the state.

    Args:
        num_of_stocks: Number of shares currently held.
        port_value: Value available to spend on a purchase.
        current_price: Current price of one share.

    Returns:
        int: 1 when no shares are held and port_value exceeds current_price
        (cash that can afford a share); 0 in every other case (shares are held,
        or a share cannot be afforded).
    """
    # Any open position means "holding stock", regardless of the cash left over.
    if num_of_stocks > 0:
        return 0
    # Without a position the flag is 1 only when a share is strictly affordable.
    return 1 if port_value > current_price else 0
# ==========================================
# 3. Q-Learning Agent Logic
# ==========================================
def next_act(state, qtable, epsilon, action_space=3):
    """
    Picks the next action with an Epsilon-Greedy Policy.

    Args:
        state (tuple): The current state of the environment, used to index qtable.
        qtable (np.array): The Q-Table storing action-values.
        epsilon (float): Exploration rate (probability of a random action).
        action_space (int): Number of actions to draw from when exploring.

    Returns:
        int: The selected action index.
            0: Buy
            1: Sell
            2: Hold
    """
    should_explore = np.random.rand() < epsilon
    if should_explore:
        # Exploration: uniformly random action index in [0, action_space)
        return np.random.randint(action_space)
    # Exploitation: action with the highest stored value for this state
    return np.argmax(qtable[state])
def get_reward(state, action, current_close, past_close, buy_history):
    """
    Returns the immediate reward for a state-action pair.

    Reward table (actions: 0 = Buy, 1 = Sell, 2 = Hold):
        - Holding Stock, states (0, 0) / (1, 0):
            Buy  -> -1000 (illegal move)
            Sell -> current_close - buy_history (realized PnL)
            Hold -> current_close - past_close (daily change)
        - Holding Cash, states (0, 1) / (1, 1):
            Buy  -> 0 (neutral reward for entering a position)
            Sell -> -1000 (illegal move)
            Hold -> current_close - past_close (daily change)
        - Any other state or action -> 0
    """
    if state in ((0, 0), (1, 0)):
        holding_stock = True
    elif state in ((0, 1), (1, 1)):
        holding_stock = False
    else:
        # Unknown state: nothing to reward or punish
        return 0

    if action == 2:
        # Hold pays the day-over-day price change in both holding modes
        return current_close - past_close
    if action == 0:
        # Buying is only legal from cash
        return -1000 if holding_stock else 0
    if action == 1:
        # Selling is only legal from stock and pays the realized PnL
        return (current_close - buy_history) if holding_stock else -1000
    return 0
# ==========================================
# 4. Main Training Loop
# ==========================================
def train_model(data_path='all_stocks_5yr.csv', ticker='AAPL', episodes=100,
                model_path='model.pkl'):
    """
    Trains the Q-Learning agent on one ticker and pickles the resulting Q-Table.

    Args:
        data_path (str): CSV file with the historical prices of all stocks.
        ticker (str): Ticker symbol to train on.
        episodes (int): Number of passes over the training data.
        model_path (str): Destination file for the pickled Q-Table.

    Returns:
        None. On success the Q-Table is written to model_path. If data_path does
        not exist, or the ticker has fewer than two training rows, an error is
        printed and nothing is saved.

    Notes:
        - The agent is all-in or all-out: it holds either zero shares or one share.
        - A Buy while not in the cash state, or a Sell without a share, leaves
          the portfolio untouched; the action is still scored by get_reward.
        - The last training row is only used as the "next state" of the row
          before it, so every executed action produces a Q-Table update.
    """
    print("Initializing Training Process...")

    # 4.1 Initialize Q-Table
    # Dimensions: 2 (Trend States) x 2 (Holding States) x 3 (Actions)
    env_rows = 2
    env_cols = 2
    n_action = 3
    q_table = np.zeros((env_rows, env_cols, n_action))

    # 4.2 Load Data
    # Keep the try body limited to the read so unrelated errors are not masked.
    try:
        stocks = pd.read_csv(data_path)
    except FileNotFoundError:
        print(f"Error: '{data_path}' not found.")
        return

    stocks_train, _ = data_prep(stocks, ticker)
    if len(stocks_train) < 2:
        # At least one (state, next state) transition is needed to learn anything.
        print(f"Error: not enough data for '{ticker}' to train on.")
        return

    # Pull the columns out once; per-step DataFrame lookups dominate the run time.
    long_mas = stocks_train['5day_MA'].to_numpy()
    short_mas = stocks_train['1day_MA'].to_numpy()
    closes = stocks_train['close'].to_numpy()
    n_steps = len(closes)

    # 4.3 Hyperparameters
    epsilon = 1.0  # Initial Exploration Rate (100% random)
    alpha = 0.05   # Learning Rate (Impact of new information)
    gamma = 0.15   # Discount Factor (Importance of future rewards)

    print(f"Starting Training for {episodes} episodes...")
    for i in range(episodes):
        # Reset Episode Variables
        cash = 1000
        num_stocks = 0
        buy_history = 0

        # Iterate over the time-series (the final row only serves as a next state)
        for dt in range(n_steps - 1):
            close_price = closes[dt]
            # Previous close for the reward; the first row has no predecessor
            past_close = closes[dt - 1] if dt > 0 else close_price

            # Determine Current State
            t = trade_t(num_stocks, cash, close_price)
            state = get_state(long_mas[dt], short_mas[dt], t)

            # Select Action
            action = next_act(state, q_table, epsilon)

            # Execute Action: only legal moves change the portfolio
            if action == 0 and t == 1:  # Buy with available cash
                num_stocks += 1
                buy_history = close_price
                cash = np.round(cash - close_price, 1)
            elif action == 1 and num_stocks > 0:  # Sell the held share
                num_stocks -= 1
                cash = np.round(cash + close_price, 1)
            # Hold (and any illegal move) leaves cash and holdings unchanged

            # Compute Reward
            r = get_reward(state, action, close_price, past_close, buy_history)

            # Observe Next State using the holding status *after* the action
            next_t = trade_t(num_stocks, cash, closes[dt + 1])
            next_state = get_state(long_mas[dt + 1], short_mas[dt + 1], next_t)

            # Update Q-Value using Bellman Equation
            # Q(s,a) = (1-alpha) * Q(s,a) + alpha * (reward + gamma * max(Q(s', a')))
            q_table[state][action] = (
                (1. - alpha) * q_table[state][action]
                + alpha * (r + gamma * np.max(q_table[next_state]))
            )

        # Decay Epsilon to reduce exploration over time (floor just above 0.15)
        if (epsilon - 0.01) > 0.15:
            epsilon -= 0.01

        if (i + 1) % 10 == 0:
            print(f"Episode {i+1}/{episodes} complete. Epsilon: {epsilon:.2f}")

    print("Training Complete.")

    # 4.4 Save the Trained Model
    with open(model_path, 'wb') as f:
        pkl.dump(q_table, f)
    print(f"Model saved to '{model_path}'.")
# Run the training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train_model()