"""
Project: Optimizing Stock Trading Strategy With Reinforcement Learning
Authors: Amey Thakur & Mega Satish
Reference: https://github.com/Amey-Thakur/OPTIMIZING-STOCK-TRADING-STRATEGY-WITH-REINFORCEMENT-LEARNING
License: MIT
Description:
This script implements the training phase of the Reinforcement Learning agent (Q-Learning).
It preprocesses historical stock data, defines the market environment as a set of states
based on Moving Average crossovers, and iteratively updates a Q-Table to learn optimal
trading actions (Buy, Sell, Hold) that maximize portfolio returns.
"""
import pandas as pd
import numpy as np
import pickle as pkl
# ==========================================
# 1. Data Preprocessing
# ==========================================
def data_prep(data, name):
    """
    Preprocesses the stock data for a specific company.

    Args:
        data (pd.DataFrame): The complete dataset containing all stocks.
        name (str): The ticker symbol of the stock to filter (e.g., 'AAPL').

    Returns:
        tuple: (train_df, test_df) - The split training and testing datasets.

    Methodology:
        - Filters data by stock name.
        - Computes Technical Indicators: 5-day and 1-day Moving Averages (MA).
          - 5-day MA represents the short-term trend baseline.
          - 1-day MA represents the immediate price action.
        - The interaction between these two MAs serves as the primary signal
          for state determination.
    """
    df = pd.DataFrame(data[data['Name'] == name])
    df.dropna(inplace=True)
    df.drop(['high', 'low', 'volume', 'Name'], axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Calculating Moving Averages used for State Definition
    df['5day_MA'] = df['close'].rolling(5).mean()
    df['1day_MA'] = df['close'].rolling(1).mean()  # identical to 'close' itself

    # Zero-fill the leading rows where the 5-day rolling mean is NaN
    # (the first four rows, which lack a full 5-day window)
    df['5day_MA'] = df['5day_MA'].fillna(0)

    # Splitting into Train (80%) and Test (20%) sets
    split_idx = int(len(df) * 0.8)
    train_df = df[:split_idx]
    test_df = df[split_idx:].reset_index(drop=True)
    return train_df, test_df
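# Example usage (a minimal sketch, commented out so it does not run on import;
# assumes the 'all_stocks_5yr.csv' file loaded in train_model() below, whose
# columns include 'close' and 'Name'):
#
#   stocks = pd.read_csv('all_stocks_5yr.csv')
#   train_df, test_df = data_prep(stocks, 'AAPL')
#   print(train_df[['close', '1day_MA', '5day_MA']].head())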
# ==========================================
# 2. Environment & State Definitions
# ==========================================
def get_state(long_ma, short_ma, t):
    """
    Discretizes continuous market data into a finite set of states.

    The state space is defined by a tuple (Trend_Signal, Holding_Status).

    1. Trend_Signal:
       - 0: short_ma < long_ma (Bearish/Downtrend)
       - 1: short_ma > long_ma (Bullish/Uptrend)
    2. Holding_Status (t):
       - 0: Currently holding stock
       - 1: Currently holding cash (no stock)

    Returns:
        tuple: (trend, holding_status) representing the current environment state.
    """
    if short_ma < long_ma:
        if t == 1:
            return (0, 1)  # Bearish Trend, Holding Cash
        else:
            return (0, 0)  # Bearish Trend, Holding Stock
    elif short_ma > long_ma:
        if t == 1:
            return (1, 1)  # Bullish Trend, Holding Cash
        else:
            return (1, 0)  # Bullish Trend, Holding Stock
    # Default case: MAs exactly equal (should rarely be hit with floats)
    return (0, 1)
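# Worked examples (values are hypothetical):
#
#   get_state(long_ma=150.0, short_ma=148.0, t=1)   # -> (0, 1): bearish, holding cash
#   get_state(long_ma=150.0, short_ma=152.5, t=0)   # -> (1, 0): bullish, holding stock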
def trade_t(num_of_stocks, port_value, current_price):
    """
    Determines the holding status of the agent.

    Returns:
        int: 1 if the agent holds cash and can afford the stock,
             0 if fully invested (or unable to buy).
    """
    # The strategy is binary (all-in or all-out), so we only track the
    # logical holding status rather than exact position sizes.
    if num_of_stocks > 0:
        return 0  # Agent holds stock
    else:
        if port_value > current_price:
            return 1  # Agent holds cash and can afford the stock
        else:
            return 0  # Agent cannot afford to buy
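# Worked examples (values are hypothetical):
#
#   trade_t(num_of_stocks=1, port_value=500, current_price=150)    # -> 0: holding stock
#   trade_t(num_of_stocks=0, port_value=1000, current_price=150)   # -> 1: cash, can afford to buy
#   trade_t(num_of_stocks=0, port_value=100, current_price=150)    # -> 0: cannot afford to buy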
# ==========================================
# 3. Q-Learning Agent Logic
# ==========================================
def next_act(state, qtable, epsilon, action_space=3):
    """
    Selects the next action using the Epsilon-Greedy Policy.

    Args:
        state (tuple): The current state of the environment.
        qtable (np.array): The Q-Table storing action-values.
        epsilon (float): Exploration rate (probability of random action).

    Returns:
        int: The selected action index.
             0: Buy
             1: Sell
             2: Hold
    """
    if np.random.rand() < epsilon:
        # Exploration: Random action
        action = np.random.randint(action_space)
    else:
        # Exploitation: Best known action from Q-Table
        action = np.argmax(qtable[state])
    return action
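# Example (a sketch with hypothetical Q-values): with epsilon=0.0 the policy is
# purely greedy, so the highest-valued action for the state is returned:
#
#   q = np.zeros((2, 2, 3))
#   q[(0, 1)] = [0.5, -1.0, 0.2]
#   next_act((0, 1), q, epsilon=0.0)   # -> 0 (Buy)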
def get_reward(state, action, current_close, past_close, buy_history):
    """
    Calculates the immediate reward for a given state-action pair.

    The Reward Function is crucial for guiding the agent:
        - Penalize invalid moves (e.g., Buying when already holding).
        - Reward profit generation (Selling higher than bought).
        - Reward capital preservation (Holding during downturns).
    """
    if state == (0, 0) or state == (1, 0):  # State: Holding Stock
        if action == 0:    # Try to Buy again
            return -1000   # Heavy penalty for illegal move
        elif action == 1:  # Sell
            return current_close - buy_history  # Reward is the realized PnL
        elif action == 2:  # Hold
            return current_close - past_close   # Reward is the unrealized daily change
    elif state == (0, 1) or state == (1, 1):  # State: Holding Cash
        if action == 0:    # Buy
            return 0       # Neutral reward for entering a position
        elif action == 1:  # Try to Sell again
            return -1000   # Heavy penalty for illegal move
        elif action == 2:  # Hold (Wait)
            return current_close - past_close   # Opportunity cost/benefit tracking
    return 0
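# Worked examples (hypothetical prices; holding stock bought at 100, with
# today's close at 105 and yesterday's close at 103):
#
#   get_reward((1, 0), 1, 105, 103, 100)   # -> 5      (Sell: realized PnL)
#   get_reward((1, 0), 2, 105, 103, 100)   # -> 2      (Hold: daily change)
#   get_reward((1, 0), 0, 105, 103, 100)   # -> -1000  (illegal Buy while holding)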
# ==========================================
# 4. Main Training Loop
# ==========================================
def train_model():
    print("Initializing Training Process...")

    # 4.1 Initialize Q-Table
    # Dimensions: 2 (Trend States) x 2 (Holding States) x 3 (Actions)
    env_rows = 2
    env_cols = 2
    n_action = 3
    q_table = np.zeros((env_rows, env_cols, n_action))

    # 4.2 Load Data
    try:
        stocks = pd.read_csv('all_stocks_5yr.csv')
        # We train primarily on AAPL as the representative asset for this strategy
        stocks_train, _ = data_prep(stocks, 'AAPL')
    except FileNotFoundError:
        print("Error: 'all_stocks_5yr.csv' not found.")
        return

    # 4.3 Hyperparameters
    episodes = 100   # Number of times to iterate over the dataset
    epsilon = 1.0    # Initial Exploration Rate (100% random)
    alpha = 0.05     # Learning Rate (impact of new information)
    gamma = 0.15     # Discount Factor (importance of future rewards)
print(f"Starting Training for {episodes} episodes...")
for i in range(episodes):
# Reset Episode Variables
port_value = 1000
num_stocks = 0
buy_history = 0
net_worth = [1000]
# Iterate over the time-series
for dt in range(len(stocks_train)):
long_ma = stocks_train.iloc[dt]['5day_MA']
short_ma = stocks_train.iloc[dt]['1day_MA']
close_price = stocks_train.iloc[dt]['close']
# Get Previous Close for Reward Calc
if dt > 0:
past_close = stocks_train.iloc[dt-1]['close']
else:
past_close = close_price
# Determine Current State
t = trade_t(num_stocks, net_worth[-1], close_price)
state = get_state(long_ma, short_ma, t)
# Select Action
action = next_act(state, q_table, epsilon)
# Execute Action & Update Portfolio Logic
if action == 0: # Buy
num_stocks += 1
buy_history = close_price
net_worth.append(np.round(net_worth[-1] - close_price, 1))
r = 0 # Reward calculated later if needed, mostly 0 for entry
elif action == 1: # Sell
num_stocks -= 1
net_worth.append(np.round(net_worth[-1] + close_price, 1))
# buy_history handled in reward
elif action == 2: # Hold
net_worth.append(np.round(net_worth[-1] + close_price, 1)) # Simplified tracking
# Compute Reward
r = get_reward(state, action, close_price, past_close, buy_history)
# Observe Next State
try:
next_long = stocks_train.iloc[dt+1]['5day_MA']
next_short = stocks_train.iloc[dt+1]['1day_MA']
next_state = get_state(next_long, next_short, t)
except IndexError:
# End of data
break
# Update Q-Value using Bellman Equation
# Q(s,a) = (1-alpha) * Q(s,a) + alpha * (reward + gamma * max(Q(s', a')))
q_table[state][action] = (1. - alpha) * q_table[state][action] + alpha * (r + gamma * np.max(q_table[next_state]))
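            # Numeric sketch of a single update with hypothetical values:
            # Q(s,a)=0.0, r=2.0, max(Q(s',.))=0.5, alpha=0.05, gamma=0.15:
            #   Q(s,a) <- 0.95 * 0.0 + 0.05 * (2.0 + 0.15 * 0.5) = 0.10375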
        # Decay Epsilon once per episode to reduce exploration over time
        # (bottoms out at 0.16 given the 0.01 step)
        if (epsilon - 0.01) > 0.15:
            epsilon -= 0.01

        if (i + 1) % 10 == 0:
            print(f"Episode {i+1}/{episodes} complete. Epsilon: {epsilon:.2f}")
print("Training Complete.")
# 4.4 Save the Trained Model
with open('model.pkl', 'wb') as f:
pkl.dump(q_table, f)
print("Model saved to 'model.pkl'.")
if __name__ == "__main__":
train_model()