"""
Project: Optimizing Stock Trading Strategy With Reinforcement Learning
Authors: Amey Thakur & Mega Satish
Reference: https://github.com/Amey-Thakur/OPTIMIZING-STOCK-TRADING-STRATEGY-WITH-REINFORCEMENT-LEARNING
License: MIT
Description:
This script implements the training phase of the Reinforcement Learning agent (Q-Learning).
It preprocesses historical stock data, defines the market environment as a set of states
based on Moving Average crossovers, and iteratively updates a Q-Table to learn optimal
trading actions (Buy, Sell, Hold) that maximize portfolio returns.
"""
import pandas as pd
import numpy as np
import pickle as pkl
# ==========================================
# 1. Data Preprocessing
# ==========================================
def data_prep(data, name):
    """
    Preprocesses the stock data for a specific company.

    Args:
        data (pd.DataFrame): The complete dataset containing all stocks.
        name (str): The ticker symbol of the stock to filter (e.g., 'AAPL').

    Returns:
        tuple: (train_df, test_df) - The split training and testing datasets.

    Methodology:
        - Filters data by stock name.
        - Computes Technical Indicators: 5-day and 1-day Moving Averages (MA).
          - 5-day MA represents the short-term trend baseline.
          - 1-day MA represents the immediate price action.
        - The interaction between these two MAs serves as the primary signal
          for state determination.
    """
    df = pd.DataFrame(data[data['Name'] == name])
    df.dropna(inplace=True)
    df.drop(['high', 'low', 'volume', 'Name'], axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Calculating Moving Averages used for State Definition
    df['5day_MA'] = df['close'].rolling(5).mean()
    df['1day_MA'] = df['close'].rolling(1).mean()  # identical to 'close' itself

    # Zero-fill the leading rows where the 5-day rolling mean is NaN
    # (the first four rows, which lack a full 5-day window)
    df['5day_MA'] = df['5day_MA'].fillna(0)

    # Splitting into Train (80%) and Test (20%) sets
    split_idx = int(len(df) * 0.8)
    train_df = df[:split_idx]
    test_df = df[split_idx:].reset_index(drop=True)
    return train_df, test_df
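# Example usage (a minimal sketch, commented out so it does not run on import;
# assumes the 'all_stocks_5yr.csv' file loaded in train_model() below, whose
# columns include 'close' and 'Name'):
#
#   stocks = pd.read_csv('all_stocks_5yr.csv')
#   train_df, test_df = data_prep(stocks, 'AAPL')
#   print(train_df[['close', '1day_MA', '5day_MA']].head())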
# ==========================================
# 2. Environment & State Definitions
# ==========================================
def get_state(long_ma, short_ma, t):
    """
    Discretizes continuous market data into a finite set of states.

    The state space is defined by a tuple (Trend_Signal, Holding_Status).

    1. Trend_Signal:
       - 0: short_ma < long_ma (Bearish/Downtrend)
       - 1: short_ma > long_ma (Bullish/Uptrend)
    2. Holding_Status (t):
       - 0: Currently holding stock
       - 1: Currently holding cash (no stock)

    Returns:
        tuple: (trend, holding_status) representing the current environment state.
    """
    if short_ma < long_ma:
        if t == 1:
            return (0, 1)  # Bearish Trend, Holding Cash
        else:
            return (0, 0)  # Bearish Trend, Holding Stock
    elif short_ma > long_ma:
        if t == 1:
            return (1, 1)  # Bullish Trend, Holding Cash
        else:
            return (1, 0)  # Bullish Trend, Holding Stock
    # Default case: MAs exactly equal (should rarely be hit with floats)
    return (0, 1)
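# Worked examples (values are hypothetical):
#
#   get_state(long_ma=150.0, short_ma=148.0, t=1)   # -> (0, 1): bearish, holding cash
#   get_state(long_ma=150.0, short_ma=152.5, t=0)   # -> (1, 0): bullish, holding stock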
def trade_t(num_of_stocks, port_value, current_price):
    """
    Determines the holding status of the agent.

    Returns:
        int: 1 if the agent holds cash and can afford the stock,
             0 if fully invested (or unable to buy).
    """
    # The strategy is binary (all-in or all-out), so we only track the
    # logical holding status rather than exact position sizes.
    if num_of_stocks > 0:
        return 0  # Agent holds stock
    else:
        if port_value > current_price:
            return 1  # Agent holds cash and can afford the stock
        else:
            return 0  # Agent cannot afford to buy
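# Worked examples (values are hypothetical):
#
#   trade_t(num_of_stocks=1, port_value=500, current_price=150)    # -> 0: holding stock
#   trade_t(num_of_stocks=0, port_value=1000, current_price=150)   # -> 1: cash, can afford to buy
#   trade_t(num_of_stocks=0, port_value=100, current_price=150)    # -> 0: cannot afford to buy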
# ==========================================
# 3. Q-Learning Agent Logic
# ==========================================
def next_act(state, qtable, epsilon, action_space=3):
    """
    Selects the next action using the Epsilon-Greedy Policy.

    Args:
        state (tuple): The current state of the environment.
        qtable (np.array): The Q-Table storing action-values.
        epsilon (float): Exploration rate (probability of random action).

    Returns:
        int: The selected action index.
             0: Buy
             1: Sell
             2: Hold
    """
    if np.random.rand() < epsilon:
        # Exploration: Random action
        action = np.random.randint(action_space)
    else:
        # Exploitation: Best known action from Q-Table
        action = np.argmax(qtable[state])
    return action
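# Example (a sketch with hypothetical Q-values): with epsilon=0.0 the policy is
# purely greedy, so the highest-valued action for the state is returned:
#
#   q = np.zeros((2, 2, 3))
#   q[(0, 1)] = [0.5, -1.0, 0.2]
#   next_act((0, 1), q, epsilon=0.0)   # -> 0 (Buy)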
def get_reward(state, action, current_close, past_close, buy_history):
    """
    Calculates the immediate reward for a given state-action pair.

    The Reward Function is crucial for guiding the agent:
        - Penalize invalid moves (e.g., Buying when already holding).
        - Reward profit generation (Selling higher than bought).
        - Reward capital preservation (Holding during downturns).
    """
    if state == (0, 0) or state == (1, 0):  # State: Holding Stock
        if action == 0:    # Try to Buy again
            return -1000   # Heavy penalty for illegal move
        elif action == 1:  # Sell
            return current_close - buy_history  # Reward is the realized PnL
        elif action == 2:  # Hold
            return current_close - past_close   # Reward is the unrealized daily change
    elif state == (0, 1) or state == (1, 1):  # State: Holding Cash
        if action == 0:    # Buy
            return 0       # Neutral reward for entering a position
        elif action == 1:  # Try to Sell again
            return -1000   # Heavy penalty for illegal move
        elif action == 2:  # Hold (Wait)
            return current_close - past_close   # Opportunity cost/benefit tracking
    return 0
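# Worked examples (hypothetical prices; holding stock bought at 100, with
# today's close at 105 and yesterday's close at 103):
#
#   get_reward((1, 0), 1, 105, 103, 100)   # -> 5      (Sell: realized PnL)
#   get_reward((1, 0), 2, 105, 103, 100)   # -> 2      (Hold: daily change)
#   get_reward((1, 0), 0, 105, 103, 100)   # -> -1000  (illegal Buy while holding)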
# ==========================================
# 4. Main Training Loop
# ==========================================
def train_model():
    print("Initializing Training Process...")

    # 4.1 Initialize Q-Table
    # Dimensions: 2 (Trend States) x 2 (Holding States) x 3 (Actions)
    env_rows = 2
    env_cols = 2
    n_action = 3
    q_table = np.zeros((env_rows, env_cols, n_action))

    # 4.2 Load Data
    try:
        stocks = pd.read_csv('all_stocks_5yr.csv')
        # We train primarily on AAPL as the representative asset for this strategy
        stocks_train, _ = data_prep(stocks, 'AAPL')
    except FileNotFoundError:
        print("Error: 'all_stocks_5yr.csv' not found.")
        return

    # 4.3 Hyperparameters
    episodes = 100   # Number of times to iterate over the dataset
    epsilon = 1.0    # Initial Exploration Rate (100% random)
    alpha = 0.05     # Learning Rate (impact of new information)
    gamma = 0.15     # Discount Factor (importance of future rewards)
print(f"Starting Training for {episodes} episodes...")
for i in range(episodes):
# Reset Episode Variables
port_value = 1000
num_stocks = 0
buy_history = 0
net_worth = [1000]
# Iterate over the time-series
for dt in range(len(stocks_train)):
long_ma = stocks_train.iloc[dt]['5day_MA']
short_ma = stocks_train.iloc[dt]['1day_MA']
close_price = stocks_train.iloc[dt]['close']
# Get Previous Close for Reward Calc
if dt > 0:
past_close = stocks_train.iloc[dt-1]['close']
else:
past_close = close_price
# Determine Current State
t = trade_t(num_stocks, net_worth[-1], close_price)
state = get_state(long_ma, short_ma, t)
# Select Action
action = next_act(state, q_table, epsilon)
# Execute Action & Update Portfolio Logic
if action == 0: # Buy
num_stocks += 1
buy_history = close_price
net_worth.append(np.round(net_worth[-1] - close_price, 1))
r = 0 # Reward calculated later if needed, mostly 0 for entry
elif action == 1: # Sell
num_stocks -= 1
net_worth.append(np.round(net_worth[-1] + close_price, 1))
# buy_history handled in reward
elif action == 2: # Hold
net_worth.append(np.round(net_worth[-1] + close_price, 1)) # Simplified tracking
# Compute Reward
r = get_reward(state, action, close_price, past_close, buy_history)
# Observe Next State
try:
next_long = stocks_train.iloc[dt+1]['5day_MA']
next_short = stocks_train.iloc[dt+1]['1day_MA']
next_state = get_state(next_long, next_short, t)
except IndexError:
# End of data
break
# Update Q-Value using Bellman Equation
# Q(s,a) = (1-alpha) * Q(s,a) + alpha * (reward + gamma * max(Q(s', a')))
q_table[state][action] = (1. - alpha) * q_table[state][action] + alpha * (r + gamma * np.max(q_table[next_state]))
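            # Numeric sketch of a single update with hypothetical values:
            # Q(s,a)=0.0, r=2.0, max(Q(s',.))=0.5, alpha=0.05, gamma=0.15:
            #   Q(s,a) <- 0.95 * 0.0 + 0.05 * (2.0 + 0.15 * 0.5) = 0.10375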
        # Decay Epsilon once per episode to reduce exploration over time
        # (bottoms out at 0.16 given the 0.01 step)
        if (epsilon - 0.01) > 0.15:
            epsilon -= 0.01

        if (i + 1) % 10 == 0:
            print(f"Episode {i+1}/{episodes} complete. Epsilon: {epsilon:.2f}")
print("Training Complete.")
# 4.4 Save the Trained Model
with open('model.pkl', 'wb') as f:
pkl.dump(q_table, f)
print("Model saved to 'model.pkl'.")
if __name__ == "__main__":
train_model()