""" Project: Optimizing Stock Trading Strategy With Reinforcement Learning Authors: Amey Thakur & Mega Satish Reference: https://github.com/Amey-Thakur/OPTIMIZING-STOCK-TRADING-STRATEGY-WITH-REINFORCEMENT-LEARNING License: MIT Description: This script implements the training phase of the Reinforcement Learning agent (Q-Learning). It preprocesses historical stock data, defines the market environment as a set of states based on Moving Average crossovers, and iteratively updates a Q-Table to learn optimal trading actions (Buy, Sell, Hold) that maximize portfolio returns. """ import pandas as pd import numpy as np import pickle as pkl import os # ========================================== # 1. Data Preprocessing # ========================================== def data_prep(data, name): """ Preprocesses the stock data for a specific company. Args: data (pd.DataFrame): The complete dataset containing all stocks. name (str): The ticker symbol of the stock to filter (e.g., 'AAPL'). Returns: tuple: (train_df, test_df) - The split training and testing datasets. Methodology: - Filters data by stock name. - Computes Technical Indicators: 5-day and 1-day Moving Averages (MA). - 5-day MA represents the short-term trend baseline. - 1-day MA represents the immediate price action. - The interaction between these two MAs serves as the primary signal for state determination. """ df = pd.DataFrame(data[data['Name'] == name]) df.dropna(inplace=True) df.drop(['high', 'low', 'volume', 'Name'], axis=1, inplace=True) df.reset_index(drop=True, inplace=True) # Calculating Moving Averages used for State Definition df['5day_MA'] = df['close'].rolling(5).mean() df['1day_MA'] = df['close'].rolling(1).mean() # Initialize first few rows where rolling mean is NaN df.loc[:4, '5day_MA'] = 0 # Splitting into Train (80%) and Test (20%) sets split_idx = int(len(df) * 0.8) train_df = df[:split_idx] test_df = df[split_idx:].reset_index(drop=True) return train_df, test_df # ========================================== # 2. Environment & State Definitions # ========================================== def get_state(long_ma, short_ma, t): """ Discretizes continuous market data into a finite set of states. The state space is defined by a tuple (Trend_Signal, Holding_Status). 1. Trend_Signal: - 0: short_ma < long_ma (Bearish/Downtrend) - 1: short_ma > long_ma (Bullish/Uptrend) 2. Holding_Status (t): - 0: Currently holding stock - 1: Currently holding cash (no stock) Returns: tuple: (trend, holding_status) representing the current environment state. """ if short_ma < long_ma: if t == 1: return (0, 1) # Bearish Trend, Holding Cash else: return (0, 0) # Bearish Trend, Holding Stock elif short_ma > long_ma: if t == 1: return (1, 1) # Bullish Trend, Holding Cash else: return (1, 0) # Bullish Trend, Holding Stock # Default case (should rarely be hit with floats) return (0, 1) def trade_t(num_of_stocks, port_value, current_price): """ Determines the holding capability of the agent. Returns: int: 1 if the agent has capital to buy (Cash), 0 if fully invested (Stock). """ # Simply mapping: if we have stocks or cash value > current price, we can 'technically' buy/hold # But in this simplified binary state (All-in or All-out), we track logical status. # Here, we simplify: if num_of_stocks > 0: return 0 # User holds stock else: if port_value > current_price: return 1 # User holds cash and can afford stock else: return 0 # User is broke/cannot buy # ========================================== # 3. 

# ==========================================
# 3. Q-Learning Agent Logic
# ==========================================

def next_act(state, qtable, epsilon, action_space=3):
    """
    Selects the next action using the epsilon-greedy policy.

    Args:
        state (tuple): The current state of the environment.
        qtable (np.array): The Q-Table storing action-values.
        epsilon (float): Exploration rate (probability of a random action).
        action_space (int): Number of available actions (default 3).

    Returns:
        int: The selected action index.
             0: Buy
             1: Sell
             2: Hold
    """
    if np.random.rand() < epsilon:
        # Exploration: random action
        action = np.random.randint(action_space)
    else:
        # Exploitation: best known action from the Q-Table
        action = np.argmax(qtable[state])
    return action


def get_reward(state, action, current_close, past_close, buy_history):
    """
    Calculates the immediate reward for a given state-action pair.

    The reward function guides the agent by:
        - Penalizing invalid moves (e.g., buying when already holding).
        - Rewarding profit generation (selling higher than the buy price).
        - Rewarding capital preservation (holding through daily moves).
    """
    if state == (0, 0) or state == (1, 0):  # State: holding stock
        if action == 0:    # Attempt to buy again
            return -1000   # Heavy penalty for an illegal move
        elif action == 1:  # Sell
            return current_close - buy_history  # Realized PnL
        elif action == 2:  # Hold
            return current_close - past_close   # Unrealized daily change

    elif state == (0, 1) or state == (1, 1):  # State: holding cash
        if action == 0:    # Buy
            return 0       # Neutral reward for entering a position
        elif action == 1:  # Attempt to sell with no position
            return -1000   # Heavy penalty for an illegal move
        elif action == 2:  # Hold (wait)
            return current_close - past_close   # Opportunity cost/benefit

    return 0
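
# An illustrative single-step sketch (hypothetical numbers, same
# hyperparameters as train_model below) of the Bellman update applied in
# the training loop: a Sell reward of 2.0 in state (1, 0) nudges
# Q[(1, 0)][1] toward the bootstrapped target by a factor of alpha.
def _demo_q_update():
    """Apply one Q-Learning update to a zero-initialized Q-Table."""
    alpha, gamma = 0.05, 0.15           # learning rate / discount factor
    q = np.zeros((2, 2, 3))             # (trend, holding, action)
    state, action, reward = (1, 0), 1, 2.0
    next_state = (1, 1)                 # after selling, the agent holds cash
    q[state][action] = (1. - alpha) * q[state][action] \
        + alpha * (reward + gamma * np.max(q[next_state]))
    print(q[state][action])             # 0.05 * 2.0 = 0.1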

# ==========================================
# 4. Main Training Loop
# ==========================================

def train_model():
    print("Initializing Training Process...")

    # 4.1 Initialize Q-Table
    # Dimensions: 2 (Trend States) x 2 (Holding States) x 3 (Actions)
    env_rows = 2
    env_cols = 2
    n_action = 3
    q_table = np.zeros((env_rows, env_cols, n_action))

    # 4.2 Load Data
    try:
        stocks = pd.read_csv('all_stocks_5yr.csv')
        # We train primarily on AAPL as the representative asset
        stocks_train, _ = data_prep(stocks, 'AAPL')
    except FileNotFoundError:
        print("Error: 'all_stocks_5yr.csv' not found.")
        return

    # 4.3 Hyperparameters
    episodes = 100   # Number of passes over the training set
    epsilon = 1.0    # Initial exploration rate (100% random actions)
    alpha = 0.05     # Learning rate (weight of new information)
    gamma = 0.15     # Discount factor (weight of future rewards)

    print(f"Starting Training for {episodes} episodes...")

    for i in range(episodes):
        # Reset episode variables
        num_stocks = 0
        buy_history = 0
        net_worth = [1000]  # Tracks available cash; starts at $1000

        # Iterate over the time series
        for dt in range(len(stocks_train)):
            long_ma = stocks_train.iloc[dt]['5day_MA']
            short_ma = stocks_train.iloc[dt]['1day_MA']
            close_price = stocks_train.iloc[dt]['close']

            # Previous close for the reward calculation
            past_close = stocks_train.iloc[dt - 1]['close'] if dt > 0 else close_price

            # Determine current state
            t = trade_t(num_stocks, net_worth[-1], close_price)
            state = get_state(long_ma, short_ma, t)

            # Select action (epsilon-greedy)
            action = next_act(state, q_table, epsilon)

            # Execute action & update the portfolio. Illegal moves (buying
            # while holding or broke, selling with no position) leave the
            # portfolio untouched; get_reward() still penalizes them.
            if action == 0 and num_stocks == 0 and net_worth[-1] > close_price:
                # Buy: go all-in on one share
                num_stocks += 1
                buy_history = close_price
                net_worth.append(np.round(net_worth[-1] - close_price, 1))
            elif action == 1 and num_stocks > 0:
                # Sell: liquidate the position (entry price lives in
                # buy_history and is handled in the reward)
                num_stocks -= 1
                net_worth.append(np.round(net_worth[-1] + close_price, 1))
            else:
                # Hold (or an illegal move): cash balance is unchanged
                net_worth.append(net_worth[-1])

            # Compute reward
            r = get_reward(state, action, close_price, past_close, buy_history)

            # Observe next state
            try:
                next_long = stocks_train.iloc[dt + 1]['5day_MA']
                next_short = stocks_train.iloc[dt + 1]['1day_MA']
                # Recompute the holding status after the action so the
                # bootstrapped value refers to the state actually reached
                next_t = trade_t(num_stocks, net_worth[-1], close_price)
                next_state = get_state(next_long, next_short, next_t)
            except IndexError:
                break  # End of data

            # Update Q-Value using the Bellman equation:
            # Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
            q_table[state][action] = (1. - alpha) * q_table[state][action] \
                + alpha * (r + gamma * np.max(q_table[next_state]))

        # Decay epsilon once per episode to reduce exploration over time,
        # flooring near 0.15 so some exploration always remains
        if (epsilon - 0.01) > 0.15:
            epsilon -= 0.01

        if (i + 1) % 10 == 0:
            print(f"Episode {i+1}/{episodes} complete. Epsilon: {epsilon:.2f}")

    print("Training Complete.")

    # 4.4 Save the trained Q-Table
    with open('model.pkl', 'wb') as f:
        pkl.dump(q_table, f)
    print("Model saved to 'model.pkl'.")


if __name__ == "__main__":
    train_model()
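
# Usage sketch (assumes a completed training run has written 'model.pkl'):
# the persisted Q-Table can be reloaded for evaluation, e.g. from the
# companion testing script or an interactive session:
#
#     with open('model.pkl', 'rb') as f:
#         q_table = pkl.load(f)
#     print(q_table.shape)  # (2, 2, 3): (trend, holding, action)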