import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# --- Hyperparameters ---
LEARNING_RATE = 1e-3               # Step size handed to the Adam optimizer
GAMMA = 0.95                       # Discount applied to the bootstrapped next-state value
EPSILON_START = 1.0                # Initial probability of taking a random action
EPSILON_END = 0.01                 # Lower bound for the exploration probability
EPSILON_DECAY = 0.995              # Multiplicative epsilon decay applied after each episode
MEMORY_SIZE = 10_000               # Maximum number of transitions kept in the replay buffer
BATCH_SIZE = 64                    # Transitions sampled for every gradient step
EPISODES = 1000                    # Number of training episodes
MODEL_PATH = "blackjack_cnn.pth"   # Local filename the trained weights are written to

class BlackjackCNN(nn.Module):
    """Small convolutional network mapping a 3x3 grid to two output values.

    Input is a batch of single-channel 3x3 grids, shape ``(N, 1, 3, 3)``;
    output has shape ``(N, 2)``. The fixed ``Linear(800, ...)`` layer means
    no other spatial input size is accepted.
    """

    def __init__(self):
        super().__init__()

        # Each 2x2 convolution with padding=1 and stride=1 grows the spatial
        # size by one in each dimension: 3x3 -> 4x4 -> 5x5.
        feature_layers = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=2, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=2, stride=1, padding=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # 32 channels * 5 * 5 positions = 800 features after flattening.
        head_layers = [
            nn.Flatten(),
            nn.Linear(in_features=800, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=2),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the convolutional stack, then the fully connected head."""
        features = self.conv(x)
        outputs = self.fc(features)
        return outputs

def preprocess_state(state):
    """Encode an observation as a ``(1, 1, 3, 3)`` float32 tensor.

    The first three entries of ``state`` are placed on the diagonal of an
    otherwise zero 3x3 grid: ``state[0] / 31`` at (0, 0), ``state[1] / 10``
    at (1, 1), and 1.0 or 0.0 for the truthiness of ``state[2]`` at (2, 2).
    Presumably ``state`` is the Blackjack-v1 observation
    (player sum, dealer card, usable ace) - confirm against the environment.
    """
    usable_flag = 1.0 if state[2] else 0.0
    diagonal = [state[0] / 31.0, state[1] / 10.0, usable_flag]
    grid = np.diag(diagonal)  # 3x3 matrix, zero everywhere off the diagonal
    # Leading (1, 1) dims are the batch and channel axes the network expects.
    return torch.tensor(grid, dtype=torch.float32).view(1, 1, 3, 3)

# --- Training Loop ---
# Epsilon-greedy Q-learning with experience replay: every environment step is
# stored in a bounded buffer, and once the buffer holds more than BATCH_SIZE
# transitions one gradient step is taken per environment step.
env = gym.make('Blackjack-v1')
policy_net = BlackjackCNN()
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)
loss_fn = nn.MSELoss()  # Stateless, so build it once instead of on every step
memory = deque(maxlen=MEMORY_SIZE)
epsilon = EPSILON_START

print(f"Starting training for {EPISODES} episodes...")

for episode in range(EPISODES):
    obs, info = env.reset()
    state_img = preprocess_state(obs)
    done = False

    while not done:
        # Explore with probability epsilon, otherwise act greedily.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                action = policy_net(state_img).argmax().item()

        next_obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # Either flag ends the episode loop
        next_state_img = preprocess_state(next_obs)

        # Store `terminated` rather than `done`: only a true terminal state
        # has zero future value. A truncated episode was merely cut short, so
        # its next-state value must still be bootstrapped.
        memory.append((state_img, action, reward, next_state_img, terminated))
        state_img = next_state_img

        if len(memory) > BATCH_SIZE:
            batch = random.sample(memory, BATCH_SIZE)
            states, actions, rewards, next_states, terminals = zip(*batch)

            states = torch.cat(states)
            # gather() needs an int64 index of shape (batch, 1).
            actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.cat(next_states)
            terminals = torch.tensor(terminals, dtype=torch.float32)

            # Q(s, a) for the actions actually taken. squeeze(1) drops only
            # the gather dimension, so the result stays 1-D for any batch size.
            current_q = policy_net(states).gather(1, actions).squeeze(1)

            # The target is treated as a constant, so skip building an
            # autograd graph for the next-state forward pass.
            with torch.no_grad():
                next_q = policy_net(next_states).max(1)[0]
            target_q = rewards + (GAMMA * next_q * (1 - terminals))

            loss = loss_fn(current_q, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decay exploration once per episode, never dropping below EPSILON_END.
    epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1} | Epsilon: {epsilon:.2f}")

# --- Save the Model ---
torch.save(policy_net.state_dict(), MODEL_PATH)
print(f"\nModel saved locally to {MODEL_PATH}")

# --- Quick Test ---
# Reload the weights from disk so the rounds below really exercise the saved
# checkpoint (and confirm the file loads) rather than only the in-memory net.
policy_net.load_state_dict(torch.load(MODEL_PATH))
print("\nTesting saved model for 5 rounds:")
policy_net.eval()  # Set to evaluation mode
for i in range(5):
    obs, _ = env.reset()
    state_img = preprocess_state(obs)
    with torch.no_grad():
        action = policy_net(state_img).argmax().item()
    # Only the greedy first move for each freshly dealt hand is reported;
    # the round itself is not played out.
    action_name = "HIT" if action == 1 else "STICK"
    print(f"Round {i+1}: Hand={obs[0]}, Dealer={obs[1]}, Action={action_name}")

# Release the environment's resources now that training and testing are done.
env.close()