#!/usr/bin/env python3
"""Generate DQN notebook."""

import nbformat as nbf

nb = nbf.v4.new_notebook()
nb.metadata = {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},"language_info": {"name": "python", "version": "3.12.0"}}

cells = []
def md(s): cells.append(nbf.v4.new_markdown_cell(s))
def code(s): cells.append(nbf.v4.new_code_cell(s))

md("# DQN: Deep Q-Network\n\nReinforcement learning with experience replay on CartPole-v1.")

md("""## 背景

DQN（Mnih et al. 2015）将深度学习与 Q-Learning 结合，首次在 Atari 游戏上达到人类水平。
核心创新：

- **经验回放（Experience Replay）**：存储过去经验，随机采样训练，打破数据相关性
- **目标网络（Target Network）**：固定 Q-target，稳定训练

环境：**CartPole-v1** — 控制小车左右移动，保持杆子不倒。
状态：4 维（位置、速度、角度、角速度），动作：2 维（左、右）。
""")

md("""## 数学原理

### Q-Learning

$$Q(s, a) \\leftarrow Q(s, a) + \\alpha \\left(r + \\gamma \\max_{a'} Q(s', a') - Q(s, a)\\right)$$

### DQN Loss

$$\\mathcal{L} = \\mathbb{E}_{(s,a,r,s') \\sim \\mathcal{D}} \\left[\\left(r + \\gamma \\max_{a'} Q_{\\theta^-}(s', a') - Q_\\theta(s, a)\\right)^2\\right]$$

- $\\mathcal{D}$: 经验回放缓冲区
- $\\theta$: 在线网络参数
- $\\theta^-$: 目标网络参数（每隔 $C$ 步复制一次）

### ε-greedy 探索

$$a = \\begin{cases} \\text{random}, & \\text{概率 } \\varepsilon \\\\ \\arg\\max_a Q(s, a), & \\text{概率 } 1-\\varepsilon \\end{cases}$$

ε 随时间指数衰减。
""")

code("""\
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from rl.dqn.dqn import DQN, ReplayBuffer, train_episode, epsilon_by_episode
from utils.config import load_config

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Device: {device}")

env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print(f"State: {state_dim}  Action: {action_dim}")
""")

code("""\
model = DQN(state_dim, action_dim, hidden_dim=128).to(device)
target = DQN(state_dim, action_dim, hidden_dim=128).to(device)
target.load_state_dict(model.state_dict())
target.eval()
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
""")

md("""## 训练

> ⏱ 预估耗时：**500 episode × ~0.2s ≈ 2 分钟**（M4 Max）
""")

code("""\
NUM_EPISODES = 500
LR = 0.001
GAMMA = 0.99
BATCH_SIZE = 64
BUFFER_SIZE = 50000
TARGET_UPDATE = 100
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 500

optimizer = optim.Adam(model.parameters(), lr=LR)
buffer = ReplayBuffer(BUFFER_SIZE)
rewards = []

for episode in range(1, NUM_EPISODES + 1):
    state, _ = env.reset()
    episode_reward = 0
    eps = epsilon_by_episode(episode, EPSILON_START, EPSILON_END, EPSILON_DECAY)

    while True:
        if np.random.random() < eps:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q = model(torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0))
                action = q.argmax().item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        _ = train_episode(model, target, optimizer, buffer, BATCH_SIZE, GAMMA)
        if done:
            break

    if episode % TARGET_UPDATE == 0:
        target.load_state_dict(model.state_dict())

    rewards.append(episode_reward)
    if episode % 50 == 0:
        avg = np.mean(rewards[-50:])
        print(f"Episode [{episode:3d}/{NUM_EPISODES}]  Reward: {episode_reward:.0f}  Avg(50): {avg:.1f}  ε: {eps:.3f}")

env.close()
""")

md("""## Reward 曲线""")

code("""\
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
plt.plot(rewards)
plt.xlabel("Episode"); plt.ylabel("Total Reward"); plt.title("DQN Training on CartPole")
plt.grid(True)
plt.axhline(y=195, color='r', linestyle='--', label='Solved (195)')
plt.legend()
plt.show()
""")

md("""\
## 思考题

1. 为什么需要经验回放（Experience Replay）？在线学习会有什么问题？
2. 目标网络（Target Network）解决了什么？如果不固定 target，Q 值会发散吗？
3. ϵ-greedy 中的 ϵ 从 1.0 开始衰减有什么含义？
4. 如果把 `hidden_dim` 从 128 改到 32，训练速度会怎样？收敛难度呢？
""")

nb.cells = cells
with open("rl/dqn/dqn.ipynb", "w") as f:
    nbf.write(nb, f)
print("Generated rl/dqn/dqn.ipynb")