import math
import time
from typing import Tuple

import gradio as gr
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd

# ============================================================
# Robot Learning to Walk with Reinforcement Learning
# Gradio visual simulation / teaching dashboard
# ------------------------------------------------------------
# This is an educational visualization, not a real MuJoCo PPO
# training run. It simulates the visible behavior of RL training:
#   1. Early episodes: robot falls
#   2. Middle episodes: robot struggles and becomes unstable
#   3. Later episodes: robot improves balance
#   4. Final episodes: robot develops a stable walking gait
# ============================================================

MAX_EPISODE = 25_000


# -----------------------------
# Utility functions
# -----------------------------

def clamp(value, low, high):
    return max(low, min(high, value))


def training_progress(episode: int, max_episode: int = MAX_EPISODE) -> float:
    """Return normalized training progress from 0 to 1."""
    return clamp(episode / max_episode, 0, 1)


def stage_from_progress(p: float) -> Tuple[str, str]:
    """Map training progress to a learning stage."""
    if p < 0.08:
        return "Falling", "The policy has not learned balance yet."
    elif p < 0.22:
        return "Struggling", "The robot discovers pushing and limb movement."
    elif p < 0.42:
        return "Unstable", "The robot can stand briefly but tips often."
    elif p < 0.65:
        return "Improving Balance", "The policy begins coordinating feet and torso."
    elif p < 0.85:
        return "Mostly Stable", "The robot walks forward with occasional wobble."
    else:
        return "Natural Walk", "The policy has learned a smooth, efficient gait."


def simulated_reward_curve(max_episode=MAX_EPISODE, points=350, seed=42):
    """Create a reward curve that trends upward with realistic noise."""
    rng = np.random.default_rng(seed)
    x = np.linspace(0, max_episode, points)
    p = x / max_episode

    base = -300 + 1250 / (1 + np.exp(-7.2 * (p - 0.47)))
    noise = rng.normal(0, 85 * (1 - p) + 30, size=points)
    reward = base + noise
    return x, reward


def simulated_stability_curve(max_episode=MAX_EPISODE, points=350, seed=7):
    """Create a stability curve that improves over time."""
    rng = np.random.default_rng(seed)
    x = np.linspace(0, max_episode, points)
    p = x / max_episode
    base = 0.08 + 0.87 / (1 + np.exp(-7.5 * (p - 0.35)))
    noise = rng.normal(0, 0.08 * (1 - p) + 0.015, size=points)
    stability = np.clip(base + noise, 0, 1)
    return x, stability


def current_value_from_curve(x, y, episode):
    idx = np.searchsorted(x, episode)
    idx = int(clamp(idx, 0, len(y) - 1))
    return y[idx]


REWARD_X, REWARD_Y = simulated_reward_curve()
STABILITY_X, STABILITY_Y = simulated_stability_curve()


# -----------------------------
# Drawing functions
# -----------------------------

def draw_floor_grid(ax, x_min=-3, x_max=3, y_min=-0.1, y_max=2.4):
    ax.set_facecolor("#101b2a")
    for x in np.linspace(x_min, x_max, 13):
        ax.plot([x, x], [y_min, y_max], color="#26384f", linewidth=0.6)
    for y in np.linspace(y_min, y_max, 8):
        ax.plot([x_min, x_max], [y, y], color="#26384f", linewidth=0.6)
    ax.axhline(0, color="#93a4b8", linewidth=2)


def limb(ax, x1, y1, x2, y2, color="#e8eef7", lw=7):
    ax.plot([x1, x2], [y1, y2], color="#1f2937", linewidth=lw + 3, solid_capstyle="round", alpha=0.9)
    ax.plot([x1, x2], [y1, y2], color=color, linewidth=lw, solid_capstyle="round")


def joint(ax, x, y, radius=0.045, color="#4ea1ff"):
    circle = patches.Circle((x, y), radius, facecolor=color, edgecolor="#dbeafe", linewidth=1.2, zorder=5)
    ax.add_patch(circle)


def draw_robot(ax, progress, phase=0, x_offset=0):
    """
    Draw a simple humanoid robot whose pose changes based on training progress.
    progress: 0 to 1
    phase: walking phase in radians
    """
    stage, _ = stage_from_progress(progress)

    wobble = (1 - progress) * 0.35 * math.sin(phase * 1.8)
    gait_amp = 0.15 + progress * 0.28
    torso_lean = wobble * 0.8
    height = 0.75 + progress * 0.55

    if stage == "Falling":
        hip = np.array([x_offset - 0.15, 0.18])
        chest = np.array([x_offset + 0.2, 0.33])
        head = np.array([x_offset + 0.42, 0.38])
        left_knee = np.array([x_offset - 0.65, 0.12])
        left_foot = np.array([x_offset - 1.0, 0.03])
        right_knee = np.array([x_offset + 0.0, 0.05])
        right_foot = np.array([x_offset + 0.35, 0.02])
        left_elbow = np.array([x_offset + 0.10, 0.12])
        left_hand = np.array([x_offset - 0.25, 0.02])
        right_elbow = np.array([x_offset + 0.48, 0.18])
        right_hand = np.array([x_offset + 0.72, 0.03])
    elif stage == "Struggling":
        hip = np.array([x_offset - 0.1, 0.45])
        chest = np.array([x_offset + 0.10, 0.75])
        head = np.array([x_offset + 0.20, 0.96])
        left_knee = np.array([x_offset - 0.45, 0.22])
        left_foot = np.array([x_offset - 0.78, 0.03])
        right_knee = np.array([x_offset + 0.25, 0.25])
        right_foot = np.array([x_offset + 0.55, 0.03])
        left_elbow = np.array([x_offset - 0.20, 0.42])
        left_hand = np.array([x_offset - 0.55, 0.03])
        right_elbow = np.array([x_offset + 0.42, 0.48])
        right_hand = np.array([x_offset + 0.70, 0.03])
    else:
        torso_x = x_offset + torso_lean
        hip = np.array([torso_x, height])
        chest = np.array([torso_x + torso_lean * 0.15, height + 0.55])
        head = np.array([torso_x + torso_lean * 0.25, height + 0.83])

        left_leg_phase = math.sin(phase)
        right_leg_phase = math.sin(phase + math.pi)
        left_arm_phase = math.sin(phase + math.pi)
        right_arm_phase = math.sin(phase)

        left_foot = np.array([x_offset - 0.25 + gait_amp * left_leg_phase, 0.03])
        right_foot = np.array([x_offset + 0.25 + gait_amp * right_leg_phase, 0.03])
        left_knee = (hip + left_foot) / 2 + np.array([0.05 * math.cos(phase), 0.18 + 0.07 * progress])
        right_knee = (hip + right_foot) / 2 + np.array([0.05 * math.cos(phase + math.pi), 0.18 + 0.07 * progress])

        left_shoulder = chest + np.array([-0.18, -0.03])
        right_shoulder = chest + np.array([0.18, -0.03])
        left_hand = left_shoulder + np.array([0.18 * left_arm_phase, -0.55])
        right_hand = right_shoulder + np.array([0.18 * right_arm_phase, -0.55])
        left_elbow = (left_shoulder + left_hand) / 2 + np.array([-0.05, -0.02])
        right_elbow = (right_shoulder + right_hand) / 2 + np.array([0.05, -0.02])

    if stage in ["Falling", "Struggling"]:
        left_shoulder = chest + np.array([-0.15, -0.03])
        right_shoulder = chest + np.array([0.15, -0.03])

    limb(ax, hip[0], hip[1], left_knee[0], left_knee[1])
    limb(ax, left_knee[0], left_knee[1], left_foot[0], left_foot[1])
    limb(ax, hip[0], hip[1], right_knee[0], right_knee[1])
    limb(ax, right_knee[0], right_knee[1], right_foot[0], right_foot[1])

    limb(ax, hip[0], hip[1], chest[0], chest[1], color="#f8fafc", lw=9)
    limb(ax, left_shoulder[0], left_shoulder[1], left_elbow[0], left_elbow[1], lw=6)
    limb(ax, left_elbow[0], left_elbow[1], left_hand[0], left_hand[1], lw=6)
    limb(ax, right_shoulder[0], right_shoulder[1], right_elbow[0], right_elbow[1], lw=6)
    limb(ax, right_elbow[0], right_elbow[1], right_hand[0], right_hand[1], lw=6)

    head_circle = patches.Circle((head[0], head[1]), 0.14, facecolor="#e5e7eb", edgecolor="#111827", linewidth=2, zorder=6)
    ax.add_patch(head_circle)
    visor = patches.Ellipse((head[0] + 0.03, head[1] + 0.02), 0.15, 0.06, facecolor="#0f172a", edgecolor="#38bdf8", linewidth=1, zorder=7)
    ax.add_patch(visor)

    for pt in [hip, chest, left_knee, right_knee, left_foot, right_foot, left_elbow, right_elbow, left_hand, right_hand]:
        joint(ax, pt[0], pt[1], radius=0.035)

    ax.scatter([hip[0]], [hip[1] + 0.18], s=90, color="#54e26f", edgecolor="white", zorder=9)
    ax.text(hip[0] + 0.08, hip[1] + 0.18, "COM", color="#54e26f", fontsize=8, va="center")
    ax.plot([x_offset - 0.45, x_offset + 0.45], [0.015, 0.015], color="#54e26f", linewidth=3, alpha=0.8)


def robot_figure(progress, episode, phase):
    fig, ax = plt.subplots(figsize=(9, 5))
    fig.patch.set_facecolor("#07111f")
    draw_floor_grid(ax)
    draw_robot(ax, progress, phase=phase)

    stage, description = stage_from_progress(progress)
    ax.text(-2.85, 2.25, f"LIVE SIMULATION  |  Episode {episode:,}", color="#4ea1ff", fontsize=13, fontweight="bold")
    ax.text(-2.85, 2.08, f"Stage: {stage}", color="#ffffff", fontsize=12, fontweight="bold")
    ax.text(-2.85, 1.93, description, color="#b8c7d9", fontsize=10)

    ax.text(
        1.15,
        2.10,
        "Observation State\n"
        f"Base height: {0.45 + progress * 0.75:.2f} m\n"
        f"Velocity: {progress * 1.2:.2f} m/s\n"
        f"Stability: {progress:.2f}\n"
        f"Wobble: {(1-progress)*100:.1f}%",
        color="#dbeafe",
        fontsize=9,
        bbox=dict(facecolor="#0c1b2d", edgecolor="#24425f", boxstyle="round,pad=0.55", alpha=0.95),
    )

    ax.set_xlim(-3, 3)
    ax.set_ylim(-0.1, 2.4)
    ax.axis("off")
    plt.tight_layout()
    return fig


def progression_figure():
    checkpoints = [
        (0.02, "1", "FALLING", "Episode ~0"),
        (0.14, "2", "STRUGGLING", "Episode ~200"),
        (0.32, "3", "UNSTABLE", "Episode ~1,000"),
        (0.55, "4", "IMPROVING BALANCE", "Episode ~5,000"),
        (0.75, "5", "MOSTLY STABLE", "Episode ~15,000"),
        (0.94, "6", "NATURAL WALK", "Episode ~25,000+"),
    ]

    fig, axes = plt.subplots(1, 6, figsize=(15, 2.7))
    fig.patch.set_facecolor("#07111f")

    for ax, (p, number, label, ep_label) in zip(axes, checkpoints):
        draw_floor_grid(ax, x_min=-1.4, x_max=1.4, y_min=-0.05, y_max=1.8)
        draw_robot(ax, p, phase=1.2)
        ax.set_xlim(-1.35, 1.35)
        ax.set_ylim(-0.05, 1.85)
        ax.axis("off")
        ax.set_title(f"{number}. {label}\n{ep_label}", color="#ffffff", fontsize=9, fontweight="bold")

    plt.tight_layout()
    return fig


def plot_training_curve(x, y, episode, title, ylabel):
    fig, ax = plt.subplots(figsize=(5.2, 3.2))
    fig.patch.set_facecolor("#07111f")
    ax.set_facecolor("#0c1b2d")
    ax.plot(x, y, linewidth=1.5, color="#54e26f" if "Reward" in title else "#4ea1ff")
    current = current_value_from_curve(x, y, episode)
    ax.scatter([episode], [current], s=80, zorder=4, color="#ffffff", edgecolor="#f59e0b")
    ax.set_title(title, color="white", fontsize=12, fontweight="bold")
    ax.set_xlabel("Episode", color="#dbeafe")
    ax.set_ylabel(ylabel, color="#dbeafe")
    ax.grid(True, alpha=0.25)
    ax.tick_params(colors="#dbeafe")
    for spine in ax.spines.values():
        spine.set_color("#24425f")
    plt.tight_layout()
    return fig


# -----------------------------
# Simulation panels
# -----------------------------

def reward_function_dataframe(progress):
    forward_velocity = progress * 1.0
    upright_bonus = progress * 0.5
    smooth_gait = progress * 0.2
    torque_cost = (1 - progress) * 0.18
    orientation_penalty = (1 - progress) * 0.35
    joint_penalty = (1 - progress) * 0.15
    action_penalty = (1 - progress) * 0.08

    total = forward_velocity + upright_bonus + smooth_gait - torque_cost - orientation_penalty - joint_penalty - action_penalty
    normalized = clamp((total + 0.7) / 2.4, 0, 1) * 1000

    rows = [
        ["+ Forward velocity", 1.0, "Encourage moving forward", forward_velocity],
        ["+ Upright bonus", 0.5, "Encourage staying upright", upright_bonus],
        ["+ Smooth gait", 0.2, "Encourage coordinated walking", smooth_gait],
        ["− Torque cost", 0.001, "Discourage wasted energy", -torque_cost],
        ["− Orientation penalty", 0.5, "Penalize tipping over", -orientation_penalty],
        ["− Joint limit penalty", 1.0, "Avoid unnatural joint limits", -joint_penalty],
        ["− Action rate penalty", 0.001, "Encourage smooth actions", -action_penalty],
        ["TOTAL normalized reward", "", "Scaled to 0–1000", normalized],
    ]

    return pd.DataFrame(rows, columns=["Reward Component", "Weight", "Purpose", "Current Contribution"])


def metric_html(episode, progress, reward, stability, algorithm, learning_rate, gamma):
    stage, stage_desc = stage_from_progress(progress)
    return f"""
    <div class="top-grid">
        <div class="metric-card">
            <div class="small-label">Status</div>
            <div class="green-text">● TRAINING</div>
            <div class="small-label">Environment: Humanoid-v1</div>
            <div class="small-label">Simulator: Teaching Demo</div>
        </div>
        <div class="metric-card">
            <div class="small-label">Episode</div>
            <div class="blue-text big-number">{episode:,}</div>
            <div class="small-label">of {MAX_EPISODE:,}</div>
        </div>
        <div class="metric-card">
            <div class="small-label">Learning Stage</div>
            <div class="orange-text stage-title">{stage}</div>
            <div class="small-label">{stage_desc}</div>
        </div>
        <div class="metric-card">
            <div class="small-label">Average Reward</div>
            <div class="green-text big-number">{reward:,.1f}</div>
            <div class="small-label">Higher is better</div>
        </div>
        <div class="metric-card">
            <div class="small-label">Stability</div>
            <div class="blue-text big-number">{stability:.2f}</div>
            <div class="small-label">Target: 0.90+</div>
        </div>
    </div>
    <div class="status-box">
        <b>Training Status:</b>
        Algorithm = {algorithm} | Policy Network = MLP, 3x256 | Learning Rate = {learning_rate} |
        Discount Factor γ = {gamma} | Total Timesteps ≈ {episode * 8192:,}
    </div>
    """


def rl_loop_html():
    return """
    <div class="rl-loop">
        <div class="loop-box"><b>STATE</b><br><span>observe robot + world</span><br><code>s_t</code></div>
        <div class="arrow">→</div>
        <div class="loop-box"><b>ACTION</b><br><span>policy chooses motors</span><br><code>a_t</code></div>
        <div class="arrow">→</div>
        <div class="loop-box reward"><b>REWARD</b><br><span>score movement</span><br><code>r_t</code></div>
        <div class="arrow">→</div>
        <div class="loop-box update"><b>POLICY UPDATE</b><br><span>improve network</span><br><code>θ_{t+1}</code></div>
    </div>
    <div class="status-box">
    The robot is not directly programmed with a walking motion. It learns by trying actions, receiving rewards or penalties,
    and updating its policy across many simulated episodes.
    </div>
    """


def pseudocode_text():
    return """
initialize policy_network θ

for episode in range(total_episodes):
    state = environment.reset()
    done = False

    while not done:
        action = policy_network.select_action(state)
        next_state, reward, done, info = environment.step(action)

        store_transition(state, action, reward, next_state, done)
        state = next_state

    update policy_network θ using PPO/SAC/A2C

# Over many episodes:
# falling actions receive poor reward
# balanced walking actions receive higher reward
# the policy slowly learns a stable gait
"""


def update_dashboard(episode, animation_frame, algorithm, learning_rate, gamma):
    episode = int(episode)
    progress = training_progress(episode)
    phase = animation_frame / 100 * 2 * math.pi
    current_reward = current_value_from_curve(REWARD_X, REWARD_Y, episode)
    current_stability = current_value_from_curve(STABILITY_X, STABILITY_Y, episode)

    metrics = metric_html(episode, progress, current_reward, current_stability, algorithm, learning_rate, gamma)
    live = robot_figure(progress, episode, phase)
    reward_plot = plot_training_curve(REWARD_X, REWARD_Y, episode, "Reward per Episode", "Average reward")
    stability_plot = plot_training_curve(STABILITY_X, STABILITY_Y, episode, "Stability / Error Over Time", "Stability")
    reward_df = reward_function_dataframe(progress)

    return metrics, live, reward_plot, stability_plot, reward_df


def jump_to_stage(stage_name):
    stage_map = {
        "Falling": 0,
        "Struggling": 2_000,
        "Unstable": 6_000,
        "Improving Balance": 12_500,
        "Mostly Stable": 18_750,
        "Natural Walk": 24_000,
    }
    return stage_map.get(stage_name, 15_680)


def live_training_step(episode, animation_frame, speed, algorithm, learning_rate, gamma):
    """Advance the training simulation one live timer step."""
    next_episode = int(episode + speed)
    if next_episode > MAX_EPISODE:
        next_episode = 0

    next_animation_frame = int((animation_frame + 7) % 101)

    metrics, live, reward_plot, stability_plot, reward_df = update_dashboard(
        next_episode,
        next_animation_frame,
        algorithm,
        learning_rate,
        gamma,
    )

    return (
        next_episode,
        next_animation_frame,
        metrics,
        live,
        reward_plot,
        stability_plot,
        reward_df,
    )


def reset_training(algorithm, learning_rate, gamma):
    """Reset the live simulation to episode zero."""
    episode = 0
    animation_frame = 0
    metrics, live, reward_plot, stability_plot, reward_df = update_dashboard(
        episode,
        animation_frame,
        algorithm,
        learning_rate,
        gamma,
    )
    return episode, animation_frame, metrics, live, reward_plot, stability_plot, reward_df


# -----------------------------
# Gradio app
# -----------------------------

CUSTOM_CSS = """
body, .gradio-container {
    background: #07111f !important;
    color: #e5e7eb !important;
}
.metric-card {
    background: #0c1b2d;
    border: 1px solid #24425f;
    border-radius: 14px;
    padding: 14px;
    box-shadow: 0 0 12px rgba(40,120,255,.15);
    min-height: 110px;
}
.top-grid {
    display: grid;
    grid-template-columns: repeat(5, minmax(140px, 1fr));
    gap: 12px;
}
.small-label {
    color: #b8c7d9;
    font-size: 0.88rem;
}
.green-text {
    color: #54e26f;
    font-weight: 800;
}
.blue-text {
    color: #4ea1ff;
    font-weight: 800;
}
.orange-text {
    color: #ff9f1c;
    font-weight: 800;
}
.big-number {
    font-size: 1.55rem;
}
.stage-title {
    font-size: 1.15rem;
}
.status-box {
    background: #0c1b2d;
    border: 1px solid #24425f;
    border-radius: 12px;
    padding: 12px;
    margin-top: 12px;
    color: #dbeafe;
}
.rl-loop {
    display: flex;
    align-items: center;
    justify-content: space-between;
    gap: 8px;
    flex-wrap: wrap;
    background: #0c1b2d;
    border: 1px solid #24425f;
    border-radius: 14px;
    padding: 16px;
}
.loop-box {
    border: 1px solid #4ea1ff;
    border-radius: 12px;
    padding: 12px;
    min-width: 150px;
    text-align: center;
    color: #dbeafe;
}
.loop-box b {
    color: #4ea1ff;
}
.loop-box.reward {
    border-color: #54e26f;
}
.loop-box.reward b {
    color: #54e26f;
}
.loop-box.update {
    border-color: #a855f7;
}
.loop-box.update b {
    color: #c084fc;
}
.arrow {
    font-size: 2rem;
    color: #dbeafe;
}
code {
    color: #ffffff;
    background: #111827;
    padding: 2px 5px;
    border-radius: 5px;
}
.gr-button {
    border-radius: 10px !important;
}
"""

with gr.Blocks(css=CUSTOM_CSS, title="RL Robot Walking Simulation") as demo:
    live_timer = gr.Timer(value=0.5, active=False)

    gr.Markdown(
        """
        # 🤖 Robot Learning to Walk with Reinforcement Learning
        Training a humanoid robot in repeated simulation episodes until the policy discovers balance, coordination, and forward walking.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            episode_slider = gr.Slider(
                minimum=0,
                maximum=MAX_EPISODE,
                value=15_680,
                step=100,
                label="Training Episode",
            )
            animation_slider = gr.Slider(
                minimum=0,
                maximum=100,
                value=35,
                step=1,
                label="Walking Animation Frame",
            )
            algorithm_dropdown = gr.Dropdown(
                choices=["PPO", "SAC", "A2C", "DQN-style discrete demo"],
                value="PPO",
                label="Algorithm",
            )
            learning_rate_dropdown = gr.Dropdown(
                choices=["3e-4", "1e-4", "1e-3"],
                value="3e-4",
                label="Learning Rate",
            )
            gamma_dropdown = gr.Dropdown(
                choices=["0.99", "0.98", "0.95"],
                value="0.99",
                label="Discount Factor γ",
            )

            stage_dropdown = gr.Dropdown(
                choices=["Falling", "Struggling", "Unstable", "Improving Balance", "Mostly Stable", "Natural Walk"],
                value="Mostly Stable",
                label="Jump to Learning Stage",
            )
            jump_button = gr.Button("Jump to Stage")

            gr.Markdown("### Live Reinforcement Mode")
            speed_slider = gr.Slider(
                minimum=100,
                maximum=1500,
                value=500,
                step=100,
                label="Episodes Added per Tick",
            )
            with gr.Row():
                start_button = gr.Button("▶ Start Live Training", variant="primary")
                stop_button = gr.Button("⏸ Stop")
            reset_button = gr.Button("↺ Reset Training")

        with gr.Column(scale=4):
            metrics_html = gr.HTML()
            progression_plot = gr.Plot(value=progression_figure(), label="Learning Progression")

    with gr.Row():
        with gr.Column(scale=2):
            live_plot = gr.Plot(label="Live Simulation")
            gr.HTML(value=rl_loop_html())
        with gr.Column(scale=1):
            reward_plot = gr.Plot(label="Reward per Episode")
            stability_plot = gr.Plot(label="Stability Over Time")

    with gr.Row():
        with gr.Column(scale=1):
            reward_table = gr.Dataframe(label="Reward Function: Weighted Sum", interactive=False)
        with gr.Column(scale=1):
            gr.Code(value=pseudocode_text(), language="python", label="RL Pseudocode")
            gr.Markdown(
                """
                **Teaching note:** This app visualizes the logic of robot locomotion training.  
                For a real physics-based version, connect the same dashboard idea to **Gymnasium + MuJoCo + Stable-Baselines3 PPO**.
                """
            )

    inputs = [episode_slider, animation_slider, algorithm_dropdown, learning_rate_dropdown, gamma_dropdown]
    outputs = [metrics_html, live_plot, reward_plot, stability_plot, reward_table]
    live_outputs = [episode_slider, animation_slider, metrics_html, live_plot, reward_plot, stability_plot, reward_table]

    for component in inputs:
        component.change(fn=update_dashboard, inputs=inputs, outputs=outputs)

    demo.load(fn=update_dashboard, inputs=inputs, outputs=outputs)

    jump_button.click(fn=jump_to_stage, inputs=stage_dropdown, outputs=episode_slider).then(
        fn=update_dashboard,
        inputs=inputs,
        outputs=outputs,
    )

    start_button.click(lambda: gr.Timer(active=True), inputs=None, outputs=live_timer)
    stop_button.click(lambda: gr.Timer(active=False), inputs=None, outputs=live_timer)

    reset_button.click(
        fn=reset_training,
        inputs=[algorithm_dropdown, learning_rate_dropdown, gamma_dropdown],
        outputs=live_outputs,
    )

    live_timer.tick(
        fn=live_training_step,
        inputs=[episode_slider, animation_slider, speed_slider, algorithm_dropdown, learning_rate_dropdown, gamma_dropdown],
        outputs=live_outputs,
    )


if __name__ == "__main__":
    demo.launch()