import os
import sys
import time

import numpy as np
import torch

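# Make the `ai` package importable: the current working directory is added to the
# module search path, so this script is expected to be launched from the project root.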
sys.path.append(os.getcwd())

import torch.nn.functional as F
import torch.optim as optim

from ai.environments.rust_env_lite import RustEnvLite
from ai.models.training_config import INPUT_SIZE, POLICY_SIZE
from ai.training.train import AlphaNet


def _sync_device(device):
    """Wait for queued CUDA work on *device* to finish; do nothing on CPU."""
    if device.type == "cuda":
        torch.cuda.synchronize(device)


def _samples_per_second(samples, seconds):
    """Return ``samples / seconds``, or 0.0 when no measurable time elapsed."""
    return samples / seconds if seconds > 0 else 0.0


def benchmark():
    """Measure end-to-end throughput of AlphaNet driving the lite Rust env.

    Each step copies the latest observations into a reusable tensor, runs a
    forward pass, samples one action per environment from the policy logits,
    and steps the environment. Every fifth step a dummy loss (mean of the
    logits plus mean of the value output) is backpropagated and an Adam step
    is taken, purely to add backward-pass load.

    Configuration is read from the environment:
        BENCH_ENVS:  number of parallel environments (default 256).
        BENCH_STEPS: number of loop iterations (default 200).

    Progress and the final samples-per-second figure are printed to stdout.
    Nothing is returned.

    Raises:
        ValueError: if BENCH_ENVS or BENCH_STEPS is not an integer string.
    """
    print("========================================================")
    print(" LovecaSim AlphaZero Benchmark (Lite Rust Env)          ")
    print("========================================================")

    # Configuration
    NUM_ENVS = int(os.getenv("BENCH_ENVS", "256"))
    TOTAL_STEPS = int(os.getenv("BENCH_STEPS", "200"))
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f" [Bench] Device:  {DEVICE}")
    print(f" [Bench] Envs:    {NUM_ENVS}")
    print(f" [Bench] Steps:   {TOTAL_STEPS}")
    print(f" [Bench] Obs Dim: {INPUT_SIZE}")

    # 1. Initialize Simplified Environment
    print(" [Bench] Initializing Rust Engine (Lite)...")
    env = RustEnvLite(num_envs=NUM_ENVS)
    obs = env.reset()

    # 2. Initialize Model
    print(" [Bench] Initializing AlphaNet...")
    model = AlphaNet(policy_size=POLICY_SIZE).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Reusable observation buffer, created directly on the target device.
    # requires_grad is enabled so backward also computes input gradients
    # (extra load for stress testing).
    obs_tensor = torch.zeros(
        (NUM_ENVS, INPUT_SIZE),
        dtype=torch.float32,
        device=DEVICE,
        requires_grad=True,
    )

    # 3. Benchmark Loop
    print(" [Bench] Starting Training Loop...")
    # Drain pending setup work (e.g. the model transfer) so it is not billed
    # to the timed region; CUDA calls are asynchronous with respect to the host.
    _sync_device(DEVICE)
    # perf_counter is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    total_samples = 0

    for step in range(1, TOTAL_STEPS + 1):
        # A. Copy observations into the device buffer. Done under no_grad
        # because obs_tensor is a leaf that requires grad and is written in place.
        # NOTE(review): assumes env returns a NumPy array - confirm.
        with torch.no_grad():
            obs_tensor.copy_(torch.from_numpy(obs))

        # B. Inference (graph is kept so step E can backpropagate through it)
        policy_logits, value = model(obs_tensor)

        # C. Action Selection (sample one action per env from the logits).
        # Sampling does not need gradients. The int32 cast is presumably the
        # dtype the Rust engine expects - TODO confirm.
        with torch.no_grad():
            probs = F.softmax(policy_logits, dim=1)
            actions = torch.multinomial(probs, 1).cpu().numpy().flatten().astype(np.int32)

        # D. Environment Step (only the next observations are used here)
        obs, _rewards, _dones, _done_indices = env.step(actions)

        # E. Dummy Training Step (Simulate backward pass stress)
        if step % 5 == 0:
            optimizer.zero_grad()
            # Dummy target for benchmarking
            p_loss = policy_logits.mean()
            v_loss = value.mean()
            loss = p_loss + v_loss
            loss.backward()
            optimizer.step()

        total_samples += NUM_ENVS

        if step % 50 == 0 or step == TOTAL_STEPS:
            # Wait for the backward/optimizer kernels before reading the clock.
            _sync_device(DEVICE)
            elapsed = time.perf_counter() - start_time
            sps = _samples_per_second(total_samples, elapsed)
            print(f" [Bench] Step {step}/{TOTAL_STEPS} | SPS: {sps:.0f}")

    # Make sure all queued GPU work is included in the final measurement.
    _sync_device(DEVICE)
    duration = time.perf_counter() - start_time
    # Same zero-guard as the in-loop figure (e.g. BENCH_STEPS=0 on a coarse clock).
    final_sps = _samples_per_second(total_samples, duration)

    print("\n========================================================")
    print(" [Result] Benchmark Completed!")
    print(f" [Result] Total Time:    {duration:.2f}s")
    print(f" [Result] Total Samples: {total_samples}")
    print(f" [Result] Final SPS:     {final_sps:.2f}")
    print("========================================================")


# Run the benchmark only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    benchmark()