"""
CUDA Kernels for GPU-Accelerated VectorEnv.

This module contains CUDA kernel implementations for:
- Environment reset
- Game step (integrated with opponent, phases, scoring)
- Observation encoding
- Action mask computation

All kernels are designed for the VectorGameStateGPU class.
"""

import numpy as np

try:
    from numba import cuda
    from numba.cuda.random import xoroshiro128p_normal_float32, xoroshiro128p_uniform_float32

    HAS_CUDA = True
except ImportError:
    HAS_CUDA = False

    # Numba is not installed: provide inert stand-ins so this module can still
    # be imported (type checking, tooling). Nothing runs on a GPU in this mode.
    class MockCuda:
        """Minimal stand-in for ``numba.cuda`` exposing ``jit`` and ``grid``."""

        def jit(self, *args, **kwargs):
            """No-op replacement for ``cuda.jit``.

            Both decorator spellings used in this file are supported:
            bare ``@cuda.jit`` and parameterised ``@cuda.jit(device=True)``.
            In either case the decorated function is returned unchanged.
            """
            # Bare form: Python passes the decorated function as the only
            # positional argument, so hand it straight back instead of
            # returning the inner decorator in its place.
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]

            def decorator(f):
                return f

            return decorator

        def grid(self, x):
            """Return a fixed thread index of 0."""
            return 0

    cuda = MockCuda()

    def xoroshiro128p_uniform_float32(rng, i):
        """Deterministic placeholder for the uniform RNG; always 0.5."""
        return 0.5

    def xoroshiro128p_normal_float32(rng, i):
        """Deterministic placeholder for the normal RNG; always 0.0.

        Defined so both names imported in the ``try`` branch exist in
        fallback mode as well.
        """
        return 0.0


# ============================================================================
# CONSTANTS (Must match fast_logic.py)
# ============================================================================
# Slot indices into a per-environment global-context vector (``*_global_ctx``).
SC = 0  # own score; step_kernel copies batch_scores into this slot every step
OS = 1  # compared against SC by the C_LLD / C_CMP conditions; presumably the opponent's score - confirm against fast_logic.py
TR = 2  # trash count (updated by move_to_trash_device / check_deck_refresh_device)
HD = 3  # hand count
DI = 4  # in the agent context, step_kernel stores the opponent's HD value here
EN = 5  # energy count
DK = 6  # deck count
OT = 7  # in the agent context, step_kernel stores the opponent's TR value here; read by C_OPH
PH = 8  # phase; step_player_device uses -1, 0, 4 (Main) and 8 (Performance)
OD = 9  # in the agent context, step_kernel stores the opponent's DK value here

# Opcodes
# Effect opcodes: column 0 of a bytecode row, dispatched in
# resolve_bytecode_device. O_CHOOSE and O_ADD_H have no handler in that
# function, so they currently execute as no-ops there.
O_DRAW = 10
O_BLADES = 11
O_HEARTS = 12
O_RECOV_L = 13
O_BOOST = 14
O_RECOV_M = 15
O_BUFF = 16
O_CHARGE = 17
O_TAP_O = 18
O_CHOOSE = 19
O_ADD_H = 20
O_RETURN = 999  # stops bytecode execution
O_JUMP = 100  # unconditional relative jump by the row's value column
O_JUMP_F = 101  # relative jump taken only when the last condition was false

# Conditions
# Any opcode >= 200 (other than O_RETURN, which is checked first) is treated
# as a condition. Codes without an explicit branch in resolve_bytecode_device
# (C_CLR, C_CTR, C_GRP) evaluate to True there.
C_TR1 = 200
C_CLR = 202
C_STG = 203
C_HND = 204
C_CTR = 206
C_LLD = 207
C_GRP = 208
C_OPH = 210
C_ENR = 213
C_CMP = 220

# Unique ID (UID) System
# The low 20 bits of a UID hold the base card-definition ID.
BASE_ID_MASK = 0xFFFFF


@cuda.jit(device=True)
def get_base_id_device(uid: int) -> int:
    """Return the card-definition ID carried in the low bits of ``uid``.

    The result is ``uid`` masked with ``BASE_ID_MASK``; all higher bits are
    discarded.
    """
    base_id = uid & BASE_ID_MASK
    return base_id


# ============================================================================
# DEVICE FUNCTIONS (Callable from kernels)
# ============================================================================


@cuda.jit(device=True)
def check_deck_refresh_device(p_deck, p_trash, p_global_ctx, DK_idx, TR_idx):
    """Refill an exhausted deck from the trash pile.

    Does nothing while the deck counter ``p_global_ctx[DK_idx]`` is positive.
    Otherwise every non-empty trash slot (value > 0) is copied, in slot order,
    into the front of ``p_deck`` and cleared. When at least one card was moved
    the deck counter is set to the number moved and the trash counter
    ``p_global_ctx[TR_idx]`` is reset to 0. Card order is kept; no
    randomisation happens here.
    """
    if p_global_ctx[DK_idx] > 0:
        return

    moved = 0
    for slot in range(60):
        card = p_trash[slot]
        if card > 0:
            # Pack recovered cards contiguously from deck slot 0.
            p_deck[moved] = card
            p_trash[slot] = 0
            moved += 1

    # Leave the counters alone when the trash was empty too.
    if moved > 0:
        p_global_ctx[DK_idx] = moved
        p_global_ctx[TR_idx] = 0


@cuda.jit(device=True)
def move_to_trash_device(card_id, p_trash, p_global_ctx, TR_idx):
    """Store ``card_id`` in the first free trash slot.

    A slot is free when it holds 0. On success the trash counter
    ``p_global_ctx[TR_idx]`` is incremented. If none of the 60 slots is free
    the card is not stored and the counter is left unchanged.
    """
    slot = 0
    while slot < 60:
        if p_trash[slot] == 0:
            p_trash[slot] = card_id
            p_global_ctx[TR_idx] += 1
            return
        slot += 1


@cuda.jit(device=True)
def draw_cards_device(count, p_hand, p_deck, p_trash, p_global_ctx):
    """Move up to ``count`` cards from the deck into the hand.

    Before each draw the deck is refilled from the trash if its counter has
    run out. Drawing stops early once the deck counter is still not positive
    after that refill. Each draw takes the first non-empty deck slot
    (value > 0) and puts it in the first empty hand slot (value 0), updating
    the ``DK`` and ``HD`` counters. If the hand has no empty slot the card
    stays in the deck.
    """
    for _ in range(count):
        check_deck_refresh_device(p_deck, p_trash, p_global_ctx, DK, TR)

        if p_global_ctx[DK] <= 0:
            return

        # Locate the first occupied deck slot.
        deck_pos = -1
        for d in range(60):
            if p_deck[d] > 0:
                deck_pos = d
                break

        if deck_pos < 0:
            # Counter says cards remain but no slot holds one; try again.
            continue

        drawn_card = p_deck[deck_pos]
        for h in range(60):
            if p_hand[h] == 0:
                p_hand[h] = drawn_card
                p_deck[deck_pos] = 0
                p_global_ctx[DK] -= 1
                p_global_ctx[HD] += 1
                break


@cuda.jit(device=True)
def resolve_bytecode_device(
    bytecode,
    flat_ctx,
    global_ctx,
    player_id,
    p_hand,
    p_deck,
    p_stage,
    p_energy_vec,
    p_energy_count,
    p_cont_vec,
    p_cont_ptr,
    p_tapped,
    p_live,
    opp_tapped,
    p_trash,
    bytecode_map,
    bytecode_index,
):
    """
    GPU Device function for resolving bytecode.

    Interprets ``bytecode`` row by row. Each row is read as four columns
    ``(op, v, a, s)``: opcode, value, and two further operands. Execution
    starts at row 0 and stops at ``O_RETURN``, when the instruction pointer
    leaves the program, when a jump target is out of range, or after 500
    executed rows.

    A single boolean ``cond`` (initially True) is overwritten by every
    condition opcode (``op >= 200``) and is never reset in between; effect
    opcodes only run while it is True, and ``O_JUMP_F`` jumps while it is
    False.

    Mutates ``global_ctx``, ``p_hand``, ``p_deck``, ``p_trash``,
    ``p_cont_vec``, ``p_tapped``, ``p_live`` and ``opp_tapped`` in place,
    depending on the opcodes executed. ``flat_ctx``, ``player_id``,
    ``p_energy_vec``, ``p_energy_count``, ``bytecode_map`` and
    ``bytecode_index`` are accepted but not read by this function.

    Returns (new_cont_ptr, status, bonus). ``new_cont_ptr`` is ``p_cont_ptr``
    advanced by the number of continuous-effect rows written, ``status`` is
    always 0, and ``bonus`` is the sum of the values of executed
    ``O_HEARTS`` / ``O_BOOST`` rows.
    """
    ip = 0
    cptr = p_cont_ptr
    bonus = 0
    cond = True
    blen = bytecode.shape[0]
    # Hard cap on executed rows; also bounds programs that jump in a cycle.
    safety_counter = 0

    while ip < blen and safety_counter < 500:
        safety_counter += 1
        op = bytecode[ip, 0]
        v = bytecode[ip, 1]
        a = bytecode[ip, 2]
        s = bytecode[ip, 3]

        # Opcode 0 is skipped without any effect.
        if op == 0:
            ip += 1
            continue
        # Checked before the ``op >= 200`` test below because O_RETURN is 999.
        if op == O_RETURN:
            break

        # Jumps
        # Targets are relative to the current row; an out-of-range target
        # ends execution instead of being clamped.
        if op == O_JUMP:
            new_ip = ip + v
            if 0 <= new_ip < blen:
                ip = new_ip
            else:
                break
            continue

        if op == O_JUMP_F:
            if not cond:
                new_ip = ip + v
                if 0 <= new_ip < blen:
                    ip = new_ip
                else:
                    break
                continue
            # Condition held: fall through to the next row.
            ip += 1
            continue

        # Conditions (op >= 200)
        if op >= 200:
            if op == C_TR1:
                cond = global_ctx[TR] == 1
            elif op == C_STG:
                # Count occupied stage slots (-1 marks an empty slot).
                ct = 0
                for j in range(3):
                    if p_stage[j] != -1:
                        ct += 1
                cond = ct >= v
            elif op == C_HND:
                cond = global_ctx[HD] >= v
            elif op == C_LLD:
                cond = global_ctx[SC] > global_ctx[OS]
            elif op == C_ENR:
                cond = global_ctx[EN] >= v
            elif op == C_CMP:
                # Positive v: threshold on own score; otherwise compare SC with OS.
                if v > 0:
                    cond = global_ctx[SC] >= v
                else:
                    cond = global_ctx[SC] > global_ctx[OS]
            elif op == C_OPH:
                cond = global_ctx[OT] >= v if v > 0 else global_ctx[OT] > 0
            else:
                # Condition codes without a handler here always pass.
                cond = True
            ip += 1
        else:
            # Effects
            # Skipped (but still stepped over) while the last condition is false.
            # Effect opcodes without a branch below do nothing.
            if cond:
                if op == O_DRAW:
                    draw_cards_device(v, p_hand, p_deck, p_trash, global_ctx)
                elif op == O_CHARGE:
                    # Move cards from deck to energy (simplified)
                    # The removed card's ID is discarded; only the DK and EN
                    # counters change.
                    amt = min(v, global_ctx[DK])
                    for _ in range(amt):
                        for d in range(60):
                            if p_deck[d] > 0:
                                p_deck[d] = 0
                                global_ctx[DK] -= 1
                                global_ctx[EN] += 1
                                break
                elif op == O_HEARTS:
                    # Add hearts (points)
                    bonus += v
                    # Register continuous effect
                    # Dropped silently once 32 rows are in use.
                    if cptr < 32:
                        p_cont_vec[cptr, 0] = 2
                        p_cont_vec[cptr, 1] = v
                        p_cont_vec[cptr, 5] = a
                        p_cont_vec[cptr, 9] = 1
                        cptr += 1
                elif op == O_BLADES:
                    if cptr < 32:
                        p_cont_vec[cptr, 0] = 1
                        p_cont_vec[cptr, 1] = v
                        p_cont_vec[cptr, 2] = 4
                        p_cont_vec[cptr, 3] = s
                        p_cont_vec[cptr, 9] = 1
                        cptr += 1
                elif op == O_RECOV_M:
                    # Untap stage slot ``s``; out-of-range slots are ignored.
                    if 0 <= s < 3:
                        p_tapped[s] = 0
                elif op == O_RECOV_L:
                    # Zero live-zone slot ``s``; out-of-range slots are ignored.
                    if 0 <= s < p_live.shape[0]:
                        p_live[s] = 0
                elif op == O_TAP_O:
                    # Tap the opponent's stage slot ``s``.
                    if 0 <= s < 3:
                        opp_tapped[s] = 1
                elif op == O_BUFF:
                    if cptr < 32:
                        p_cont_vec[cptr, 0] = 8
                        p_cont_vec[cptr, 1] = v
                        p_cont_vec[cptr, 2] = s
                        p_cont_vec[cptr, 9] = 1
                        cptr += 1
                elif op == O_BOOST:
                    bonus += v
            ip += 1

    return cptr, 0, bonus


@cuda.jit(device=True)
def step_player_device(
    act_id,
    player_id,
    rng_state,
    i,
    p_hand,
    p_deck,
    p_stage,
    p_energy_vec,
    p_energy_count,
    p_tapped,
    p_live,
    p_scores,
    p_global_ctx,
    p_trash,
    p_continuous_vec,
    p_continuous_ptr,
    opp_tapped,
    card_stats,
    bytecode_map,
    bytecode_index,
):
    """
    Device function for single player step.

    Action encoding handled here:
      * ``0``       - pass: advances ``p_global_ctx[PH]`` (-1 -> 0 -> 4 -> 8).
      * ``1-180``   - play hand slot ``(act_id - 1) // 3`` onto stage slot
                      ``(act_id - 1) % 3``.
      * ``200-202`` - activate the untapped member on stage slot
                      ``act_id - 200`` and tap it.
      * ``400-459`` - move hand slot ``act_id - 400`` to the first free
                      live-zone slot.
    Any other action ID leaves the state untouched.

    All ``p_*`` arrays are this player's per-environment views and are mutated
    in place; ``p_continuous_ptr`` is a length-1 view holding the next free
    row of ``p_continuous_vec``. ``rng_state``, ``i`` and ``p_scores`` are
    accepted but not read here.

    This function does not verify that an action is legal (card type, enough
    energy); presumably callers rely on the action mask for that - confirm.
    Empty hand slots (value 0) are never played.

    Returns bonus score from this action (sum of the bonuses reported by any
    bytecode that was resolved, otherwise 0).
    """
    bonus = 0

    if act_id == 0:
        # Pass -> Next Phase
        ph = p_global_ctx[PH]
        if ph == -1:
            p_global_ctx[PH] = 0
        elif ph == 0:
            p_global_ctx[PH] = 4  # Skip to Main
        elif ph == 4:
            p_global_ctx[PH] = 8  # Performance
        return 0

    # Member Play (1-180)
    if 1 <= act_id <= 180:
        adj = act_id - 1
        hand_idx = adj // 3
        slot = adj % 3

        if hand_idx < 60:
            card_id = p_hand[hand_idx]
            # 0 marks an empty hand slot (see draw_cards_device), so only
            # strictly positive IDs are real cards. Using ``>= 0`` here would
            # put a phantom card 0 on stage and corrupt the hand counter.
            if card_id > 0:
                bid = get_base_id_device(card_id)

                # Defined up front so the payment step below never reads an
                # unassigned value when the base ID has no card_stats row.
                effective_cost = 0
                if bid < card_stats.shape[0]:
                    # Cost calculation
                    cost = card_stats[bid, 0]
                    effective_cost = cost
                    prev_cid = p_stage[slot]
                    if prev_cid >= 0:
                        # Replacing a member: only the cost difference is paid.
                        prev_bid = get_base_id_device(prev_cid)
                        if prev_bid < card_stats.shape[0]:
                            prev_cost = card_stats[prev_bid, 0]
                            effective_cost = max(0, cost - prev_cost)

                # Pay cost by tapping energy
                # Energy occupies tapped-flags 3..15; only the first
                # min(EN, 12) of them are owned by the player.
                ec = min(p_global_ctx[EN], 12)
                paid = 0
                if effective_cost > 0:
                    for e_idx in range(ec):
                        if 3 + e_idx < 16:
                            if p_tapped[3 + e_idx] == 0:
                                p_tapped[3 + e_idx] = 1
                                paid += 1
                                if paid >= effective_cost:
                                    break

                # Move to stage
                p_stage[slot] = card_id
                p_hand[hand_idx] = 0
                p_global_ctx[HD] -= 1
                p_global_ctx[51 + slot] = 1  # Mark played

                # Resolve auto-ability
                if bid < bytecode_index.shape[0]:
                    map_idx = bytecode_index[bid, 0]
                    if map_idx >= 0:
                        # Scratch context for the interpreter, zeroed per call.
                        flat_ctx = cuda.local.array(64, dtype=np.int32)
                        for j in range(64):
                            flat_ctx[j] = 0

                        new_ptr, _, ab_bonus = resolve_bytecode_device(
                            bytecode_map[map_idx],
                            flat_ctx,
                            p_global_ctx,
                            player_id,
                            p_hand,
                            p_deck,
                            p_stage,
                            p_energy_vec,
                            p_energy_count,
                            p_continuous_vec,
                            p_continuous_ptr[0],
                            p_tapped,
                            p_live,
                            opp_tapped,
                            p_trash,
                            bytecode_map,
                            bytecode_index,
                        )
                        p_continuous_ptr[0] = new_ptr
                        bonus += ab_bonus

    # Activate Ability (200-202)
    elif 200 <= act_id <= 202:
        slot = act_id - 200
        card_id = p_stage[slot]
        # -1 marks an empty stage slot; a tapped member cannot activate.
        if card_id >= 0 and p_tapped[slot] == 0:
            bid = get_base_id_device(card_id)
            if bid < bytecode_index.shape[0]:
                map_idx = bytecode_index[bid, 0]
                if map_idx >= 0:
                    flat_ctx = cuda.local.array(64, dtype=np.int32)
                    for j in range(64):
                        flat_ctx[j] = 0

                    new_ptr, _, ab_bonus = resolve_bytecode_device(
                        bytecode_map[map_idx],
                        flat_ctx,
                        p_global_ctx,
                        player_id,
                        p_hand,
                        p_deck,
                        p_stage,
                        p_energy_vec,
                        p_energy_count,
                        p_continuous_vec,
                        p_continuous_ptr[0],
                        p_tapped,
                        p_live,
                        opp_tapped,
                        p_trash,
                        bytecode_map,
                        bytecode_index,
                    )
                    p_continuous_ptr[0] = new_ptr
                    bonus += ab_bonus
                    # Activation taps the member.
                    p_tapped[slot] = 1

    # Set Live Card (400-459)
    elif 400 <= act_id <= 459:
        hand_idx = act_id - 400
        if hand_idx < 60:
            card_id = p_hand[hand_idx]
            if card_id > 0:
                # Find empty live zone slot
                for j in range(50):
                    if p_live[j] == 0:
                        p_live[j] = card_id
                        p_hand[hand_idx] = 0
                        p_global_ctx[HD] -= 1
                        break

    return bonus


@cuda.jit(device=True)
def resolve_live_device(
    live_id, p_stage, p_live, p_scores, p_global_ctx, p_deck, p_hand, p_trash, card_stats, p_cont_vec, p_cont_ptr
):
    """
    Decide whether a live card succeeds and report its score value.

    The card's seven requirement values are read from ``card_stats`` columns
    12-18. If they sum to zero or less the live succeeds unconditionally.
    Otherwise the same columns are summed over the members on stage
    (IDs > 0) and the first six requirement values must each be covered;
    the seventh is not compared.

    Returns ``card_stats[bid, 38]`` on success and 0 on failure or when
    ``live_id`` is negative or has no ``card_stats`` row. Only ``live_id``,
    ``p_stage`` and ``card_stats`` are read; no argument is modified.
    """
    n_defs = card_stats.shape[0]
    bid = get_base_id_device(live_id)
    if live_id < 0 or bid >= n_defs:
        return 0

    # Requirement vector and its total, gathered in one pass.
    need = cuda.local.array(7, dtype=np.int32)
    need_total = 0
    for c in range(7):
        need[c] = card_stats[bid, 12 + c]
        need_total += need[c]

    if need_total <= 0:
        # Nothing required: the live succeeds automatically.
        return card_stats[bid, 38]

    # Accumulate what the stage members provide.
    have = cuda.local.array(7, dtype=np.int32)
    for c in range(7):
        have[c] = 0

    for slot in range(3):
        member_id = p_stage[slot]
        if member_id <= 0:
            continue
        member_bid = get_base_id_device(member_id)
        if member_bid >= n_defs:
            continue
        for c in range(7):
            have[c] += card_stats[member_bid, 12 + c]

    # Only the six colour entries are compared (index 6 is skipped).
    for c in range(6):
        if need[c] > have[c]:
            return 0

    return card_stats[bid, 38]


@cuda.jit(device=True)
def run_opponent_turn_device(
    rng_state,
    i,
    opp_hand,
    opp_deck,
    opp_stage,
    opp_energy_vec,
    opp_energy_count,
    opp_tapped,
    opp_live,
    opp_scores,
    opp_global_ctx,
    opp_trash,
    p_tapped,
    opp_history,
    card_stats,
    bytecode_map,
    bytecode_index,
):
    """
    Simple heuristic opponent turn.

    1. For every empty stage slot (-1), plays the first member card in hand
       (``card_stats[bid, 10] == 1``) whose cost does not exceed the
       opponent's energy counter. The cost is only compared, never paid:
       neither the energy counter nor any tapped flag changes.
    2. Takes the first live card in hand (``card_stats[bid, 10] == 2``) and
       puts it in the first free live-zone slot, if there is one. At most one
       live card is considered per turn.

    Each card played is pushed to the front of ``opp_history[i]`` (6 entries,
    oldest dropped). Unlike the other ``opp_*`` arguments, ``opp_history`` is
    the full batch array and is indexed with ``i``.

    Empty hand slots hold 0 and are skipped. No abilities are activated. The
    remaining parameters (``rng_state``, ``opp_deck``, ``opp_energy_vec``,
    ``opp_energy_count``, ``opp_tapped``, ``opp_scores``, ``opp_trash``,
    ``p_tapped``, ``bytecode_map``, ``bytecode_index``) are not read.
    """
    # Play up to 3 members in empty slots
    for slot in range(3):
        if opp_stage[slot] == -1:
            # Find playable member in hand
            for h in range(60):
                cid = opp_hand[h]
                # 0 is the empty-slot marker for hands; treating it as a card
                # would let base ID 0 be "played" out of an empty slot.
                if cid > 0:
                    bid = get_base_id_device(cid)
                    if bid < card_stats.shape[0]:
                        ctype = card_stats[bid, 10]
                        if ctype == 1:  # Member
                            cost = card_stats[bid, 0]
                            if cost <= opp_global_ctx[EN]:
                                # Play it
                                opp_stage[slot] = cid
                                opp_hand[h] = 0
                                opp_global_ctx[HD] -= 1
                                # Update History
                                for k in range(5, 0, -1):
                                    opp_history[i, k] = opp_history[i, k - 1]
                                opp_history[i, 0] = cid
                                break

    # Set a live card if possible
    for h in range(60):
        cid = opp_hand[h]
        if cid > 0:
            bid = get_base_id_device(cid)
            if bid < card_stats.shape[0]:
                ctype = card_stats[bid, 10]
                if ctype == 2:  # Live
                    for lz in range(50):
                        if opp_live[lz] == 0:
                            opp_live[lz] = cid
                            opp_hand[h] = 0
                            opp_global_ctx[HD] -= 1
                            # Update History
                            for k in range(5, 0, -1):
                                opp_history[i, k] = opp_history[i, k - 1]
                            opp_history[i, 0] = cid
                            break
                    # Stop after the first live card, placed or not.
                    break


# ============================================================================
# MAIN KERNELS
# ============================================================================


@cuda.jit
def reset_kernel(
    indices,
    batch_stage,
    batch_energy_vec,
    batch_energy_count,
    batch_continuous_vec,
    batch_continuous_ptr,
    batch_tapped,
    batch_live,
    batch_scores,
    batch_flat_ctx,
    batch_global_ctx,
    batch_hand,
    batch_deck,
    batch_trash,
    batch_opp_history,
    opp_stage,
    opp_energy_vec,
    opp_energy_count,
    opp_tapped,
    opp_live,
    opp_scores,
    opp_global_ctx,
    opp_hand,
    opp_deck,
    opp_trash,
    ability_member_ids,
    ability_live_ids,
    rng_states,
    force_start_order,
    obs_buffer,
    card_stats,
):
    """
    CUDA Kernel to reset environments.

    One thread handles one entry of ``indices``; each entry is the index of an
    environment to reset. For that environment the kernel:

    1. Clears agent and opponent state (stage slots to -1, everything else
       to 0).
    2. Builds a 60-card deck for each side: 48 entries from
       ``ability_member_ids`` and 12 from ``ability_live_ids``. A pool of
       exactly 48 / 12 IDs is copied as-is; otherwise each entry is sampled
       with replacement.
    3. Shuffles both decks, moves deck slots 0-1 to the live zone and deals
       up to 6 cards to each hand.
    4. Initialises the counters in both global contexts and the start order.

    The sequence of RNG draws determines the generated decks, so statement
    order matters here. ``obs_buffer`` and ``card_stats`` are accepted but not
    read.
    """
    tid = cuda.grid(1)
    if tid >= indices.shape[0]:
        return

    # ``i`` is the environment being reset, not the thread index.
    i = indices[tid]

    # Clear agent state
    for j in range(3):
        batch_stage[i, j] = -1  # -1 marks an empty stage slot
    for j in range(3):
        for k in range(32):
            batch_energy_vec[i, j, k] = 0
        batch_energy_count[i, j] = 0
    for j in range(32):
        for k in range(10):
            batch_continuous_vec[i, j, k] = 0
    batch_continuous_ptr[i] = 0
    for j in range(16):
        batch_tapped[i, j] = 0
    for j in range(50):
        batch_live[i, j] = 0
    batch_scores[i] = 0
    for j in range(64):
        batch_flat_ctx[i, j] = 0
    for j in range(128):
        batch_global_ctx[i, j] = 0
    for j in range(60):
        batch_trash[i, j] = 0
    for j in range(6):
        batch_opp_history[i, j] = 0

    # Clear opponent state
    for j in range(3):
        opp_stage[i, j] = -1
    for j in range(3):
        for k in range(32):
            opp_energy_vec[i, j, k] = 0
        opp_energy_count[i, j] = 0
    for j in range(16):
        opp_tapped[i, j] = 0
    for j in range(50):
        opp_live[i, j] = 0
    opp_scores[i] = 0
    for j in range(128):
        opp_global_ctx[i, j] = 0
    for j in range(60):
        opp_trash[i, j] = 0

    # Generate deck
    n_members = ability_member_ids.shape[0]
    n_lives = ability_live_ids.shape[0]

    # Members (0-47)
    for k in range(48):
        if n_members == 48:
            # Pool is exactly deck-sized: both sides get the same list.
            batch_deck[i, k] = ability_member_ids[k]
            opp_deck[i, k] = ability_member_ids[k]
        else:
            # Random pick using RNG
            # The trailing modulo keeps the index in range should r * n round up to n.
            # NOTE(review): an empty pool (n_members == 0) would divide by zero here.
            r = xoroshiro128p_uniform_float32(rng_states, i)
            idx = int(r * n_members) % n_members
            batch_deck[i, k] = ability_member_ids[idx]
            r = xoroshiro128p_uniform_float32(rng_states, i)
            idx = int(r * n_members) % n_members
            opp_deck[i, k] = ability_member_ids[idx]

    # Lives (48-59)
    for k in range(12):
        if n_lives == 12:
            batch_deck[i, 48 + k] = ability_live_ids[k]
            opp_deck[i, 48 + k] = ability_live_ids[k]
        else:
            # NOTE(review): same empty-pool caveat as for members.
            r = xoroshiro128p_uniform_float32(rng_states, i)
            idx = int(r * n_lives) % n_lives
            batch_deck[i, 48 + k] = ability_live_ids[idx]
            r = xoroshiro128p_uniform_float32(rng_states, i)
            idx = int(r * n_lives) % n_lives
            opp_deck[i, 48 + k] = ability_live_ids[idx]

    # Shuffle decks (Fisher-Yates)
    # Agent and opponent draws are interleaved on the same RNG stream.
    for k in range(59, 0, -1):
        r = xoroshiro128p_uniform_float32(rng_states, i)
        j = int(r * (k + 1)) % (k + 1)
        tmp = batch_deck[i, k]
        batch_deck[i, k] = batch_deck[i, j]
        batch_deck[i, j] = tmp

        r = xoroshiro128p_uniform_float32(rng_states, i)
        j = int(r * (k + 1)) % (k + 1)
        tmp = opp_deck[i, k]
        opp_deck[i, k] = opp_deck[i, j]
        opp_deck[i, j] = tmp

    # Place 2 cards in Live Zone
    # The first two shuffled cards are used whatever their card type.
    batch_live[i, 0] = batch_deck[i, 0]
    batch_live[i, 1] = batch_deck[i, 1]
    batch_deck[i, 0] = 0
    batch_deck[i, 1] = 0

    opp_live[i, 0] = opp_deck[i, 0]
    opp_live[i, 1] = opp_deck[i, 1]
    opp_deck[i, 0] = 0
    opp_deck[i, 1] = 0

    # Draw hand (6 cards)
    for j in range(60):
        batch_hand[i, j] = 0
        opp_hand[i, j] = 0

    # Deal the first six occupied deck slots after the two live cards.
    drawn = 0
    for k in range(2, 60):
        if batch_deck[i, k] > 0 and drawn < 6:
            batch_hand[i, drawn] = batch_deck[i, k]
            batch_deck[i, k] = 0
            drawn += 1

    drawn_o = 0
    for k in range(2, 60):
        if opp_deck[i, k] > 0 and drawn_o < 6:
            opp_hand[i, drawn_o] = opp_deck[i, k]
            opp_deck[i, k] = 0
            drawn_o += 1

    # Set initial global context
    # Counters assume a full deal: 60 - 2 live - 6 hand = 52 left in the deck.
    batch_global_ctx[i, HD] = 6
    batch_global_ctx[i, DK] = 52
    batch_global_ctx[i, EN] = 3
    batch_global_ctx[i, PH] = 4  # Start in Main phase (simplified)
    batch_global_ctx[i, 54] = 1  # Turn 1

    opp_global_ctx[i, HD] = 6
    opp_global_ctx[i, DK] = 52
    opp_global_ctx[i, EN] = 3
    opp_global_ctx[i, PH] = 4
    opp_global_ctx[i, 54] = 1

    # Start order
    # -1 requests a random start order; any other value is stored as given.
    if force_start_order == -1:
        r = xoroshiro128p_uniform_float32(rng_states, i)
        is_second = 1 if r > 0.5 else 0
    else:
        is_second = force_start_order
    batch_global_ctx[i, 10] = is_second


@cuda.jit
def step_kernel(
    num_envs,
    actions,
    batch_hand,
    batch_deck,
    batch_stage,
    batch_energy_vec,
    batch_energy_count,
    batch_continuous_vec,
    batch_continuous_ptr,
    batch_tapped,
    batch_live,
    batch_scores,
    batch_flat_ctx,
    batch_global_ctx,
    opp_hand,
    opp_deck,
    opp_stage,
    opp_energy_vec,
    opp_energy_count,
    opp_tapped,
    opp_live,
    opp_scores,
    opp_global_ctx,
    card_stats,
    bytecode_map,
    bytecode_index,
    obs_buffer,
    rewards,
    dones,
    prev_scores,
    prev_opp_scores,
    prev_phases,
    terminal_obs_buffer,
    batch_trash,
    opp_trash,
    batch_opp_history,
    term_scores_agent,
    term_scores_opp,
    ability_member_ids,
    ability_live_ids,
    rng_states,
    game_config,
    opp_mode,
    force_start_order,
):
    """
    Main integrated step kernel.
    Processes one environment per thread.

    Per environment ``i``:

    1. Mirrors both scores into the agent context (``SC``, ``OS``) so bytecode
       conditions compare current values, and bumps the step counter
       (context slot 58).
    2. Applies ``actions[i]`` through ``step_player_device``.
    3. If the action was a pass made in the Main phase (4): runs the
       opponent's turn, resolves the first 10 live-zone slots of both sides,
       awards one point to the side with the higher live total (both on a
       tie above zero), then untaps everything, adds one energy (capped at
       12), advances the turn counter (slot 54) and draws one card each.
    4. Writes ``rewards[i]`` and ``dones[i]`` and records terminal scores.

    ``game_config`` layout: ``[turn_limit, step_limit, reward_win,
    reward_lose, reward_scale, per_step_penalty]``. A side wins at 3 points.

    ``batch_flat_ctx``, ``obs_buffer``, ``prev_phases``,
    ``terminal_obs_buffer``, ``ability_member_ids``, ``ability_live_ids``,
    ``opp_mode`` and ``force_start_order`` are accepted but not read here.
    Finished environments are not reset by this kernel.
    """
    i = cuda.grid(1)
    if i >= num_envs:
        return

    # Config
    CFG_TURN_LIMIT = int(game_config[0])
    CFG_STEP_LIMIT = int(game_config[1])
    CFG_REWARD_WIN = game_config[2]
    CFG_REWARD_LOSE = game_config[3]
    CFG_REWARD_SCALE = game_config[4]
    CFG_REWARD_TURN_PENALTY = game_config[5]

    act_id = actions[i]
    # Phase as it was *before* the action; a pass changes it below.
    ph = int(batch_global_ctx[i, PH])

    # Sync scores to context
    # resolve_bytecode_device compares SC against OS (C_LLD / C_CMP), so both
    # must be current before any ability runs; previously OS was never set.
    batch_global_ctx[i, SC] = batch_scores[i]
    batch_global_ctx[i, OS] = opp_scores[i]

    # Increment step counter
    batch_global_ctx[i, 58] += 1

    # Get continuous pointer slice
    # Length-1 views so device functions can update the scalars in place.
    cont_ptr_arr = batch_continuous_ptr[i : i + 1]
    score_arr = batch_scores[i : i + 1]

    # Execute action
    bonus = step_player_device(
        act_id,
        0,
        rng_states,
        i,
        batch_hand[i],
        batch_deck[i],
        batch_stage[i],
        batch_energy_vec[i],
        batch_energy_count[i],
        batch_tapped[i],
        batch_live[i],
        score_arr,
        batch_global_ctx[i],
        batch_trash[i],
        batch_continuous_vec[i],
        cont_ptr_arr,
        opp_tapped[i],
        card_stats,
        bytecode_map,
        bytecode_index,
    )
    batch_scores[i] += bonus

    # Handle turn end (Pass in Main Phase)
    # NOTE(review): step_player_device has just moved PH from 4 to 8 and
    # nothing in this kernel moves it back, so this branch cannot trigger
    # again unless the phase is reset elsewhere - confirm with the host code.
    if act_id == 0 and ph == 4:
        # Run opponent turn
        run_opponent_turn_device(
            rng_states,
            i,
            opp_hand[i],
            opp_deck[i],
            opp_stage[i],
            opp_energy_vec[i],
            opp_energy_count[i],
            opp_tapped[i],
            opp_live[i],
            opp_scores[i : i + 1],
            opp_global_ctx[i],
            opp_trash[i],
            batch_tapped[i],
            batch_opp_history,
            card_stats,
            bytecode_map,
            bytecode_index,
        )

        # Resolve lives for both players
        agent_live_score = 0
        opp_live_score = 0

        for z in range(10):
            lid = batch_live[i, z]
            if lid > 0:
                s = resolve_live_device(
                    lid,
                    batch_stage[i],
                    batch_live[i],
                    batch_scores[i : i + 1],
                    batch_global_ctx[i],
                    batch_deck[i],
                    batch_hand[i],
                    batch_trash[i],
                    card_stats,
                    batch_continuous_vec[i],
                    cont_ptr_arr,
                )
                agent_live_score += s
                # Clear used live
                # Lives that score nothing stay in the zone for later turns.
                if s > 0:
                    move_to_trash_device(lid, batch_trash[i], batch_global_ctx[i], TR)
                    batch_live[i, z] = 0

        for z in range(10):
            lid = opp_live[i, z]
            if lid > 0:
                # The continuous-effect arguments are the agent's; they are
                # passed only to satisfy the signature (resolve_live_device
                # does not read them).
                s = resolve_live_device(
                    lid,
                    opp_stage[i],
                    opp_live[i],
                    opp_scores[i : i + 1],
                    opp_global_ctx[i],
                    opp_deck[i],
                    opp_hand[i],
                    opp_trash[i],
                    card_stats,
                    batch_continuous_vec[i],
                    cont_ptr_arr,
                )
                opp_live_score += s
                if s > 0:
                    move_to_trash_device(lid, opp_trash[i], opp_global_ctx[i], TR)
                    opp_live[i, z] = 0

        # Scoring comparison
        # One point per turn at most, regardless of the margin.
        if agent_live_score > 0 and opp_live_score == 0:
            batch_scores[i] += 1
        elif agent_live_score == 0 and opp_live_score > 0:
            opp_scores[i] += 1
        elif agent_live_score > 0 and opp_live_score > 0:
            if agent_live_score > opp_live_score:
                batch_scores[i] += 1
            elif opp_live_score > agent_live_score:
                opp_scores[i] += 1
            else:
                # Tie - both score
                batch_scores[i] += 1
                opp_scores[i] += 1

        # Next turn setup
        batch_global_ctx[i, 54] += 1
        opp_global_ctx[i, 54] += 1

        # Untap and energy
        for j in range(16):
            batch_tapped[i, j] = 0
            if j < opp_tapped.shape[1]:
                opp_tapped[i, j] = 0

        batch_global_ctx[i, EN] = min(batch_global_ctx[i, EN] + 1, 12)
        opp_global_ctx[i, EN] = min(opp_global_ctx[i, EN] + 1, 12)

        # Draw card
        draw_cards_device(1, batch_hand[i], batch_deck[i], batch_trash[i], batch_global_ctx[i])
        draw_cards_device(1, opp_hand[i], opp_deck[i], opp_trash[i], opp_global_ctx[i])

    # Calculate rewards
    # Reward is the scaled change in score margin since the previous step,
    # plus the per-step penalty and any terminal win/lose reward.
    current_score = batch_scores[i]
    score_diff = float(current_score) - float(prev_scores[i])
    opp_score_diff = float(opp_scores[i]) - float(prev_opp_scores[i])

    r = (score_diff * CFG_REWARD_SCALE) - (opp_score_diff * CFG_REWARD_SCALE)
    r += CFG_REWARD_TURN_PENALTY

    win = current_score >= 3
    lose = opp_scores[i] >= 3

    # Both can be true after a tied final turn; both rewards are then applied.
    if win:
        r += CFG_REWARD_WIN
    if lose:
        r += CFG_REWARD_LOSE

    rewards[i] = r

    # Sync Opp Stats to Agent Context (for Attention features)
    batch_global_ctx[i, DI] = opp_global_ctx[i, HD]  # opponent hand count
    batch_global_ctx[i, OD] = opp_global_ctx[i, DK]  # opponent deck count
    batch_global_ctx[i, OT] = opp_global_ctx[i, TR]  # opponent trash count

    # Check done
    is_done = win or lose or batch_global_ctx[i, 54] >= CFG_TURN_LIMIT or batch_global_ctx[i, 58] >= CFG_STEP_LIMIT
    dones[i] = is_done

    if is_done:
        term_scores_agent[i] = batch_scores[i]
        term_scores_opp[i] = opp_scores[i]
        # Note: Auto-reset should be called separately

    # Update prev scores
    prev_scores[i] = batch_scores[i]
    prev_opp_scores[i] = opp_scores[i]


@cuda.jit
def compute_action_masks_kernel(
    num_envs,
    batch_hand,
    batch_stage,
    batch_tapped,
    batch_global_ctx,
    batch_live,
    card_stats,
    masks,  # Output: (N, 2000)
):
    """
    Compute legal action masks on GPU.

    One thread per environment. Row ``masks[i]`` is cleared and then the
    following actions are enabled:

      * ``0``             - pass, only while the phase is Main (4).
      * ``1 + 3*h + s``   - play member in hand slot ``h`` onto stage slot
                            ``s`` when the untapped energy covers its cost
                            (reduced by the cost of a member already there).
      * ``200 + s``       - activate the untapped member on stage slot ``s``.
      * ``400 + h``       - set the live card in hand slot ``h`` when the
                            live zone has a free slot.

    Only the pass is gated on the phase. ``masks`` must be at least 460
    columns wide.
    """
    i = cuda.grid(1)
    if i >= num_envs:
        return

    # Reset all to False
    for a in range(masks.shape[1]):
        masks[i, a] = False

    ph = batch_global_ctx[i, PH]

    # Action 0: Pass is always legal in Main Phase
    if ph == 4:
        masks[i, 0] = True

    # Untapped energy the player actually owns. step_player_device can only
    # tap the first min(EN, 12) energy flags (tapped indices 3..14), so
    # counting all 12 would mark unaffordable plays as legal. This does not
    # depend on the card or slot, so it is computed once.
    owned_energy = min(batch_global_ctx[i, EN], 12)
    available_energy = 0
    for e in range(owned_energy):
        if batch_tapped[i, 3 + e] == 0:
            available_energy += 1

    # Whether any live-zone slot is free; likewise independent of the card.
    has_free_live_slot = False
    for lz_idx in range(50):
        if batch_live[i, lz_idx] == 0:
            has_free_live_slot = True
            break

    # Hand-driven actions: Member Play (1-180) and Set Live (400-459)
    for h_idx in range(60):
        cid = batch_hand[i, h_idx]
        if cid > 0:
            bid = get_base_id_device(cid)
            if bid < card_stats.shape[0]:
                ctype = card_stats[bid, 10]

                if ctype == 1:  # Member
                    cost = card_stats[bid, 0]
                    for slot in range(3):
                        # Replacing a member only costs the difference.
                        old_cid = batch_stage[i, slot]
                        effective_cost = cost
                        if old_cid >= 0:
                            old_bid = get_base_id_device(old_cid)
                            if old_bid < card_stats.shape[0]:
                                effective_cost = max(0, cost - card_stats[old_bid, 0])

                        if available_energy >= effective_cost:
                            # h_idx <= 59 and slot <= 2, so this is at most 180.
                            masks[i, h_idx * 3 + slot + 1] = True

                elif ctype == 2:  # Live
                    if has_free_live_slot:
                        masks[i, 400 + h_idx] = True

    # Activate Ability (200-202)
    for slot in range(3):
        cid = batch_stage[i, slot]
        if cid > 0 and batch_tapped[i, slot] == 0:
            masks[i, 200 + slot] = True


@cuda.jit
def encode_observations_kernel(
    num_envs,
    batch_hand,
    batch_stage,
    batch_energy_count,
    batch_tapped,
    batch_scores,
    opp_scores,
    opp_stage,
    opp_tapped,
    card_stats,
    batch_global_ctx,
    batch_live,
    turn_number,
    obs_buffer,
):
    """
    Encode one flat observation row per environment on GPU (STANDARD mode).

    Each thread handles the environment whose index equals its 1-D grid
    position; threads at or beyond ``num_envs`` return immediately. The row
    ``obs_buffer[env, :]`` is zeroed over its full width and then filled:

    - columns 0..5: own score / 3, opponent score / 3, and
      ``batch_global_ctx`` columns ``EN`` / 12, ``HD`` / 60, ``DK`` / 60
      and column 54 / 100.
    - from column 10: own stage, 3 slots x 20 columns (presence,
      card id / 2000, ``card_stats`` columns 0, 1, 2 divided by 10, 5, 10,
      and a tapped flag).
    - from column 70: opponent stage, same layout without the tapped flag.
    - from column 130: the first 20 occupied hand entries, 20 columns each.
    - from column 530: the first 10 occupied live entries, 10 columns each.

    The stage sections are written without checking the row width; only the
    hand and live sections test against ``obs_buffer.shape[1]``.

    ``batch_energy_count``, ``opp_tapped`` and ``turn_number`` are accepted
    but not read by this kernel.
    """
    env = cuda.grid(1)
    if env >= num_envs:
        return

    # Row layout: section start columns and per-entry strides.
    SLOT_WIDTH = 20
    LIVE_WIDTH = 10
    OWN_STAGE_BASE = 10
    OPP_STAGE_BASE = OWN_STAGE_BASE + 3 * SLOT_WIDTH  # 70
    HAND_BASE = OPP_STAGE_BASE + 3 * SLOT_WIDTH  # 130
    LIVE_BASE = HAND_BASE + 20 * SLOT_WIDTH  # 530

    width = obs_buffer.shape[1]
    n_cards = card_stats.shape[0]

    # Start from an all-zero row so untouched columns read as "absent".
    for col in range(width):
        obs_buffer[env, col] = 0.0

    # --- Scalar header ---
    obs_buffer[env, 0] = float(batch_scores[env]) / 3.0
    obs_buffer[env, 1] = float(opp_scores[env]) / 3.0
    obs_buffer[env, 2] = float(batch_global_ctx[env, EN]) / 12.0
    obs_buffer[env, 3] = float(batch_global_ctx[env, HD]) / 60.0
    obs_buffer[env, 4] = float(batch_global_ctx[env, DK]) / 60.0
    obs_buffer[env, 5] = float(batch_global_ctx[env, 54]) / 100.0  # Turn

    # --- Own stage: slot position is fixed, empty slots stay zero ---
    for slot in range(3):
        card = batch_stage[env, slot]
        if card <= 0:
            continue
        stat_row = get_base_id_device(card)
        if stat_row >= n_cards:
            continue

        at = OWN_STAGE_BASE + slot * SLOT_WIDTH
        obs_buffer[env, at] = 1.0  # Presence
        # NOTE(review): this normalises the full UID, not the masked base
        # id -- confirm that is intended.
        obs_buffer[env, at + 1] = float(card) / 2000.0
        obs_buffer[env, at + 2] = float(card_stats[stat_row, 0]) / 10.0  # Cost
        obs_buffer[env, at + 3] = float(card_stats[stat_row, 1]) / 5.0  # Blades
        obs_buffer[env, at + 4] = float(card_stats[stat_row, 2]) / 10.0  # Hearts
        if batch_tapped[env, slot] > 0:
            obs_buffer[env, at + 5] = 1.0
        else:
            obs_buffer[env, at + 5] = 0.0

    # --- Opponent stage: same features, no tapped flag ---
    for slot in range(3):
        card = opp_stage[env, slot]
        if card <= 0:
            continue
        stat_row = get_base_id_device(card)
        if stat_row >= n_cards:
            continue

        at = OPP_STAGE_BASE + slot * SLOT_WIDTH
        obs_buffer[env, at] = 1.0
        obs_buffer[env, at + 1] = float(card) / 2000.0
        obs_buffer[env, at + 2] = float(card_stats[stat_row, 0]) / 10.0
        obs_buffer[env, at + 3] = float(card_stats[stat_row, 1]) / 5.0
        obs_buffer[env, at + 4] = float(card_stats[stat_row, 2]) / 10.0

    # --- Hand: occupied entries are packed densely, at most 20 shown ---
    shown = 0
    for h in range(60):
        card = batch_hand[env, h]
        if card <= 0 or shown >= 20:
            continue

        at = HAND_BASE + shown * SLOT_WIDTH
        # The entry consumes a display position even when it does not fit.
        shown += 1
        if at + 10 >= width:
            continue

        obs_buffer[env, at] = 1.0
        obs_buffer[env, at + 1] = float(card) / 2000.0
        stat_row = get_base_id_device(card)
        if stat_row < n_cards:
            obs_buffer[env, at + 2] = float(card_stats[stat_row, 0]) / 10.0
            obs_buffer[env, at + 3] = float(card_stats[stat_row, 10])  # Type

    # --- Live zone: occupied entries packed densely, at most 10 shown ---
    placed = 0
    for z in range(50):
        card = batch_live[env, z]
        if card <= 0 or placed >= 10:
            continue

        at = LIVE_BASE + placed * LIVE_WIDTH
        placed += 1
        if at + 5 < width:
            obs_buffer[env, at] = 1.0
            obs_buffer[env, at + 1] = float(card) / 2000.0


@cuda.jit
def encode_observations_attention_kernel(
    num_envs,
    batch_hand,
    batch_stage,
    batch_energy_count,
    batch_tapped,
    batch_scores,
    opp_scores,
    opp_stage,
    opp_tapped,
    card_stats,
    batch_global_ctx,
    batch_live,
    batch_opp_history,
    opp_global_ctx,  # Source of the opponent energy scalar
    turn_number,
    obs_buffer,
):
    """
    Encode observations for the Attention Architecture (2240-dim).

    Each thread handles the environment at its 1-D grid index and returns
    immediately when that index is >= ``num_envs``. Columns 0..2239 of
    ``obs_buffer[i]`` are zeroed, then filled as 64-wide card tokens
    followed by a block of global scalars:

    - hand: up to 16 tokens from column 0 (15 regular + 1 overflow slot)
    - own stage: 3 tokens from column 1024
    - live zone: up to 6 tokens from column 1216 (this runs through the
      range that starts at ``LIVE_SUCC_START``)
    - opponent stage: 3 tokens from column 1600
    - opponent history: 6 tokens from column 1792
    - global scalars: from column 2176

    Token columns used here: 0 presence, 1 type / 2, 2 cost / 10,
    3 blades / 5, 4 tapped flag, 5 card id / 2000, 6 location code,
    8..14 ``card_stats`` columns 12..18 divided by 5, 22..28 one-hot of
    ``card_stats`` column 11 modulo 7, 58 position / 10, 59 ownership
    (1.0 own, -1.0 opponent). Not every section writes every column.

    ``batch_energy_count`` and ``turn_number`` are accepted but not read.
    """
    i = cuda.grid(1)
    if i >= num_envs:
        return

    # Token width and number of regular hand tokens (one extra overflow
    # token follows them).
    FEAT = 64
    MAX_HAND = 15

    # Section start columns.
    HAND_START = 0
    HAND_OVER_START = HAND_START + (MAX_HAND * FEAT)  # 960
    STAGE_START = HAND_OVER_START + FEAT  # 1024
    LIVE_START = STAGE_START + (3 * FEAT)  # 1216
    LIVE_SUCC_START = LIVE_START + (3 * FEAT)  # 1408
    OPP_STAGE_START = LIVE_SUCC_START + (3 * FEAT)  # 1600
    OPP_HIST_START = OPP_STAGE_START + (3 * FEAT)  # 1792
    GLOBAL_START = OPP_HIST_START + (6 * FEAT)  # 2176

    n_cards = card_stats.shape[0]
    # How many of the seven heart columns (12..18) card_stats really has;
    # a non-positive value makes the heart loops below no-ops.
    n_hearts = min(7, card_stats.shape[1] - 12)

    # Zero the whole row first.
    for col in range(2240):
        obs_buffer[i, col] = 0.0

    # --- A. HAND (16 tokens, packed densely) ---
    # hand_count only advances for cards that were actually encoded, so it
    # never exceeds 16 and ignores cards whose base id is out of range.
    hand_count = 0
    for j in range(60):
        uid = batch_hand[i, j]
        if uid <= 0:
            continue
        stat_row = get_base_id_device(uid)
        if stat_row >= n_cards or hand_count >= 16:
            continue

        tok = HAND_START + hand_count * FEAT

        obs_buffer[i, tok + 0] = 1.0  # Presence
        obs_buffer[i, tok + 1] = float(card_stats[stat_row, 10]) / 2.0  # Type
        obs_buffer[i, tok + 2] = float(card_stats[stat_row, 0]) / 10.0  # Cost
        obs_buffer[i, tok + 3] = float(card_stats[stat_row, 1]) / 5.0  # Blades
        obs_buffer[i, tok + 5] = float(uid) / 2000.0  # Card ID
        obs_buffer[i, tok + 6] = 0.2  # Location: Hand

        # Hearts (token columns 8-14)
        for k in range(n_hearts):
            obs_buffer[i, tok + 8 + k] = float(card_stats[stat_row, 12 + k]) / 5.0

        # Group one-hot (token columns 22-28)
        group = card_stats[stat_row, 11]
        obs_buffer[i, tok + 22 + (group % 7)] = 1.0

        # Context: position within the hand and ownership
        obs_buffer[i, tok + 58] = float(hand_count) / 10.0
        obs_buffer[i, tok + 59] = 1.0  # Mine

        hand_count += 1

    # --- B. MY STAGE (3 tokens, one per slot) ---
    for slot in range(3):
        uid = batch_stage[i, slot]
        if uid <= 0:
            continue
        stat_row = get_base_id_device(uid)
        if stat_row >= n_cards:
            continue

        tok = STAGE_START + slot * FEAT

        obs_buffer[i, tok + 0] = 1.0
        obs_buffer[i, tok + 1] = float(card_stats[stat_row, 10]) / 2.0
        obs_buffer[i, tok + 2] = float(card_stats[stat_row, 0]) / 10.0
        obs_buffer[i, tok + 3] = float(card_stats[stat_row, 1]) / 5.0
        if batch_tapped[i, slot] > 0:
            obs_buffer[i, tok + 4] = 1.0
        else:
            obs_buffer[i, tok + 4] = 0.0
        obs_buffer[i, tok + 5] = float(uid) / 2000.0  # Card ID
        obs_buffer[i, tok + 6] = 0.4  # Location: Stage

        for k in range(n_hearts):
            obs_buffer[i, tok + 8 + k] = float(card_stats[stat_row, 12 + k]) / 5.0

        group = card_stats[stat_row, 11]
        obs_buffer[i, tok + 22 + (group % 7)] = 1.0

        obs_buffer[i, tok + 58] = float(slot) / 10.0
        obs_buffer[i, tok + 59] = 1.0

    # --- C. LIVE ZONE (up to 6 tokens, packed densely) ---
    live_count = 0
    for j in range(50):
        uid = batch_live[i, j]
        if uid <= 0:
            continue
        stat_row = get_base_id_device(uid)
        if stat_row >= n_cards or live_count >= 6:
            continue

        tok = LIVE_START + live_count * FEAT

        obs_buffer[i, tok + 0] = 1.0
        obs_buffer[i, tok + 1] = float(card_stats[stat_row, 10]) / 2.0
        obs_buffer[i, tok + 2] = float(card_stats[stat_row, 0]) / 10.0
        obs_buffer[i, tok + 5] = float(uid) / 2000.0  # Card ID
        obs_buffer[i, tok + 6] = 0.6  # Location: Live

        for k in range(n_hearts):
            obs_buffer[i, tok + 8 + k] = float(card_stats[stat_row, 12 + k]) / 5.0

        obs_buffer[i, tok + 58] = float(live_count) / 10.0
        obs_buffer[i, tok + 59] = 1.0
        live_count += 1

    # --- D. OPP STAGE (3 tokens, one per slot) ---
    for slot in range(3):
        uid = opp_stage[i, slot]
        if uid <= 0:
            continue
        stat_row = get_base_id_device(uid)
        if stat_row >= n_cards:
            continue

        tok = OPP_STAGE_START + slot * FEAT

        obs_buffer[i, tok + 0] = 1.0
        obs_buffer[i, tok + 1] = float(card_stats[stat_row, 10]) / 2.0
        obs_buffer[i, tok + 2] = float(card_stats[stat_row, 0]) / 10.0
        obs_buffer[i, tok + 3] = float(card_stats[stat_row, 1]) / 5.0
        if opp_tapped[i, slot] > 0:
            obs_buffer[i, tok + 4] = 1.0
        else:
            obs_buffer[i, tok + 4] = 0.0
        obs_buffer[i, tok + 5] = float(uid) / 2000.0  # Card ID
        obs_buffer[i, tok + 6] = 0.8  # Location: Opp Stage

        for k in range(n_hearts):
            obs_buffer[i, tok + 8 + k] = float(card_stats[stat_row, 12 + k]) / 5.0

        obs_buffer[i, tok + 58] = float(slot) / 10.0
        obs_buffer[i, tok + 59] = -1.0  # Opponent's

    # --- E. OPP HISTORY (6 tokens, one per history index) ---
    for h in range(6):
        uid = batch_opp_history[i, h]
        if uid <= 0:
            continue
        stat_row = get_base_id_device(uid)
        if stat_row >= n_cards:
            continue

        tok = OPP_HIST_START + h * FEAT

        obs_buffer[i, tok + 0] = 1.0
        obs_buffer[i, tok + 1] = float(card_stats[stat_row, 10]) / 2.0
        obs_buffer[i, tok + 2] = float(card_stats[stat_row, 0]) / 10.0
        obs_buffer[i, tok + 5] = float(uid) / 2000.0  # Card ID
        obs_buffer[i, tok + 6] = 1.0  # Location: History

        obs_buffer[i, tok + 58] = float(h) / 10.0
        obs_buffer[i, tok + 59] = -1.0  # Opponent's

    # --- F. GLOBAL SCALARS ---
    obs_buffer[i, GLOBAL_START + 0] = float(batch_scores[i]) / 10.0
    obs_buffer[i, GLOBAL_START + 1] = float(opp_scores[i]) / 10.0
    obs_buffer[i, GLOBAL_START + 2] = float(batch_global_ctx[i, 54]) / 20.0  # Turn, read from context
    # Context columns 8, 5 and 6 (named PH, EN and DK in this module's
    # constant table).
    obs_buffer[i, GLOBAL_START + 3] = float(batch_global_ctx[i, 8]) / 10.0
    obs_buffer[i, GLOBAL_START + 4] = float(batch_global_ctx[i, 5]) / 10.0
    obs_buffer[i, GLOBAL_START + 5] = float(batch_global_ctx[i, 6]) / 40.0
    obs_buffer[i, GLOBAL_START + 6] = float(hand_count) / 15.0  # Encoded hand tokens (<= 16)

    # Opponent resources. Energy comes from the opponent's own context; the
    # remaining three are read from this player's context columns 4, 9, 7.
    obs_buffer[i, GLOBAL_START + 7] = float(opp_global_ctx[i, 5]) / 10.0  # Opp Energy
    obs_buffer[i, GLOBAL_START + 8] = float(batch_global_ctx[i, 4]) / 10.0  # Opp Hand (from ctx[4])
    obs_buffer[i, GLOBAL_START + 9] = float(batch_global_ctx[i, 9]) / 40.0  # Opp Deck (from ctx[9])
    obs_buffer[i, GLOBAL_START + 10] = float(batch_global_ctx[i, 7]) / 10.0  # Opp Trash (from ctx[7])