"""End-to-end environment tests for the BoardSim OpenEnv environment.

Covers:
  * deterministic reset/step contract
  * observation schema and required fields
  * 10-round episode termination
  * reward bounds (per-step dense, terminal spikes)
  * vote resolution on weighted CEO + NPC tally
  * format penalty fires for invalid decisions
  * pitch bootstrap fires for non-empty pitch
  * runway-exhaustion bankruptcy path
  * trust dynamics persist and update bidirectionally
  * event order is shuffled per seed (no trajectory memorisation)
"""
from __future__ import annotations

import statistics

import pytest

from board_sim_env.models import BoardSimAction
from board_sim_env.server.board_sim_env_environment import (
    BoardSimEnvironment,
    EVENTS,
    NPC_AGENDAS,
    ROLE_WEIGHT,
    compute_profitability_score,
)


# ─── Determinism ──────────────────────────────────────────────────────────

def test_reset_is_deterministic_per_seed():
    """Two fresh environments reset with the same seed must agree.

    Compares the event, the option list and the NPC statement texts of the
    first observation produced by each environment.
    """
    first_env = BoardSimEnvironment()
    second_env = BoardSimEnvironment()
    first_obs = first_env.reset(seed=7)
    second_obs = second_env.reset(seed=7)

    assert first_obs.event == second_obs.event
    assert first_obs.options == second_obs.options

    # Only the statement text is compared, not the whole statement record.
    first_lines = [entry["statement"] for entry in first_obs.npc_statements]
    second_lines = [entry["statement"] for entry in second_obs.npc_statements]
    assert first_lines == second_lines


def test_different_seeds_produce_different_event_orders():
    """Seeds 0..19 must yield at least four distinct opening events."""
    # One fresh environment per seed; only the first event is collected.
    distinct_openers = {
        BoardSimEnvironment().reset(seed=seed).event
        for seed in range(20)
    }
    assert len(distinct_openers) >= 4, "event shuffling should produce variety across seeds"


# ─── Schema ───────────────────────────────────────────────────────────────

def test_observation_schema():
    """Validate the structure of the observation returned by ``reset``.

    Checks the required state keys, the round counter, the number of options
    and NPC statements, and the fields carried by each NPC statement.
    """
    observation = BoardSimEnvironment().reset(seed=0)

    required_state_fields = (
        "revenue",
        "burn_rate",
        "runway_months",
        "profitability_score",
        "trust",
    )
    for field in required_state_fields:
        assert field in observation.state, f"observation.state missing {field}"

    assert observation.round == 1
    assert len(observation.options) == 3
    assert len(observation.npc_statements) == 4

    statement_fields = {"role", "vote", "confidence", "statement"}
    for statement in observation.npc_statements:
        # Extra keys are allowed; only the required ones must be present.
        assert not statement_fields.difference(statement.keys())
        assert statement["role"] in NPC_AGENDAS
        assert statement["vote"] in observation.options


def test_npc_role_set_and_weights():
    """The NPC roster is fixed, the CEO weighs 2.5 and every NPC weight is positive."""
    expected_roles = {"CTO", "CFO", "Investor Rep", "Independent"}
    assert set(NPC_AGENDAS) == expected_roles

    assert ROLE_WEIGHT["CEO"] == 2.5

    for npc_role in NPC_AGENDAS:
        weight = ROLE_WEIGHT[npc_role]
        assert weight > 0


# ─── Episode lifecycle ───────────────────────────────────────────────────

def test_episode_terminates_at_or_before_ten_rounds():
    """Always voting for the first option must finish the episode within ten steps.

    The loop is capped at 15 steps so a non-terminating environment fails the
    assertions instead of hanging the test run.
    """
    env = BoardSimEnvironment()
    obs = env.reset(seed=42)

    steps_taken = 0
    for _ in range(15):
        if obs.done:
            break
        action = BoardSimAction(decision=obs.options[0], coalition_pitch="")
        obs = env.step(action)
        steps_taken += 1

    assert obs.done
    assert steps_taken <= 10

    accepted_reasons = {"runway_exhausted", "acquisition", "ipo", "stay_private", "finished_10"}
    assert obs.state["done_reason"] in accepted_reasons


def test_step_returns_required_fields():
    """The object returned by ``step`` exposes ``reward``, ``done`` and ``state``."""
    env = BoardSimEnvironment()
    first_obs = env.reset(seed=1)
    action = BoardSimAction(decision=first_obs.options[0], coalition_pitch="")
    result = env.step(action)

    for attribute in ("reward", "done", "state"):
        assert hasattr(result, attribute)


# ─── Reward bounds ───────────────────────────────────────────────────────

def test_per_step_reward_dense_and_bounded_until_terminal():
    """Non-terminal step rewards live in roughly [-3, +3]; terminal step can spike (+/-30 ish).

    Plays seed 11, always choosing the first option with a fixed pitch, and
    records the reward of every step. The final recorded reward belongs to the
    terminal step and is excluded from the bound check.

    The rollout is capped at ``max_steps`` so a termination regression fails
    this test instead of hanging the suite.
    """
    max_steps = 15  # same safety cap as the ten-round termination test

    env = BoardSimEnvironment()
    obs = env.reset(seed=11)
    rewards = []
    while not obs.done and len(rewards) < max_steps:
        obs = env.step(BoardSimAction(decision=obs.options[0], coalition_pitch="runway and morale matter"))
        # A missing (None) reward is treated as 0.0.
        rewards.append(float(obs.reward or 0.0))

    assert obs.done, f"episode did not terminate within {max_steps} steps"
    assert len(rewards) >= 1

    # Slicing already yields an empty list for a single-step episode.
    for r in rewards[:-1]:
        assert -3.0 <= r <= 3.0, f"per-step reward {r} outside dense band"


def test_format_penalty_for_invalid_decision():
    """An out-of-options decision should usually earn less than a valid one.

    For each seed in 0..39 two environments are reset identically; one takes
    the first listed option, the other submits a decision that is not among
    the options (expected to trigger the -0.5 format penalty). Pairs in which
    either first step ends the episode are skipped so terminal bonuses do not
    dominate the comparison. At least five pairs must remain, and the invalid
    reward must be strictly lower in at least 70% of them.
    """
    compared_pairs = 0
    penalised_pairs = 0

    for seed in range(40):
        valid_env = BoardSimEnvironment()
        invalid_env = BoardSimEnvironment()
        opening = valid_env.reset(seed=seed)
        invalid_env.reset(seed=seed)

        valid_step = valid_env.step(
            BoardSimAction(decision=opening.options[0], coalition_pitch="")
        )
        invalid_step = invalid_env.step(
            BoardSimAction(decision="NOT_A_VALID_OPTION", coalition_pitch="")
        )

        if not (valid_step.done or invalid_step.done):
            compared_pairs += 1
            # A missing (None) reward counts as 0.0 on both sides.
            valid_reward = valid_step.reward or 0.0
            invalid_reward = invalid_step.reward or 0.0
            if invalid_reward < valid_reward:
                penalised_pairs += 1

    assert compared_pairs >= 5, "needed enough non-terminal first rounds to compare"
    assert penalised_pairs / compared_pairs >= 0.7, "invalid decisions should reduce reward most of the time (format penalty)"


def test_pitch_bootstrap_increases_reward_vs_empty_pitch():
    """A non-empty pitch should generally not lower the first-step reward.

    For each seed in 0..19 the same first option is played twice from
    identical resets, once with an empty pitch and once with a fixed
    non-empty pitch. The pitched reward must be greater than or equal to the
    unpitched reward for at least 14 of the 20 seeds.
    """
    pitch_text = "runway discipline and engineering quality argue for this"
    non_decreasing = 0

    for seed in range(20):
        silent_env = BoardSimEnvironment()
        pitched_env = BoardSimEnvironment()
        silent_obs = silent_env.reset(seed=seed)
        pitched_obs = pitched_env.reset(seed=seed)

        silent_step = silent_env.step(
            BoardSimAction(decision=silent_obs.options[0], coalition_pitch="")
        )
        pitched_step = pitched_env.step(
            BoardSimAction(decision=pitched_obs.options[0], coalition_pitch=pitch_text)
        )

        # A missing (None) reward counts as 0.0 on both sides.
        silent_reward = silent_step.reward or 0.0
        pitched_reward = pitched_step.reward or 0.0
        if pitched_reward >= silent_reward:
            non_decreasing += 1

    assert non_decreasing >= 14, "pitch should generally not hurt reward (>=70% of seeds non-decreasing)"


# ─── Profitability score ─────────────────────────────────────────────────

def test_profitability_score_in_range():
    """The stored profitability score lies in [0, 100] and matches a recomputation."""
    env = BoardSimEnvironment()
    env.reset(seed=0)

    stored_score = env.state.state_dict["profitability_score"]
    assert 0.0 <= stored_score <= 100.0

    # Recomputing from the same state dict must reproduce the stored value.
    recomputed_score = compute_profitability_score(env.state.state_dict)
    difference = abs(stored_score - recomputed_score)
    assert difference < 1e-6


# ─── Trust dynamics ──────────────────────────────────────────────────────

def test_trust_persists_and_updates():
    """Trust keeps the same roles after a step, moves for some role and stays in [0.1, 1.0]."""
    env = BoardSimEnvironment()
    opening = env.reset(seed=3)
    # Copy the mapping so the baseline is not affected by the step below.
    trust_before = dict(opening.state["trust"])

    action = BoardSimAction(decision=opening.options[0], coalition_pitch="strong product readiness")
    trust_after = env.step(action).state["trust"]

    assert set(trust_after.keys()) == set(trust_before.keys())

    moved_roles = [
        role
        for role in trust_before
        if abs(trust_after[role] - trust_before[role]) > 1e-6
    ]
    assert moved_roles, "at least one NPC's trust should move"

    for level in trust_after.values():
        assert 0.1 <= level <= 1.0


# ─── Vote resolution ────────────────────────────────────────────────────

def test_ceo_weight_dominates_when_no_persuasion():
    """Without a pitch, the CEO's choice should win at least 60% of first-round votes.

    Runs one step per seed in 0..49, voting for the first option, and reads
    the winning decision from the last entry of the environment's history.
    """
    trials = 50
    carried = 0

    for seed in range(trials):
        env = BoardSimEnvironment()
        ceo_choice = env.reset(seed=seed).options[0]
        env.step(BoardSimAction(decision=ceo_choice, coalition_pitch=""))

        rounds = env.state.state_dict["history"]
        if not rounds:
            # An empty history cannot be counted as a win.
            continue
        if rounds[-1]["winning_decision"] == ceo_choice:
            carried += 1

    assert carried / trials >= 0.6, "CEO weight should win >=60% of single-step votes without pitch"


# ─── Sanity smoke ────────────────────────────────────────────────────────

def test_random_policy_survives_majority_of_episodes():
    """A uniformly random policy should avoid runway exhaustion in >=60% of episodes.

    Plays 30 episodes (environment seeds 0..29) with decisions drawn from a
    single ``random.Random(123)`` stream and empty pitches. An episode counts
    as survived when its ``done_reason`` is anything other than
    ``"runway_exhausted"``.

    Each rollout is capped at ``max_steps`` and must have finished by then, so
    a termination regression fails the test instead of hanging the suite or
    being counted as a survival.
    """
    import random

    rng = random.Random(123)
    n = 30
    max_steps = 15  # same safety cap as the ten-round termination test
    survived = 0
    for ep in range(n):
        env = BoardSimEnvironment()
        obs = env.reset(seed=ep)
        steps = 0
        while not obs.done and steps < max_steps:
            obs = env.step(BoardSimAction(decision=rng.choice(obs.options), coalition_pitch=""))
            steps += 1
        assert obs.done, f"episode {ep} did not terminate within {max_steps} steps"
        if env.state.state_dict.get("done_reason") != "runway_exhausted":
            survived += 1
    assert survived / n >= 0.6, "random policy should survive >=60% of episodes (env-health floor)"