#!/usr/bin/env python3
"""Local agent runner for EmergencyEnv.

This script acts as an agent only:
- reset env
- choose action from observation
- step env
- log trajectory
"""

from __future__ import annotations

import argparse
import json
import math
import os
import random
from pathlib import Path
from datetime import datetime, timezone
from typing import Union, TYPE_CHECKING, Optional, cast

from app.environment.core import EmergencyEnv
from app.models.action import Action

if TYPE_CHECKING:
    from openai import OpenAI as OpenAIClient
else:
    OpenAIClient = None

try:
    from openai import OpenAI
except Exception:  # pragma: no cover - fallback for missing optional dependency
    OpenAI = None

# Task identifiers; also the accepted values of the --task CLI option.
TASK_ORDER = ["acde_easy", "acde_medium", "acde_hard"]
# Maps a --level CLI choice to the task identifier it selects.
LEVEL_TO_TASK = {
    "low": "acde_easy",
    "medium": "acde_medium",
    "high": "acde_hard",
}
# Levels sampled by ask_level_if_missing when no level is given; the weights
# below line up with RANDOM_LEVELS by position.
RANDOM_LEVELS = ("medium", "high")
RANDOM_LEVEL_WEIGHTS = (0.25, 0.75)
# Travel model used by score_hospitals: speed = BASE_SPEED_KMH * TRAFFIC_FACTOR[traffic].
BASE_SPEED_KMH = 60.0
TRAFFIC_FACTOR = {"low": 1.0, "medium": 0.6, "high": 0.3}
# On-disk learning archive; load_learning_archive drops stored profiles when the
# file's "version" differs from LEARNING_ARCHIVE_VERSION.
LEARNING_ARCHIVE_PATH = Path(__file__).resolve().parent / "data" / "learning_archive.json"
LEARNING_ARCHIVE_VERSION = 2
# Fallbacks used by runtime_llm_config when API_BASE_URL / MODEL_NAME are not
# set in the environment.
DEFAULT_API_BASE_URL = "https://api-inference.huggingface.co/v1"
DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-72B-Instruct"
# NOTE(review): not referenced in the visible part of this file; confirm it is
# still used elsewhere.
REQUIRED_ENV_VARS = ("HF_TOKEN",)
# Bounds applied by clamp_strict_score.
STRICT_SCORE_MIN = 0.001
STRICT_SCORE_MAX = 0.999


def clamp_strict_score(value: float) -> float:
    """Return ``value`` as a float limited to [STRICT_SCORE_MIN, STRICT_SCORE_MAX].

    Both bounds lie strictly inside (0, 1), so the result is never exactly
    0 or 1.
    """
    score = float(value)
    # Apply the upper bound first, then the lower bound.
    capped = score if score < STRICT_SCORE_MAX else STRICT_SCORE_MAX
    return capped if capped > STRICT_SCORE_MIN else STRICT_SCORE_MIN


def parse_args() -> argparse.Namespace:
    """Parse the runner's command-line options from ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    default_memory_file = Path(__file__).resolve().parent / "data" / "learning_memory.json"

    # (flag, add_argument keyword options), registered in declaration order.
    option_table = (
        ("--mode", {"choices": ["single", "full"], "default": "full"}),
        ("--task", {"choices": TASK_ORDER, "default": None}),
        ("--level", {"choices": ["low", "medium", "high"], "default": None}),
        ("--seed", {"type": int, "default": None}),
        ("--episodes", {"type": int, "default": 1}),
        ("--train-episodes", {"type": int, "default": 0}),
        ("--train-same-seed", {"action": "store_true"}),
        ("--memory-file", {"default": str(default_memory_file)}),
    )

    cli = argparse.ArgumentParser(description="EmergencyEnv agent runner")
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


def emit_structured(tag: str, payload: dict) -> None:
    """Print ``payload`` as one line of compact, ASCII-only JSON prefixed by ``[tag]``."""
    body = json.dumps(payload, ensure_ascii=True, separators=(",", ":"))
    print(f"[{tag}] {body}")


def runtime_llm_config() -> dict[str, str]:
    """Read the LLM connection settings from the environment.

    Returns:
        A dict with keys ``API_BASE_URL``, ``MODEL_NAME`` and ``HF_TOKEN``.
        Each value is the stripped environment variable, or its default
        (an empty string for ``HF_TOKEN``) when the variable is unset.
    """
    fallbacks = (
        ("API_BASE_URL", DEFAULT_API_BASE_URL),
        ("MODEL_NAME", DEFAULT_MODEL_NAME),
        ("HF_TOKEN", ""),
    )
    return {name: os.getenv(name, fallback).strip() for name, fallback in fallbacks}


def require_llm_config() -> tuple[OpenAIClient, str]:
    """Build the OpenAI-compatible client, or exit if it cannot be configured.

    Returns:
        ``(client, model_name)``.

    Raises:
        SystemExit: if any setting from ``runtime_llm_config`` is empty, or
            if the ``openai`` package could not be imported.
    """
    settings = runtime_llm_config()

    unset = [name for name in settings if not settings[name]]
    if unset:
        message = (
            "Missing required environment variables: "
            + ", ".join(unset)
            + ". Set HF_TOKEN before running inference.py"
        )
        raise SystemExit(message)

    if OpenAI is None:
        raise SystemExit("openai package is required for inference.py LLM rationale generation.")

    llm_client = OpenAI(
        base_url=settings["API_BASE_URL"],
        api_key=settings["HF_TOKEN"],
        timeout=8.0,
    )
    return llm_client, settings["MODEL_NAME"]


def llm_rationale(
    client: Union[OpenAIClient, None],
    model_name: str,
    observation: dict,
    chosen: dict,
    strategy: str,
) -> str:
    """Return a one-sentence rationale for the selected hospital.

    Args:
        client: Chat-completions client, or ``None`` to skip the model call.
        model_name: Model identifier passed to the completion request.
        observation: Current observation; only read via ``.get``.
        chosen: Scored hospital entry; must contain ``hospital_id``,
            ``policy_score``, ``traffic`` and ``icu``.
        strategy: Name of the selection strategy that produced ``chosen``.

    Returns:
        The model's reply with whitespace collapsed and cut to 180 characters,
        or a locally formatted summary when there is no client, the reply is
        empty, or anything in the request path raises.
    """
    # Built up-front so every failure path below can return it.
    default_text = (
        f"Selected {chosen['hospital_id']} by {strategy}; "
        f"score={chosen['policy_score']:.3f}, traffic={chosen['traffic']}, icu={chosen['icu']}"
    )
    if client is None:
        return default_text

    try:
        instructions = (
            "You are an emergency routing agent. Return one short sentence rationale "
            "for the selected hospital. Keep it under 25 words.\n"
        )
        facts = (
            f"task={observation.get('task_id')} difficulty={observation.get('scenario_difficulty')} "
            f"step={observation.get('step')} patient={observation.get('patient_condition')} "
            f"required={observation.get('required_specialization')} "
            f"selected={chosen['hospital_id']} score={chosen['policy_score']:.3f} "
            f"distance={chosen['distance_km']:.1f}km traffic={chosen['traffic']} icu={chosen['icu']} "
            f"strategy={strategy}"
        )
        conversation = [
            {"role": "system", "content": "Generate concise emergency triage rationale."},
            {"role": "user", "content": instructions + facts},
        ]
        response = client.chat.completions.create(
            model=model_name,
            messages=conversation,
            temperature=0.0,
            max_tokens=60,
        )
        reply = (response.choices[0].message.content or "").strip()
        if reply:
            # Collapse all whitespace runs and cap the length.
            return " ".join(reply.split())[:180]
        return default_text
    except Exception:
        # Best effort: the rationale is cosmetic, so any failure falls back.
        return default_text


def normalize_seed(raw_value: int | str) -> int:
    """Turn an integer or text value into a deterministic seed in 1..999_999_999.

    Text that parses as an integer is used as that integer. Any other text is
    reduced to a position-weighted sum of its character code points. A result
    of zero is replaced by the fixed seed 202601.
    """
    if isinstance(raw_value, int):
        number = raw_value
    else:
        cleaned = str(raw_value).strip()
        try:
            number = int(cleaned)
        except ValueError:
            # Non-numeric input: weight each code point by its 1-based position.
            number = sum(pos * ord(char) for pos, char in enumerate(cleaned, start=1))

    return abs(number) % 1_000_000_000 or 202601


def ask_seed_if_missing(seed: int | None) -> int:
    """Return the normalized ``seed``, drawing a random one when it is ``None``."""
    if seed is None:
        # No CLI seed: start a fresh randomized run from the OS entropy source.
        seed = random.SystemRandom().randint(1, 999_999_999)
    return normalize_seed(seed)


def ask_level_if_missing(level: str | None) -> str:
    """Return ``level`` if it is a known level, otherwise a weighted random one.

    The random draw picks from ``RANDOM_LEVELS`` using ``RANDOM_LEVEL_WEIGHTS``,
    so the easy level is never chosen implicitly.
    """
    if level not in LEVEL_TO_TASK:
        (level,) = random.choices(RANDOM_LEVELS, weights=RANDOM_LEVEL_WEIGHTS, k=1)
    return level


def append_trajectory_log(entry: dict) -> None:
    """Append ``entry`` as one JSON line to ``data/trajectory_history.jsonl``.

    The ``data`` directory sits next to this file and is created if missing.
    """
    data_dir = Path(__file__).resolve().parent / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    log_path = data_dir / "trajectory_history.jsonl"
    with log_path.open("a", encoding="utf-8") as log_file:
        record = json.dumps(entry, ensure_ascii=True)
        log_file.write(f"{record}\n")


def load_learning_archive() -> dict:
    """Load the persisted learning archive, falling back to an empty one.

    The archive directory is created if missing. A fresh archive
    (``{"version", "profiles": {}, "episodes": []}``) is returned when the file
    does not exist, cannot be decoded as UTF-8, is not valid JSON, or does not
    hold a JSON object.

    When the stored version differs from ``LEARNING_ARCHIVE_VERSION`` the
    profiles are discarded and only the last 500 episodes are kept.

    Returns:
        A dict whose ``"profiles"`` value is a dict and whose ``"episodes"``
        value is a list.
    """
    LEARNING_ARCHIVE_PATH.parent.mkdir(parents=True, exist_ok=True)
    fresh: dict = {"version": LEARNING_ARCHIVE_VERSION, "profiles": {}, "episodes": []}
    if not LEARNING_ARCHIVE_PATH.exists():
        return fresh

    try:
        payload_text = LEARNING_ARCHIVE_PATH.read_text(encoding="utf-8-sig").strip()
        payload = json.loads(payload_text) if payload_text else {}
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Corrupt bytes are treated like corrupt JSON: start over instead of
        # crashing the runner.
        return fresh

    if not isinstance(payload, dict):
        return fresh

    # Validate the container types on every path, not only on version mismatch.
    episodes = payload.get("episodes")
    if not isinstance(episodes, list):
        episodes = []

    if payload.get("version") != LEARNING_ARCHIVE_VERSION:
        fresh["episodes"] = episodes[-500:]
        return fresh

    if not isinstance(payload.get("profiles"), dict):
        payload["profiles"] = {}
    payload["episodes"] = episodes
    return payload


def save_learning_archive(archive: dict) -> None:
    """Write ``archive`` to ``LEARNING_ARCHIVE_PATH`` as indented ASCII JSON.

    The parent directory is created if it does not exist.
    """
    target = LEARNING_ARCHIVE_PATH
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(archive, indent=2, ensure_ascii=True)
    target.write_text(serialized, encoding="utf-8")


def profile_key(seed: int, task_id: str) -> str:
    """Return the archive lookup key for a seed/task pair, as ``"<seed>|<task_id>"``."""
    return "{}|{}".format(seed, task_id)


def _merge_step_stats(primary: dict, secondary: dict) -> dict:
    """Combine two ``{step: {hospital_id: stats}}`` mappings into a new one.

    Counters and ``total_reward`` are summed. ``avg_reward`` and
    ``success_rate`` (accepted / count) are recomputed from the sums, using a
    denominator of at least 1. ``last_status`` and ``last_reason`` take the
    value from ``primary`` when it is truthy, otherwise from ``secondary``.
    """
    combined: dict = {}
    for step in {*primary.keys(), *secondary.keys()}:
        left_step = primary.get(step, {})
        right_step = secondary.get(step, {})
        per_hospital: dict = {}
        for hospital in {*left_step.keys(), *right_step.keys()}:
            left = left_step.get(hospital, {})
            right = right_step.get(hospital, {})

            visits = int(left.get("count", 0)) + int(right.get("count", 0))
            accepted_sum = int(left.get("accepted", 0)) + int(right.get("accepted", 0))
            partial_sum = int(left.get("partial", 0)) + int(right.get("partial", 0))
            rejected_sum = int(left.get("rejected", 0)) + int(right.get("rejected", 0))
            reward_sum = float(left.get("total_reward", 0.0)) + float(right.get("total_reward", 0.0))
            success_sum = int(left.get("success", 0)) + int(right.get("success", 0))
            # Guard against division by zero for hospitals with no visits.
            denominator = max(1, visits)

            per_hospital[hospital] = {
                "count": visits,
                "success": success_sum,
                "accepted": accepted_sum,
                "partial": partial_sum,
                "rejected": rejected_sum,
                "total_reward": reward_sum,
                "avg_reward": reward_sum / denominator,
                "success_rate": accepted_sum / denominator,
                "last_status": left.get("last_status") or right.get("last_status"),
                "last_reason": left.get("last_reason") or right.get("last_reason"),
            }
        combined[step] = per_hospital
    return combined


def build_learning_profile(
    archive: dict,
    seed: int,
    task_id: str,
    required_specialization: str | None = None,
) -> dict | None:
    """Return the stored learning profile for exactly this seed and task.

    Args:
        archive: Learning archive; its ``"profiles"`` mapping is consulted.
        seed: Seed part of the profile key.
        task_id: Task part of the profile key.
        required_specialization: Accepted but not used by this function.

    Returns:
        A dict with normalized ``attempts``, ``best_score``, ``best_actions``,
        ``step_stats`` and scenario names, tagged ``"source": "exact-only"``;
        or ``None`` when no non-empty profile is stored under that key.
    """
    stored = archive.get("profiles", {}).get(profile_key(seed, task_id))
    if not stored:
        return None

    # Strict scope: only the same seed + task pair contributes, no blending.
    profile = {
        "attempts": int(stored.get("attempts", 0)),
        "best_score": float(stored.get("best_score", 0.0)),
        "best_actions": list(stored.get("best_actions", [])),
        "step_stats": stored.get("step_stats", {}),
        "best_scenario_name": stored.get("best_scenario_name"),
        "last_scenario_name": stored.get("last_scenario_name"),
        "source": "exact-only",
    }
    return profile


def _difficulty_policy_params(difficulty: str) -> tuple[float, float]:
    """Return the ``(epsilon, temperature)`` pair for a difficulty label.

    Any label other than ``"easy"`` or ``"medium"`` gets the hardest setting.
    """
    known_levels = (
        ("easy", (0.07, 0.18)),
        ("medium", (0.16, 0.32)),
    )
    for label, params in known_levels:
        if difficulty == label:
            return params
    return 0.26, 0.44


def _sample_softmax(candidates: list[dict], key: str, temperature: float, rng: random.Random) -> dict:
    """Draw one candidate with probability proportional to ``exp(item[key] / temperature)``.

    The temperature is floored at 1e-6, and the largest logit is subtracted
    before exponentiating so the exponentials cannot overflow. Exactly one
    value is drawn from ``rng``.
    """
    scale = max(temperature, 1e-6)
    scaled = [entry[key] / scale for entry in candidates]
    peak = max(scaled)
    weights = [math.exp(value - peak) for value in scaled]
    norm = sum(weights)
    shares = [weight / norm for weight in weights]

    draw = rng.random()
    running = 0.0
    for position, share in enumerate(shares):
        running += share
        if draw <= running:
            return candidates[position]
    # Rounding can leave the cumulative sum just below the draw.
    return candidates[-1]


def memory_score_for_hospital(
    hospital_id: str,
    memory_snapshot: dict,
    learning_profile: dict | None = None,
    step_number: int | None = None,
) -> float:
    """Score a hospital in [0, 1] from remembered outcomes.

    Args:
        hospital_id: Hospital to score.
        memory_snapshot: Per-hospital records. Reads ``accepted`` (or
            ``success``), ``rejected`` (or ``fail``) and ``avg``.
        learning_profile: Optional profile whose
            ``step_stats[str(step_number)][hospital_id]`` refines the score.
        step_number: Step used to look up the per-step stats.

    Returns:
        0.5 when nothing is known about the hospital. Otherwise
        ``0.6 * success_rate + 0.4 * avg``, plus a per-step bonus capped at
        0.20, minus 0.3 if the per-step ``last_status`` is REJECTED, clamped
        to [0, 1].
    """
    record = memory_snapshot.get(hospital_id)
    if not record:
        return 0.5

    wins = int(record.get("accepted", record.get("success", 0)))
    losses = int(record.get("rejected", record.get("fail", 0)))
    mean_reward = float(record.get("avg", 0.0))
    outcomes = wins + losses
    if outcomes <= 0:
        return 0.5

    # Reliability-first weighting: success rate counts more than mean reward.
    score = (0.6 * (wins / outcomes)) + (0.4 * mean_reward)

    per_step = None
    if learning_profile and step_number is not None:
        stats_for_step = learning_profile.get("step_stats", {}).get(str(step_number), {})
        per_step = stats_for_step.get(hospital_id)

    if per_step:
        reward_at_step = float(per_step.get("avg_reward", 0.0))
        success_at_step = float(per_step.get("success_rate", 0.0))
        visits_at_step = int(per_step.get("count", 0))
        score += min(0.20, (reward_at_step * 0.10) + (success_at_step * 0.08) + min(visits_at_step, 5) * 0.01)
        # A rejection on the last visit at this step outweighs the bonus.
        if str(per_step.get("last_status", "")).upper() == "REJECTED":
            score -= 0.3

    return max(0.0, min(1.0, score))


def score_hospitals(observation: dict, learning_profile: dict | None = None) -> list[dict]:
    """Score every hospital in ``observation`` and return them best-first.

    Each hospital first gets a raw score: weighted ICU, distance, traffic and
    memory terms plus specialization, urgency and learned-route bonuses, minus
    rejection/revisit/traffic/specialization penalties. The raw score is then
    adjusted by multiplicative and subtractive risk rules and blended with the
    memory score. Afterwards the whole list is rescaled in several passes:
    min-max normalization, zero replacement, sum normalization, a critical-case
    specialization re-weighting, and a final ``x / (1 + |x|)`` squash with a
    small floor.

    Args:
        observation: Environment observation. ``critical_time_limit_minutes``
            is always indexed directly; ``required_specialization`` is indexed
            directly once per hospital. Other fields are read with ``.get``
            (``hospitals``, ``failed_hospitals``, ``recent_failed_hospitals``,
            ``visited_hospitals``, ``memory_snapshot``, ``previous_action``,
            ``last_arrival_outcome``, ``patient_condition``, ``scenario_name``,
            ``step``, ``scenario_difficulty``, ``seed``, time fields).
        learning_profile: Optional profile dict; reads ``attempts``,
            ``best_actions``, ``best_scenario_name``, ``last_scenario_name``
            and (through ``memory_score_for_hospital``) ``step_stats``.

    Returns:
        A list of dicts sorted by descending ``policy_score``, each with keys
        ``hospital_id``, ``icu``, ``distance_km``, ``traffic``,
        ``specialization``, ``travel_time``, ``memory_score``,
        ``policy_score`` and ``specialization_match``. The final
        ``policy_score`` values are squashed individually and therefore do not
        sum to 1. An empty list is returned when there are no hospitals.

    Raises:
        KeyError: if ``critical_time_limit_minutes`` is missing, if a hospital
            lacks one of the directly indexed fields, or if a hospital's
            ``traffic`` value is not a key of ``TRAFFIC_FACTOR``.
    """
    failed = set(observation.get("failed_hospitals", []))
    recent_failed = set(observation.get("recent_failed_hospitals", []))
    visited = set(observation.get("visited_hospitals", []))
    memory_snapshot = observation.get("memory_snapshot", {})
    previous_action = observation.get("previous_action")
    last_arrival = observation.get("last_arrival_outcome") or {}
    last_status = str(last_arrival.get("status", "")).lower()

    scored: list[dict] = []
    # Both defaults index the observation eagerly, so the key must be present.
    initial_limit = float(observation.get("initial_critical_time_limit_minutes", observation["critical_time_limit_minutes"]))
    remaining_time = float(observation.get("remaining_time_minutes", observation["critical_time_limit_minutes"]))
    # Urgency is 0 while all time remains and rises to 1 as the time runs out.
    urgency = 1.0 - min(1.0, max(0.0, remaining_time / max(initial_limit, 1e-6)))

    patient_condition = observation.get("patient_condition", "").lower()
    critical_patient = patient_condition in {"critical", "unstable"}
    required_specialization = str(observation.get("required_specialization", ""))
    scenario_name = str(observation.get("scenario_name", ""))
    step_number = int(observation.get("step", 1))
    difficulty = str(observation.get("scenario_difficulty", "medium"))
    attempts = int(learning_profile.get("attempts", 0)) if learning_profile else 0
    # Best action sequence from earlier attempts; position i is step i + 1.
    preferred_route = []
    if learning_profile:
        preferred_route = list(learning_profile.get("best_actions", []))

    for hospital in observation.get("hospitals", []):
        # Travel time: distance over traffic-adjusted speed, times 60
        # (minutes, presuming distance_km is in kilometres).
        traffic_factor = TRAFFIC_FACTOR[hospital["traffic"]]
        speed_kmh = BASE_SPEED_KMH * traffic_factor
        travel_time = (hospital["distance_km"] / max(speed_kmh, 1e-6)) * 60.0

        # Distance score falls linearly from 1 at 0 km to 0 at 20 km and beyond.
        distance_score = max(0.0, min(1.0, 1.0 - hospital["distance_km"] / 20.0))
        icu_score = 1.0 if hospital["icu"] == "available" else 0.55
        mem_score = memory_score_for_hospital(
            hospital["hospital_id"],
            memory_snapshot,
            learning_profile=learning_profile,
            step_number=step_number,
        )

        # Halve the memory score when the profile was learned on a scenario
        # with a different name than the current one.
        memory_scenario = ""
        if learning_profile:
            memory_scenario = str(
                learning_profile.get("best_scenario_name")
                or learning_profile.get("last_scenario_name")
                or ""
            )
        if memory_scenario and scenario_name and memory_scenario != scenario_name:
            mem_score *= 0.5

        # Lenient match: exact specialization, a "general" hospital, or a
        # "general" requirement all count.
        spec_match = (
            hospital["specialization"] == observation["required_specialization"]
            or hospital["specialization"] == "general"
            or observation["required_specialization"] == "general"
        )
        exact_spec_match = hospital["specialization"] == observation["required_specialization"]
        general_fallback = (
            hospital["specialization"] == "general"
            and observation["required_specialization"] != "general"
        )

        rejected_penalty = 0.40 if hospital["hospital_id"] in failed else 0.0
        revisit_penalty = 0.14 if hospital["hospital_id"] in visited else 0.0
        # Discourage going straight back to a hospital that only partially
        # accepted on the previous step.
        partial_repeat_penalty = (
            0.32
            if last_status == "partial" and hospital["hospital_id"] == previous_action
            else 0.0
        )
        # NOTE(review): the else branch subtracts 0.03 from every other
        # hospital, including those with known ICU status; confirm intended.
        critical_unknown_penalty = 0.17 if critical_patient and hospital["icu"] == "unknown" else 0.03
        traffic_penalty = 0.10 if hospital["traffic"] == "high" else 0.04 if hospital["traffic"] == "medium" else 0.0
        if critical_patient and general_fallback:
            # Penalty grows with difficulty, and again after five attempts.
            spec_penalty = {"easy": 0.08, "medium": 0.16, "hard": 0.26}.get(difficulty, 0.16)
            if attempts >= 5:
                spec_penalty += 0.06
        else:
            spec_penalty = 0.0
        spec_bonus = 0.16 if exact_spec_match else (0.08 if spec_match else 0.0)
        urgency_boost = urgency * (0.18 + max(0.0, 0.25 - travel_time / 100.0))
        # Bonus when this hospital is the learned best action for this step.
        step_route_bonus = 0.0
        if step_number - 1 < len(preferred_route) and preferred_route[step_number - 1] == hospital["hospital_id"]:
            step_route_bonus = 0.16

        score = (
            (icu_score * 0.30)
            + (distance_score * 0.18)
            + (traffic_factor * 0.14)
            + (mem_score * 0.24)
            + spec_bonus
            + urgency_boost
            + step_route_bonus
            - rejected_penalty
            - revisit_penalty
            - partial_repeat_penalty
            - spec_penalty
            - critical_unknown_penalty
            - traffic_penalty
        )

        # Nearly eliminate the hospital that rejected the patient last step.
        if hospital["hospital_id"] == previous_action and last_status == "rejected":
            score *= 0.01

        if hospital["hospital_id"] in recent_failed:
            score *= 0.2

        # Exact-match test against the str()-coerced requirement.
        # NOTE(review): a "general" hospital fails this test even though
        # spec_match accepts it; confirm this asymmetry is intended.
        if hospital["specialization"] != required_specialization:
            if patient_condition == "critical":
                score *= 0.15
            else:
                score *= 0.4
        elif patient_condition == "critical":
            score *= 1.5

        # Hard realism penalties to align policy scoring with validator outcomes.
        if hospital["specialization"] != required_specialization:
            score -= 0.6
        if critical_patient and hospital["icu"] == "unknown":
            score -= 0.5
        if critical_patient and hospital["traffic"] == "high":
            score -= 0.3

        # Confidence-style risk multiplier keeps risky options from looking deceptively good.
        # NOTE(review): if score is already negative here, multiplying by a
        # factor below 1 moves it towards zero; confirm intended.
        risk_factor = 1.0
        if hospital["icu"] == "unknown":
            risk_factor *= 0.6
        if not spec_match:
            risk_factor *= 0.5
        if critical_patient and hospital["traffic"] == "high":
            risk_factor *= 0.7
        score *= risk_factor

        # Reduce memory dominance in final decision score.
        # Memory gets 15% of the blend, or 10% on the first step.
        memory_weight = 0.15
        current_score_weight = 0.85
        if step_number == 1:
            memory_weight = 0.1
            current_score_weight = 0.9
        base_current_score = score
        confidence_score = max(0.0, min(1.0, base_current_score))
        # Ignore memory entirely when the current evidence is weak: below 0.6
        # for hospitals on the learned route, below 0.2 for any hospital.
        effective_memory_score = mem_score
        in_best_route = hospital["hospital_id"] in preferred_route
        if in_best_route and confidence_score < 0.6:
            effective_memory_score = 0.0
        if confidence_score < 0.2:
            effective_memory_score = 0.0

        score = (current_score_weight * base_current_score) + (memory_weight * effective_memory_score)

        scored.append(
            {
                "hospital_id": hospital["hospital_id"],
                "icu": hospital["icu"],
                "distance_km": hospital["distance_km"],
                "traffic": hospital["traffic"],
                "specialization": hospital["specialization"],
                "travel_time": travel_time,
                "memory_score": mem_score,
                "policy_score": max(0.0, min(1.0, score)),
                "specialization_match": spec_match,
                # Internal fallback ranking, used only when every policy score
                # is zero; removed before returning.
                "tie_break_score": (
                    (distance_score * 0.35)
                    + (traffic_factor * 0.35)
                    + (icu_score * 0.20)
                    + (0.10 if spec_match else 0.0)
                ),
            }
        )

    scored.sort(key=lambda item: item["policy_score"], reverse=True)
    if scored:
        # Pass 1: rescale scores relative to each other. In every branch,
        # normalized values below 0.2 are shrunk by a factor in [0.3, 0.7]
        # drawn from an RNG seeded by the observation seed, the step and the
        # hospital id, so the result is reproducible.
        min_score = min(item["policy_score"] for item in scored)
        max_score = max(item["policy_score"] for item in scored)
        spread = max_score - min_score
        if spread > 1e-9:
            # Scores differ: min-max normalize.
            for item in scored:
                normalized = (item["policy_score"] - min_score) / (spread + 1e-6)
                if normalized < 0.2:
                    jitter_seed = (
                        int(observation.get("seed", 0))
                        + (step_number * 131)
                        + sum(ord(ch) for ch in item["hospital_id"])
                    )
                    jitter_rng = random.Random(jitter_seed)
                    normalized *= jitter_rng.uniform(0.3, 0.7)
                item["policy_score"] = max(0.0, min(1.0, normalized))
        elif max_score > 0:
            # All scores are (nearly) equal and positive: divide by the
            # maximum, which makes each of them about 1.0.
            for item in scored:
                normalized = item["policy_score"] / max_score
                if normalized < 0.2:
                    jitter_seed = (
                        int(observation.get("seed", 0))
                        + (step_number * 131)
                        + sum(ord(ch) for ch in item["hospital_id"])
                    )
                    jitter_rng = random.Random(jitter_seed)
                    normalized *= jitter_rng.uniform(0.3, 0.7)
                item["policy_score"] = max(0.0, min(1.0, normalized))
        else:
            # Every score is zero: rank by the tie-break score instead.
            tie_min = min(item.get("tie_break_score", 0.0) for item in scored)
            tie_max = max(item.get("tie_break_score", 0.0) for item in scored)
            tie_spread = tie_max - tie_min
            if tie_spread > 1e-9:
                for item in scored:
                    normalized = (item.get("tie_break_score", 0.0) - tie_min) / (tie_spread + 1e-6)
                    if normalized < 0.2:
                        jitter_seed = (
                            int(observation.get("seed", 0))
                            + (step_number * 131)
                            + sum(ord(ch) for ch in item["hospital_id"])
                        )
                        jitter_rng = random.Random(jitter_seed)
                        normalized *= jitter_rng.uniform(0.3, 0.7)
                    item["policy_score"] = max(0.0, min(1.0, normalized))
            else:
                for item in scored:
                    item["policy_score"] = 0.0

        # Remove hard-zero scores and normalize to probability-like values.
        # Replacement ranges: exact-specialization hospitals in critical
        # non-general cases get 0.08-0.18, other hospitals in those cases get
        # 0.001-0.01, and everything else gets 0.05-0.15.
        for item in scored:
            if item["policy_score"] <= 0.0:
                jitter_seed = (
                    int(observation.get("seed", 0))
                    + (step_number * 173)
                    + sum(ord(ch) for ch in item["hospital_id"])
                )
                jitter_rng = random.Random(jitter_seed)
                if critical_patient and required_specialization != "general":
                    if item.get("specialization") == required_specialization:
                        item["policy_score"] = jitter_rng.uniform(0.08, 0.18)
                    else:
                        item["policy_score"] = jitter_rng.uniform(0.001, 0.01)
                else:
                    item["policy_score"] = jitter_rng.uniform(0.05, 0.15)

        # Divide by the total so the scores sum to roughly 1.
        # NOTE(review): the uniform fallback looks unreachable, because every
        # zero score was replaced by a positive value above; confirm.
        total_score = sum(item["policy_score"] for item in scored)
        if total_score > 0:
            for item in scored:
                item["policy_score"] = item["policy_score"] / (total_score + 1e-6)
        else:
            uniform = 1.0 / len(scored)
            for item in scored:
                item["policy_score"] = uniform

        # Final clinical-priority pass: in critical non-general cases,
        # exact specialization should dominate unless unavailable.
        if critical_patient and required_specialization != "general":
            for item in scored:
                if item.get("specialization") == required_specialization:
                    item["policy_score"] *= 1.5
                else:
                    item["policy_score"] *= 0.15

            # Renormalize after the re-weighting.
            boosted_total = sum(item["policy_score"] for item in scored)
            if boosted_total > 0:
                for item in scored:
                    item["policy_score"] = item["policy_score"] / boosted_total

        # Squash each score with x / (1 + |x|). After this the scores no
        # longer sum to 1.
        for item in scored:
            raw_score = float(item["policy_score"])
            normalized_score = raw_score / (1.0 + abs(raw_score))
            # Keep a small floor so no action is fully eliminated from exploration.
            if normalized_score < 0.01:
                jitter_seed = (
                    int(observation.get("seed", 0))
                    + (step_number * 211)
                    + sum(ord(ch) for ch in item["hospital_id"])
                )
                jitter_rng = random.Random(jitter_seed)
                normalized_score = jitter_rng.uniform(0.01, 0.03)
            item["policy_score"] = normalized_score

        # Re-sort, since the rescaling passes can change the order.
        scored.sort(key=lambda item: item["policy_score"], reverse=True)

    # Drop the internal tie-break field from the returned entries.
    for item in scored:
        item.pop("tie_break_score", None)
    return scored


def choose_hospital(
    scored: list[dict],
    observation: dict,
    rng: random.Random,
    learning_profile: dict | None = None,
) -> tuple[dict, str]:
    difficulty = observation.get("scenario_difficulty", "medium")
    epsilon, temperature = _difficulty_policy_params(difficulty)

    failed = set(observation.get("failed_hospitals", []))
    recent_failed = set(observation.get("recent_failed_hospitals", []))
    visited = set(observation.get("visited_hospitals", []))
    previous_action = observation.get("previous_action")
    selected_hospital_id = observation.get("selected_hospital_id")
    visited_sequence = observation.get("visited_hospitals", []) or []
    recent_hospital = previous_action or selected_hospital_id or (visited_sequence[-1] if visited_sequence else None)
    last_arrival = observation.get("last_arrival_outcome") or {}
    last_status = str(last_arrival.get("status", "")).lower()
    last_reason = str(last_arrival.get("reason", "")).lower()
    is_rerouting_phase = str(observation.get("ambulance_status", "")).lower() == "rerouting"

    # Cooldown logic: avoid recently failed hospitals first, then avoid visited when alternatives exist.
    candidates = [
        item
        for item in scored
        if item["hospital_id"] not in recent_failed and item["hospital_id"] not in visited
    ]
    if not candidates:
        candidates = [item for item in scored if item["hospital_id"] not in recent_failed]
    if not candidates:
        # Last-resort fallback: if every hospital has failed already, avoid immediate retry.
        candidates = list(scored)
        if (last_status == "rejected" or is_rerouting_phase) and recent_hospital:
            redirected = [item for item in candidates if item["hospital_id"] != recent_hospital]
            if redirected:
                candidates = redirected

    step_number = int(observation.get("step", 1))
    attempts = int(learning_profile.get("attempts", 0)) if learning_profile else 0
    required_specialization = str(observation.get("required_specialization", ""))
    critical_patient = observation.get("patient_condition", "").lower() in {"critical", "unstable"}

    # Hard realism rule: never immediately retry the hospital that just rejected the patient.
    if (last_status == "rejected" or is_rerouting_phase) and recent_hospital:
        immediate_retry_block = [item for item in candidates if item["hospital_id"] != recent_hospital]
        if immediate_retry_block:
            candidates = immediate_retry_block
        elif len(candidates) == 1 and candidates[0]["hospital_id"] == recent_hospital:
            fallback_any = [item for item in scored if item["hospital_id"] != recent_hospital]
            if fallback_any:
                candidates = fallback_any

    # In critical non-general cases, prioritize exact specialization when available.
    if critical_patient and required_specialization != "general":
        exact_spec_candidates = [
            item for item in candidates if item["specialization"] == required_specialization
        ]
        if exact_spec_candidates:
            candidates = exact_spec_candidates

    if step_number == 1:
        policy_mode = "safe"
    elif last_status == "rejected":
        policy_mode = "risk-aware"
    else:
        policy_mode = "balanced"

    safe_weight = 1.0
    if policy_mode == "safe":
        safe_weight *= 0.8
        epsilon *= 0.6
        temperature *= 0.8
    elif policy_mode == "risk-aware":
        epsilon *= 1.1
        temperature *= 0.9

    # Within-episode learning from concrete failure reasons.
    if "wrong hospital specialization" in last_reason:
        strict_spec = [
            item
            for item in candidates
            if item["specialization"] == observation.get("required_specialization")
        ]
        if strict_spec:
            candidates = strict_spec
    if "icu unavailable" in last_reason:
        icu_known = [item for item in candidates if item["icu"] == "available"]
        if icu_known:
            candidates = icu_known
    if "specialist" in last_reason:
        strict_spec = [
            item
            for item in candidates
            if item["specialization"] == observation.get("required_specialization")
        ]
        if strict_spec:
            candidates = strict_spec
    if "overloaded" in last_reason:
        non_high_traffic = [item for item in candidates if item["traffic"] != "high"]
        if non_high_traffic:
            candidates = non_high_traffic
    if "delay" in last_reason:
        candidates = sorted(candidates, key=lambda item: item["distance_km"])

    def learned_utility(item: dict) -> float:
        base = float(item.get("policy_score", 0.0))
        if not learning_profile:
            return base
        step_stats = learning_profile.get("step_stats", {}).get(str(step_number), {})
        stats = step_stats.get(item["hospital_id"], {})
        count = int(stats.get("count", 0))
        if count <= 0:
            exploration_bonus = 0.22 * math.sqrt(max(1.0, math.log(attempts + 2.0)))
            return base + exploration_bonus
        avg_reward = float(stats.get("avg_reward", 0.0))
        success_rate = float(stats.get("success_rate", 0.0))
        rejected = int(stats.get("rejected", 0))
        rejection_rate = rejected / max(1, count)
        exploration_bonus = 0.18 * math.sqrt(max(0.0, math.log(attempts + 2.0) / (count + 1.0)))
        # Real-data utility: reward trend + success rate - rejection risk + exploration bonus.
        historical_weight = 0.35
        historical_weight *= 0.6
        historical_bonus = (avg_reward * historical_weight) + (success_rate * 0.30) - (rejection_rate * 0.22)
        if item["hospital_id"] in recent_failed:
            historical_bonus = 0.0
        return base + historical_bonus + exploration_bonus

    def pick_improvement_candidate(route_choice_id: str | None) -> dict | None:
        if not candidates:
            return None
        ranked = sorted(candidates, key=learned_utility, reverse=True)
        if route_choice_id is None:
            return ranked[0]
        for item in ranked:
            if item["hospital_id"] != route_choice_id:
                return item
        return ranked[0]

    def enforce_score_guard(chosen: dict, strategy: str) -> tuple[dict, str]:
        """Apply last-line safety checks to a tentative pick.

        Returns ``(hospital, strategy_label)``. When a guard fires, the returned
        hospital replaces ``chosen`` and a suffix naming the guard is appended
        to ``strategy``; otherwise both are returned unchanged.

        Guards, in the order they are evaluated:

        1. Immediate-retry block: after a ``"rejected"`` status, never return
           ``previous_action`` again if any other scored hospital exists.
        2. Cooldown reroute: while the last status is ``"rejected"`` or
           ``is_rerouting_phase`` is truthy, move away from ``recent_hospital``,
           preferring hospitals that are not in ``recent_failed``.
        3. Anti-stupidity guard: if the pick's ``policy_score`` is below 60% of
           the best eligible score, return the best eligible hospital.
        4. Score-gap guard: if the best eligible score exceeds the pick's by
           more than 0.25, switch to the best one when ``rng.random() < 0.75``.

        NOTE(review): ``scored``, ``last_status``, ``previous_action``,
        ``recent_failed``, ``recent_hospital``, ``is_rerouting_phase`` and
        ``rng`` come from the enclosing function and are defined outside this
        block — confirm their exact meaning there.
        """
        # Absolute next-step guard: never pick the same hospital immediately after a rejection.
        if last_status == "rejected" and previous_action and chosen.get("hospital_id") == previous_action:
            alternatives = [item for item in scored if item["hospital_id"] != previous_action]
            if alternatives:
                rerouted = max(alternatives, key=lambda item: float(item.get("policy_score", 0.0)))
                return rerouted, strategy + " + immediate-retry block"

        # Global guardrail: when a score gap is very large, prefer best option most
        # of the time while preserving some exploration.
        # The eligible pool excludes recently failed hospitals and, during a
        # rejection/rerouting cooldown, the most recent hospital as well.
        globally_eligible = [
            item
            for item in scored
            if item["hospital_id"] not in recent_failed
            and not (
                (last_status == "rejected" or is_rerouting_phase)
                and recent_hospital
                and item["hospital_id"] == recent_hospital
            )
        ]
        # If the filters removed everything, fall back to the unfiltered list.
        if not globally_eligible:
            globally_eligible = list(scored)

        # Still empty only when ``scored`` itself is empty; then no guard can apply.
        if globally_eligible:
            best_global = max(globally_eligible, key=lambda item: float(item.get("policy_score", 0.0)))
            chosen_score = float(chosen.get("policy_score", 0.0))
            best_global_score = float(best_global.get("policy_score", 0.0))
            # Cooldown hard guard: never immediately retry the just-failed hospital.
            if (last_status == "rejected" or is_rerouting_phase) and recent_hospital:
                if chosen.get("hospital_id") == recent_hospital:
                    alternatives = [
                        item
                        for item in scored
                        if item["hospital_id"] != recent_hospital and item["hospital_id"] not in recent_failed
                    ]
                    # Relax the "not recently failed" requirement rather than retry the same hospital.
                    if not alternatives:
                        alternatives = [item for item in scored if item["hospital_id"] != recent_hospital]
                    if alternatives:
                        rerouted = max(alternatives, key=lambda item: float(item.get("policy_score", 0.0)))
                        return rerouted, strategy + " + cooldown reroute"

            # Deterministic override: the pick scores under 60% of the best eligible score.
            if chosen_score < (best_global_score * 0.6):
                return best_global, strategy + " + anti-stupidity guard"
            # Probabilistic override: the random draw only happens when the gap
            # test passes (short-circuit), so RNG consumption depends on the scores.
            if (best_global_score - chosen_score) > 0.25 and rng.random() < 0.75:
                return best_global, strategy + " + score-gap guard"

        return chosen, strategy

    # Learning-driven fail guard: avoid hospitals that repeatedly fail at this exact step.
    if learning_profile:
        step_stats = learning_profile.get("step_stats", {}).get(str(step_number), {})
        guard_blocked: set[str] = set()
        for hospital_id, stats in step_stats.items():
            count = int(stats.get("count", 0))
            success_rate = float(stats.get("success_rate", 0.0))
            rejected = int(stats.get("rejected", 0))
            if count >= 2 and success_rate <= 0.0 and rejected >= 2:
                guard_blocked.add(hospital_id)

        guarded_candidates = [item for item in candidates if item["hospital_id"] not in guard_blocked]
        if guarded_candidates:
            candidates = guarded_candidates

    # As attempts increase, reduce randomness and rely on learned utility.
    if attempts >= 3:
        epsilon *= 0.35
        temperature *= 0.70

    # Same seed + same task policy:
    # evaluate route combinations across all steps, not just one-step mutations.
    if learning_profile and policy_mode != "risk-aware":
        best_route = list(learning_profile.get("best_actions", []))
        if step_number - 1 < len(best_route):
            baseline_id = best_route[step_number - 1]
            ranked = sorted(candidates, key=learned_utility, reverse=True)
            baseline_candidate = next((item for item in ranked if item["hospital_id"] == baseline_id), None)
            alternatives = [item for item in ranked if item["hospital_id"] != baseline_id]
            top_candidate = ranked[0] if ranked else None

            if (
                step_number == 1
                and baseline_candidate is not None
                and top_candidate is not None
                and float(baseline_candidate.get("policy_score", 0.0)) < float(top_candidate.get("policy_score", 0.0))
            ):
                baseline_candidate = None

            alternatives = alternatives[: min(3, len(alternatives))]

            if attempts >= 1:
                # Mixed-radix route search: each run selects a step-wise digit.
                # digit 0 => keep baseline for this step, 1/2 => try alternative ranks.
                combo_index = max(0, attempts - 1)
                digit = (combo_index // (3 ** max(0, step_number - 1))) % 3

                if digit == 0 and baseline_candidate is not None:
                    return enforce_score_guard(baseline_candidate, "best-route retain")

                alt_rank = digit - 1
                if alt_rank >= 0 and alt_rank < len(alternatives):
                    return enforce_score_guard(alternatives[alt_rank], f"combination search step-{step_number} alt-{alt_rank + 1}")

                if baseline_candidate is not None:
                    return enforce_score_guard(baseline_candidate, "best-route retain")

    if attempts >= 6:
        ranked = sorted(candidates, key=learned_utility, reverse=True)
        top_pool = ranked[: min(3, len(ranked))]
        return enforce_score_guard(_sample_softmax(top_pool, "policy_score", max(0.08, temperature * 0.85), rng), "learned utility exploit")

    if learning_profile and policy_mode == "safe":
        preferred_route = list(learning_profile.get("best_actions", []))
        if step_number - 1 < len(preferred_route):
            preferred_hospital = preferred_route[step_number - 1]
            preferred_candidate = next((item for item in candidates if item["hospital_id"] == preferred_hospital), None)
            if preferred_candidate is not None:
                profile_score = float(learning_profile.get("best_score", 0.0))
                if (profile_score * safe_weight) >= 0.85 or len(candidates) == 1:
                    return enforce_score_guard(preferred_candidate, "learned best path")

    # If last outcome was partial, force trying a different hospital when possible.
    if last_status == "partial" and previous_action:
        redirected = [item for item in candidates if item["hospital_id"] != previous_action]
        if redirected:
            candidates = redirected
        # After partial treatment, reduce random exploration and favor safer follow-up routing.
        epsilon = min(epsilon, 0.04)
        temperature = min(temperature, 0.24)

    critical = observation.get("patient_condition", "").lower() in {"critical", "unstable"}
    strategy = f"{policy_mode} policy"

    if critical and policy_mode in {"safe", "balanced"}:
        confirmed = [item for item in candidates if item["icu"] == "available"]
        if confirmed:
            candidates = confirmed
            strategy = f"{policy_mode} policy + critical triage"

    if len(candidates) > 1 and rng.random() < 0.15:
        ranked = sorted(candidates, key=learned_utility, reverse=True)
        top_k = ranked[: min(3, len(ranked))]
        return enforce_score_guard(rng.choice(top_k), strategy + " + guided-exploration")

    if len(candidates) > 1:
        # Utility-aware candidate ordering for softmax sampling.
        ranked = sorted(candidates, key=learned_utility, reverse=True)
        chosen = _sample_softmax(ranked, "policy_score", temperature, rng)
        return enforce_score_guard(chosen, strategy)

    return enforce_score_guard(candidates[0], strategy)


def print_options(scored: list[dict]) -> None:
    """Print the scored hospital options as a numbered list, one per line.

    Every entry must provide ``hospital_id``, ``distance_km``, ``icu``,
    ``traffic``, ``specialization`` and ``policy_score``. Numbering starts
    at 1 and follows the order of ``scored``.
    """
    print(f"Hospital options ({len(scored)} total):")

    # Lazy generator: each row is formatted right before it is printed.
    rows = (
        f"  [{rank}] {option['hospital_id']} | {option['distance_km']:.1f} km | ICU {option['icu']} | "
        f"traffic {option['traffic']} | specialty {option['specialization']} | score {option['policy_score']:.3f}"
        for rank, option in enumerate(scored, start=1)
    )
    for row in rows:
        print(row)


def run_episode(
    env: EmergencyEnv,
    task_id: str,
    seed: int,
    archive: dict | None = None,
    llm_client: object | None = None,
    model_name: str | None = None,
) -> dict:
    """Run one episode of ``task_id`` to completion and return a result summary.

    The environment is reset with ``seed`` and ``task_id``; then, until the
    environment reports ``done``, each iteration scores the hospital options,
    picks one, asks ``llm_rationale`` for a rationale, steps the environment,
    prints the outcome and appends a trajectory log entry.

    Args:
        env: Environment to drive; it is reset at the start of the call.
        task_id: Task identifier passed to ``env.reset``.
        seed: Scenario seed passed to ``env.reset``; also the base of the
            policy RNG seed.
        archive: Optional learning archive. When given, a learning profile is
            built from it and passed to scoring and hospital selection.
        llm_client: Optional client forwarded to ``llm_rationale``.
        model_name: Model name forwarded to ``llm_rationale`` (empty string
            when ``None``) and shown in the ``[START]`` line.

    Returns:
        A dict with ``success``, ``score`` (clamped by ``clamp_strict_score``),
        ``steps``, ``seed``, ``task_id``, scenario metadata read from the last
        observation, the list of chosen ``actions``, the per-step
        ``step_records`` and a UTC ISO-8601 ``timestamp``.
    """
    observation_model = env.reset(seed=seed, task_id=task_id)
    observation = observation_model.model_dump()
    learning_profile = None
    if archive is not None:
        learning_profile = build_learning_profile(
            archive,
            seed,
            task_id,
            # An empty specialization string is passed on as None.
            required_specialization=str(observation.get("required_specialization", "")) or None,
        )

    print("\n" + "=" * 72)
    print(f"Scenario: {observation['scenario_name']}")
    print(f"Task: {task_id} | Difficulty: {observation['scenario_difficulty']} | Seed: {seed}")
    print(f"Patient condition: {observation['patient_condition']}")
    print(f"Required specialization: {observation['required_specialization']}")
    print("Objective: admit patient successfully (no fixed deadline window)")
    print("=" * 72)
    # Start marker line, flushed immediately.
    print(f"[START] task={task_id} env=acde-openenv model={model_name or 'none'}", flush=True)

    if learning_profile:
        print(
            f"Learning memory: best historical score {float(learning_profile.get('best_score', 0.0)):.3f} "
            f"across {int(learning_profile.get('attempts', 0))} attempts"
        )
        if learning_profile.get("best_actions"):
            print(f"Best known route: {' -> '.join(learning_profile['best_actions'])}")

    total_reward = 0.0
    all_rewards = []
    steps = 0
    done = False
    # State for the immediate-retry guard inside the loop.
    previous_policy_hospital_id: str | None = None
    previous_policy_outcome: str | None = None
    attempt_index = int(learning_profile.get("attempts", 0)) if learning_profile else 0
    # Keep scenario deterministic by seed, but vary policy exploration across retries.
    rng = random.Random(seed + (attempt_index * 7919))
    step_records: list[dict] = []

    while not done:
        steps += 1
        print(f"\nStep {observation['step']} | phase={observation['ambulance_status']}")

        scored = score_hospitals(observation, learning_profile=learning_profile)
        chosen, strategy = choose_hospital(scored, observation, rng, learning_profile=learning_profile)

        # Final policy-level guard: no immediate retry of the same hospital after rejection.
        if previous_policy_outcome == "REJECTED" and previous_policy_hospital_id and chosen["hospital_id"] == previous_policy_hospital_id:
            alternatives = [item for item in scored if item["hospital_id"] != previous_policy_hospital_id]
            if alternatives:
                # Fall back to the best-scoring other hospital.
                chosen = max(alternatives, key=lambda item: float(item.get("policy_score", 0.0)))
                strategy = strategy + " + immediate-retry override"

        print_options(scored)
        # The rationale is requested for the final pick, after any override above.
        rationale = llm_rationale(cast(Optional[OpenAIClient], llm_client), model_name or "", observation, chosen, strategy)
        print(f"Decision: {chosen['hospital_id']} ({strategy})")

        step_result = env.step(
            Action(
                step=observation["step"],
                hospital_id=chosen["hospital_id"],
                rationale=rationale,
            )
        )
        next_obs_model = step_result["observation"]
        reward = float(step_result["reward"])
        all_rewards.append(reward)
        done = bool(step_result["done"])
        # A missing or None "info" entry is treated as an empty dict.
        info = step_result.get("info", {}) or {}
        next_observation = next_obs_model.model_dump()
        total_reward += reward

        outcome = info.get("outcome", {})
        # Status is upper-cased here; a missing status is recorded as "PARTIAL".
        status = str(outcome.get("status", "partial")).upper()
        reason = str(outcome.get("reason", "No reason provided"))
        previous_policy_hospital_id = chosen["hospital_id"]
        previous_policy_outcome = status

        print(f"Outcome: {status}")
        print(f"Reason: {reason}")
        print(f"Reward: {reward:.3f}")
        # A falsy or absent action error is printed as the literal "null".
        error_val = str(info.get("last_action_error")) if info.get("last_action_error") else "null"
        print(f"[STEP] step={observation.get('step')} action={chosen['hospital_id']} reward={reward:.2f} done={str(done).lower()} error={error_val}", flush=True)

        # The log entry describes the observation the decision was made on,
        # not the observation returned by this step.
        append_trajectory_log(
            {
                "seed": seed,
                "task": task_id,
                "difficulty": observation.get("scenario_difficulty"),
                "step": observation.get("step"),
                "state": {
                    "patient_condition": observation.get("patient_condition"),
                    "remaining_time_minutes": observation.get("remaining_time_minutes"),
                    "failed_hospitals": observation.get("failed_hospitals", []),
                    "visited_hospitals": observation.get("visited_hospitals", []),
                    "ambulance_status": observation.get("ambulance_status"),
                },
                "action": {
                    "hospital_id": chosen["hospital_id"],
                    "policy_score": chosen["policy_score"],
                    "strategy": strategy,
                },
                "outcome": {
                    "status": status,
                    "reason": reason,
                },
                "reward": reward,
            }
        )

        # Compact per-step record returned to the caller in "step_records".
        step_records.append(
            {
                "step": observation.get("step"),
                "hospital_id": chosen["hospital_id"],
                "status": status,
                "reason": reason,
                "reward": reward,
                "policy_score": chosen["policy_score"],
            }
        )

        observation = next_observation

    final_state = env.state()
    # A falsy final outcome is reported as "FAILURE".
    final_result = final_state.final_outcome or "FAILURE"
    final_score = clamp_strict_score(final_state.final_score)

    print("\nFinal result:")
    print(f"  Result: {final_result}")
    print(f"  Total steps: {steps}")
    print(f"  Final score: {final_score:.3f}")
    print(f"  Average reward: {total_reward / max(1, steps):.3f}")
    rewards_str = ",".join(f"{r:.2f}" for r in all_rewards)
    # End marker line, flushed immediately.
    print(f"[END] success={str(final_result == 'SUCCESS').lower()} steps={steps} score={final_score:.2f} rewards={rewards_str}", flush=True)

    # Scenario metadata below is read from the last observation of the episode.
    return {
        "success": final_result == "SUCCESS",
        "score": final_score,
        "steps": steps,
        "seed": seed,
        "task_id": task_id,
        "scenario_name": observation.get("scenario_name"),
        "scenario_type": observation.get("scenario_type"),
        "difficulty": observation.get("scenario_difficulty"),
        "required_specialization": observation.get("required_specialization"),
        "actions": [record["hospital_id"] for record in step_records],
        "step_records": step_records,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }


def update_learning_archive(archive: dict, episode_result: dict) -> None:
    """Merge one finished episode into ``archive`` in place.

    Three parts of the archive are updated:

    * the profile for this (seed, task) pair under ``archive["profiles"]``:
      the attempt counter, the ``last_*`` fields, and the ``best_*`` fields
      whenever the new score is greater than or equal to the stored best
      (so a tie is won by the newer run);
    * that profile's ``step_stats[step][hospital_id]`` counters, one update
      per step record;
    * the ``archive["episodes"]`` history, trimmed to the newest 500 entries.

    Args:
        archive: Mutable archive dict; missing ``profiles`` / ``episodes``
            containers are created.
        episode_result: Episode summary with the keys read below (``seed``,
            ``task_id``, ``score``, ``success``, ``timestamp``, ...).
    """
    profile_id = profile_key(int(episode_result["seed"]), str(episode_result["task_id"]))
    known_profiles = archive.setdefault("profiles", {})
    if profile_id in known_profiles:
        profile = known_profiles[profile_id]
    else:
        profile = {
            "attempts": 0,
            "best_score": 0.0,
            "best_actions": [],
            "best_steps": 0,
            "step_stats": {},
        }

    # Bookkeeping for the most recent run.
    profile["attempts"] = int(profile.get("attempts", 0)) + 1
    new_score = float(episode_result["score"])
    profile["last_score"] = new_score
    new_success = bool(episode_result["success"])
    profile["last_success"] = new_success
    profile["last_run_at"] = episode_result["timestamp"]
    profile["last_actions"] = list(episode_result.get("actions", []))
    profile["last_required_specialization"] = episode_result.get("required_specialization")
    profile["last_scenario_type"] = episode_result.get("scenario_type")
    profile["last_scenario_name"] = episode_result.get("scenario_name")

    # Promote this run to "best" on a strictly better score or a tie.
    if new_score >= float(profile.get("best_score", 0.0)):
        profile["best_score"] = new_score
        profile["best_actions"] = list(episode_result.get("actions", []))
        profile["best_steps"] = int(episode_result.get("steps", 0))
        profile["best_success"] = new_success
        profile["best_scenario_name"] = episode_result.get("scenario_name")
        profile["best_difficulty"] = episode_result.get("difficulty")
        profile["best_required_specialization"] = episode_result.get("required_specialization")

    # Counters to bump per status; any other status counts as a rejection.
    counters_by_status = {
        "ACCEPTED": ("success", "accepted"),
        "PARTIAL": ("partial",),
    }
    per_step = profile.setdefault("step_stats", {})
    for entry in episode_result.get("step_records", []):
        step_label = str(entry.get("step"))
        hospital_label = str(entry.get("hospital_id"))
        tally = per_step.setdefault(step_label, {}).setdefault(
            hospital_label,
            {
                "count": 0,
                "success": 0,
                "accepted": 0,
                "partial": 0,
                "rejected": 0,
                "total_reward": 0.0,
                "avg_reward": 0.0,
                "last_status": None,
                "last_reason": None,
            },
        )
        tally["count"] += 1
        step_status = entry["status"]
        for counter_name in counters_by_status.get(step_status, ("rejected",)):
            tally[counter_name] += 1
        tally["total_reward"] = float(tally["total_reward"]) + float(entry["reward"])
        visits = max(1, tally["count"])
        tally["avg_reward"] = tally["total_reward"] / visits
        tally["last_status"] = step_status
        tally["last_reason"] = entry["reason"]
        tally["success_rate"] = tally["accepted"] / visits

    known_profiles[profile_id] = profile

    # Rolling episode history, capped at the newest 500 entries.
    history = archive.setdefault("episodes", [])
    history.append(
        {
            "seed": episode_result["seed"],
            "task_id": episode_result["task_id"],
            "difficulty": episode_result["difficulty"],
            "required_specialization": episode_result.get("required_specialization"),
            "scenario_name": episode_result["scenario_name"],
            "score": episode_result["score"],
            "success": episode_result["success"],
            "actions": episode_result.get("actions", []),
            "timestamp": episode_result["timestamp"],
        }
    )
    archive["episodes"] = history[-500:]


def print_training_summary(results: list[dict]) -> None:
    """Print aggregate statistics and an early-vs-late score trend.

    Scores are split into an early part (the first ``len // 2`` episodes, at
    least one) and a late part (the rest). With a single episode both parts
    are that episode. Nothing is printed when ``results`` is empty.

    Args:
        results: Episode summaries; each needs ``score`` and ``success``.
    """
    if not results:
        return

    episode_scores = [float(entry["score"]) for entry in results]
    success_count = sum(1 for entry in results if entry["success"])

    cut = max(1, len(episode_scores) // 2)
    first_part = episode_scores[:cut]
    # The second part is empty only for a single episode; reuse the tail then.
    second_part = episode_scores[cut:] or episode_scores[-cut:]

    first_mean = sum(first_part) / len(first_part)
    second_mean = sum(second_part) / len(second_part)
    trend = second_mean - first_mean

    report_lines = (
        "\nTraining summary:",
        f"  Episodes: {len(results)}",
        f"  Success rate: {success_count / len(results):.1%}",
        f"  Average score: {sum(episode_scores) / len(episode_scores):.3f}",
        f"  Early avg score ({len(first_part)} eps): {first_mean:.3f}",
        f"  Late avg score ({len(second_part)} eps): {second_mean:.3f}",
        f"  Trend delta (late-early): {trend:+.3f}",
    )
    for line in report_lines:
        print(line)


def main() -> None:
    """Command-line entry point: run the requested episodes and print a summary.

    In ``full`` mode every task in ``TASK_ORDER`` is run per episode; otherwise
    a single task is used, taken from ``--task`` or derived from the level.
    A positive ``args.train_episodes`` switches to training mode, which sets
    the episode count, optionally pins the seed (``args.train_same_seed``) and
    prints the training summary instead of the batch summary. The learning
    archive is updated after every episode and saved once after all runs.
    """
    args = parse_args()
    llm_client, model_name = require_llm_config()
    seed = ask_seed_if_missing(args.seed)
    print(f"Using seed: {seed}")

    if args.mode == "full":
        tasks = TASK_ORDER
    elif args.task is not None:
        tasks = [args.task]
    else:
        tasks = [LEVEL_TO_TASK[ask_level_if_missing(args.level)]]

    env = EmergencyEnv(memory_file=args.memory_file)
    archive = load_learning_archive()

    training_mode = args.train_episodes > 0
    run_count = args.train_episodes if training_mode else args.episodes
    results = []

    for episode in range(run_count):
        episode_number = episode + 1
        for idx, task_id in enumerate(tasks):
            # Only same-seed training reuses the base seed; every other run
            # derives a distinct seed per (episode, task) pair.
            if training_mode and args.train_same_seed:
                task_seed = seed
            else:
                task_seed = seed + (episode * 100) + idx

            label = f"Training Episode {episode_number}" if training_mode else f"Episode {episode_number}"
            print(f"\n=== {label} | {task_id} | seed={task_seed} ===")
            finished = run_episode(
                env,
                task_id,
                task_seed,
                archive=archive,
                llm_client=llm_client,
                model_name=model_name,
            )
            results.append(finished)
            update_learning_archive(archive, finished)

    save_learning_archive(archive)

    if training_mode:
        print_training_summary(results)
        return
    if not results:
        return

    print("\nBatch summary:")
    total = len(results)
    if total == 1:
        only = results[0]
        verdict = "SUCCESS" if only["success"] else "FAILURE"
        print(f"  Episode outcome: {verdict}")
        print(f"  Episode score: {only['score']:.3f}")
        print(f"  Episode steps: {only['steps']}")
        print("  Note: run 30-50 episodes to estimate difficulty success rate.")
        return

    print(f"  Success rate: {sum(1 for item in results if item['success']) / total:.1%}")
    print(f"  Average score: {sum(item['score'] for item in results) / total:.3f}")
    print(f"  Average steps: {sum(item['steps'] for item in results) / total:.1f}")

# Run the agent only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()