"""Run an LLM baseline against the ESC environment and write Markdown results.

This script is intentionally separate from `inference.py`:

- `inference.py` keeps the hackathon-required stdout contract.
- `benchmark_llm.py` is for your own benchmarking workflow and writes
  reusable Markdown + JSON artifacts.

Required environment variables:
    API_BASE_URL
    MODEL_NAME
    ESC_ENV_URL

Authentication variables (checked in this order; the first non-empty one is used):
    HF_TOKEN or OPENAI_API_KEY or API_KEY

Example:
    export API_BASE_URL=https://router.huggingface.co/v1
    export MODEL_NAME=gpt-4.1-mini
    export HF_TOKEN=<your-token>
    export ESC_ENV_URL=http://127.0.0.1:7860
    python3 benchmark_llm.py
"""
from __future__ import annotations

import argparse
import asyncio
import json
import os
import textwrap
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from statistics import mean
from typing import Any, Dict, List

from openai import OpenAI

from src.client import ESCHttpClient
from src.models import Action

# Task identifiers that `async_main` runs, one episode each, in this order.
TASK_IDS = ["work_stress_venting", "guarded_relationship", "crisis_fragile_trust"]
# Sampling temperature passed to every chat-completion request in `call_llm`.
TEMPERATURE = 0.6
# `max_tokens` cap passed to every chat-completion request in `call_llm`.
MAX_TOKENS = 220

# System message sent with every request. It is dedented and stripped once at
# import time so the model never sees the source-code indentation.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an emotionally attuned peer supporter chatting with someone who is
    going through a hard time. Your job is NOT to fix their problem. Your job
    is to make them feel heard, safe, and understood first — and only move
    toward gentle exploration or light action once trust is established.

    Principles:
    - Lead with empathy and validation. Reflect what you hear.
    - Do NOT give advice until the person has clearly shared what's really
      going on and feels heard.
    - Ask at most one open-ended question per reply. Never interrogate.
    - Never be dismissive, minimising, or instructive in a judgmental tone.
    - Keep replies warm, brief (1-3 sentences), and human.
    - In high-distress / crisis scenarios, gently reference professional
      support (a therapist, crisis line) only after rapport is built.

    Reply with ONLY your next message to the seeker.
    """
).strip()


@dataclass
class LLMEpisodeSummary:
    """Result record for one benchmark episode.

    Built by `run_task`, rendered by `render_markdown`, and dumped to JSON via
    `dataclasses.asdict` in `async_main`.
    """

    task_id: str  # task identifier the episode was reset with
    model: str  # model name used for every LLM call in the episode
    steps: int  # `turn` value of the last observation returned by the environment
    score: float  # "score" from the environment's final info (0.0 if absent)
    success: bool  # True when the final "success" value is >= 1.0
    completion: float  # "completion" from the final info (0.0 if absent)
    avg_step_reward: float  # mean of the per-step rewards
    avg_immediate: float  # mean of the per-step "immediate" reward component
    avg_future_oriented: float  # mean of the per-step "future_oriented" reward component
    avg_penalties: float  # mean of the per-step "penalties" reward component
    final_resolution: float  # "final_resolution" from the final info (0.0 if absent)
    transcript: List[str]  # "Seeker: ..." / "Agent: ..." lines in conversation order


def build_user_prompt(
    scenario_brief: str,
    stage_hint: str,
    turn: int,
    remaining: int,
    seeker_utterance: str,
    history: List[str],
) -> str:
    """Assemble the per-turn user message sent to the model.

    Args:
        scenario_brief: Scenario description shown on the first prompt line.
        stage_hint: Public conversation-stage hint from the environment.
        turn: Current turn number.
        remaining: Number of turns left in the episode.
        seeker_utterance: The seeker message the model must answer.
        history: Earlier conversation lines; only the last eight are included.
            An empty list is rendered as the placeholder "(first turn)".

    Returns:
        The prompt text with no leading indentation on any template line.
    """
    history_block = "\n".join(history[-8:]) if history else "(first turn)"
    # Build the prompt from explicit lines instead of dedenting an f-string.
    # `textwrap.dedent` runs after interpolation, so a multi-line history block
    # (whose continuation lines start at column 0) makes the common indent
    # empty and leaves every template line indented.
    prompt_lines = [
        f"Scenario: {scenario_brief}",
        f"Conversation stage (public hint): {stage_hint}",
        f"Turn: {turn}",
        f"Remaining turns: {remaining}",
        "",
        "Recent exchange:",
        history_block,
        "",
        "Seeker just said:",
        f'"{seeker_utterance}"',
        "",
        "Write your next reply (1-3 sentences, warm, no advice unless rapport is clearly established):",
    ]
    return "\n".join(prompt_lines)


def call_llm(client: OpenAI, model_name: str, user_prompt: str) -> str:
    """Request one supporter reply from the chat-completions endpoint.

    Sends `SYSTEM_PROMPT` plus `user_prompt` as a non-streaming request using
    the module-level `TEMPERATURE` and `MAX_TOKENS` settings.

    Args:
        client: Synchronous OpenAI client to send the request with.
        model_name: Model identifier forwarded to the API.
        user_prompt: The per-turn user message.

    Returns:
        The stripped reply text, or a fixed empathetic fallback message when
        the response carries no choices or only empty content.
    """
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        stream=False,
    )
    choices = completion.choices
    # A response without any choice is treated like an empty message so it
    # takes the fallback path below instead of raising IndexError.
    text = (choices[0].message.content or "").strip() if choices else ""
    if not text:
        return "That sounds really hard. I'm here with you. Can you tell me more about what's feeling heaviest right now?"
    return text


async def run_task(
    openai_client: OpenAI,
    env_client: ESCHttpClient,
    model_name: str,
    task_id: str,
) -> LLMEpisodeSummary:
    """Play one full episode of `task_id` and summarise the outcome.

    Resets the environment, then alternates between asking the model for a
    reply and stepping the environment with it until the environment reports
    `done`.

    Args:
        openai_client: Client used for the LLM calls.
        env_client: ESC environment client used for `reset` / `step`.
        model_name: Model identifier recorded in the summary and sent to the API.
        task_id: Task to reset the environment with.

    Returns:
        An `LLMEpisodeSummary` holding the final metrics reported by the
        environment, per-step reward averages, and the full transcript.
    """
    reset = await env_client.reset(task_id=task_id)
    obs = reset.observation
    # Single conversation log; it doubles as the prompt history.
    transcript: List[str] = [f"Seeker: {obs.seeker_utterance}"]
    rewards: List[float] = []
    immediate_scores: List[float] = []
    future_scores: List[float] = []
    penalties: List[float] = []
    final: Dict[str, Any] = {}

    while True:
        prompt = build_user_prompt(
            scenario_brief=obs.scenario_brief,
            stage_hint=obs.stage_hint,
            turn=obs.turn,
            remaining=obs.remaining_turns,
            seeker_utterance=obs.seeker_utterance,
            # The last transcript line is the utterance passed just above.
            # Excluding it avoids repeating that message in the prompt and
            # lets the first turn show the "(first turn)" placeholder.
            history=transcript[:-1],
        )
        message = call_llm(openai_client, model_name, prompt)
        result = await env_client.step(Action(message=message))

        rewards.append(float(result.reward))
        reward_detail = result.reward_detail or {}
        immediate_scores.append(float(reward_detail.get("immediate", 0.0)))
        future_scores.append(float(reward_detail.get("future_oriented", 0.0)))
        penalties.append(float(reward_detail.get("penalties", 0.0)))

        obs = result.observation
        transcript.append(f"Agent: {message}")
        transcript.append(f"Seeker: {obs.seeker_utterance}")

        if result.done:
            # Tolerate a missing info payload the same way reward_detail is
            # tolerated above; the summary then falls back to its defaults.
            final = (result.info or {}).get("final") or {}
            break

    return LLMEpisodeSummary(
        task_id=task_id,
        model=model_name,
        steps=obs.turn,
        score=float(final.get("score", 0.0)),
        success=bool(final.get("success", 0.0) >= 1.0),
        completion=float(final.get("completion", 0.0)),
        avg_step_reward=mean(rewards) if rewards else 0.0,
        avg_immediate=mean(immediate_scores) if immediate_scores else 0.0,
        avg_future_oriented=mean(future_scores) if future_scores else 0.0,
        avg_penalties=mean(penalties) if penalties else 0.0,
        final_resolution=float(final.get("final_resolution", 0.0)),
        transcript=transcript,
    )


def render_markdown(episodes: List[LLMEpisodeSummary], generated_at: str, env_url: str) -> str:
    """Render the benchmark report as a Markdown document.

    The report has a header with aggregate figures, one table row per episode,
    and the first ten transcript lines of each episode as a bullet list.

    Args:
        episodes: Episode summaries in the order they should be listed.
        generated_at: Timestamp text shown in the header.
        env_url: Environment URL shown in the header.

    Returns:
        The Markdown text, ending with exactly one trailing newline.
    """
    if episodes:
        overall_score = mean(item.score for item in episodes)
        success_rate = mean(1.0 if item.success else 0.0 for item in episodes)
        reported_model = episodes[0].model
    else:
        # No episodes: report zeros and a placeholder model name.
        overall_score = 0.0
        success_rate = 0.0
        reported_model = "unknown"

    header: List[str] = [
        "# LLM Benchmark Results",
        "",
        f"_Generated: {generated_at}_",
        "",
        f"- Model: `{reported_model}`",
        f"- Environment URL: `{env_url}`",
        f"- Average score: `{overall_score:.3f}`",
        f"- Success rate: `{success_rate:.2f}`",
        "",
        "| Task | Score | Success | Completion | Steps | Avg step reward | Final resolution |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]

    table_rows: List[str] = [
        f"| {item.task_id} | {item.score:.3f} | {int(item.success)} | "
        f"{item.completion:.1f} | {item.steps} | {item.avg_step_reward:.3f} | "
        f"{item.final_resolution:.3f} |"
        for item in episodes
    ]

    excerpts: List[str] = ["", "## Transcript Excerpts", ""]
    for item in episodes:
        excerpts.append(f"### {item.task_id}")
        excerpts.append("")
        # Only the opening of each conversation is shown in the report.
        excerpts.extend(f"- {entry}" for entry in item.transcript[:10])
        excerpts.append("")

    document = "\n".join(header + table_rows + excerpts)
    return document.strip() + "\n"


def require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        SystemExit: If the variable is unset or empty; the message names the
            missing variable.
    """
    found = os.environ.get(name)
    if found:
        return found
    message = f"Missing required environment variable: {name}\nSet it, then rerun `py -3 benchmark_llm.py`."
    raise SystemExit(message)


def resolve_api_key() -> str:
    """Return the API key from the first non-empty supported variable.

    Variables are checked in the order HF_TOKEN, OPENAI_API_KEY, API_KEY.

    Raises:
        SystemExit: If none of the variables holds a non-empty value.
    """
    for variable in ("HF_TOKEN", "OPENAI_API_KEY", "API_KEY"):
        candidate = os.environ.get(variable)
        if candidate:
            return candidate
    raise SystemExit("Missing HF_TOKEN, OPENAI_API_KEY, or API_KEY.")


async def async_main(output: str, json_output: str) -> None:
    """Run every task in `TASK_IDS` and write the Markdown and JSON reports.

    Configuration is read from the environment (API_BASE_URL, MODEL_NAME, an
    API key variable, ESC_ENV_URL). The environment client is closed even when
    an episode raises.

    Args:
        output: Path of the Markdown report to write.
        json_output: Path of the JSON report to write.
    """
    base_url = require_env("API_BASE_URL")
    model = require_env("MODEL_NAME")
    token = resolve_api_key()
    environment_url = require_env("ESC_ENV_URL")

    llm = OpenAI(base_url=base_url, api_key=token)
    environment = ESCHttpClient.from_url(environment_url)

    results = []
    try:
        # Episodes run one after another on the shared environment client.
        for task in TASK_IDS:
            summary = await run_task(llm, environment, model_name=model, task_id=task)
            results.append(summary)
    finally:
        await environment.close()

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ")
    report = render_markdown(results, generated_at=timestamp, env_url=environment_url)

    markdown_file = Path(output)
    json_file = Path(json_output)
    # Create both target directories before writing anything.
    for target in (markdown_file, json_file):
        target.parent.mkdir(parents=True, exist_ok=True)
    markdown_file.write_text(report, encoding="utf-8")
    serialised = json.dumps([asdict(summary) for summary in results], indent=2)
    json_file.write_text(serialised, encoding="utf-8")

    print(f"Wrote Markdown report to {markdown_file}")
    print(f"Wrote JSON report to {json_file}")


def main() -> None:
    """Parse the command-line options and run the benchmark to completion."""
    cli = argparse.ArgumentParser(description="Run the LLM baseline and write results.")
    output_options = (
        ("--output", "results/llm_benchmark.md", "Markdown output path."),
        ("--json-output", "results/llm_benchmark.json", "JSON output path."),
    )
    for flag, default_path, help_text in output_options:
        cli.add_argument(flag, default=default_path, help=help_text)
    options = cli.parse_args()
    asyncio.run(async_main(options.output, options.json_output))


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()