"""
Baseline Inference Script
=========================
Runs a real LLM agent against all 3 tasks (easy, medium, hard) and reports
average scores across NUM_RUNS episodes per task.

Reproducibility design:
  - TASK_SEEDS pins the starting circuit per run, so the environment
    presents the same problem difficulty across different model comparisons.
  - TEMPERATURE = 0.0 (greedy decoding) makes the LLM deterministic in
    principle: same model + same prompt = same action, so the model's route
    through the circuit should be identical on every run (subject to the
    provider caveat below).
  - NUM_RUNS = 3 averages over multiple episodes to give stable scores
    for the README baseline table.

Why scores still vary slightly across runs even at temperature=0:
  Some APIs (Groq, HF router) do not guarantee bit-exact reproducibility
  at temperature=0 due to batching and hardware differences. The variance
  should be small (<5%). For reporting, use the average.

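Environment variables (an API key is needed; the others have defaults):
    API_BASE_URL       The API endpoint for the LLM
                       (default: https://router.huggingface.co/v1).
    MODEL_NAME         The model identifier
                       (default: Qwen/Qwen2.5-72B-Instruct).
    HF_TOKEN           Your Hugging Face / API key. GROQ_API_KEY and API_KEY
                       are checked next, in that order.
    IMAGE_NAME         Docker image name (default: quantum_env).
    QUANTUM_BENCHMARK  Benchmark label printed on [START] lines
                       (default: quantum_optimization).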
"""

import asyncio
import json
import os
import textwrap
from typing import List, Optional, Tuple

from dotenv import load_dotenv

load_dotenv()

from openai import OpenAI

from quantum_openenv_env.server.quantum_openenv_env_environment import GRADERS
from quantum_openenv_env.client import QuantumOpenenvEnv
from quantum_openenv_env.models import QuantumAction

# The first non-empty value among HF_TOKEN, GROQ_API_KEY and API_KEY is used.
API_KEY      = os.getenv("HF_TOKEN") or os.getenv("GROQ_API_KEY") or os.getenv("API_KEY")
IMAGE_NAME   = os.getenv("IMAGE_NAME", "quantum_env")
# `or` (instead of a getenv default) also falls back when the variable is set
# but empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME   = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
BENCHMARK    = os.getenv("QUANTUM_BENCHMARK", "quantum_optimization")

MAX_STEPS  = 15    # upper bound on agent actions per episode
MAX_TOKENS = 150   # completion budget for each LLM call
SUCCESS_SCORE_THRESHOLD = 0.10   # minimum score reported as success

# ── Reproducibility ────────────────────────────────────────────────────────────
# TEMPERATURE = 0.0: greedy decoding makes the LLM deterministic.
# For a truly non-deterministic model (temperature > 0), increase NUM_RUNS
# and report the average — that is statistically stable even if single runs vary.
TEMPERATURE = 0.0   # was 0.7, which contradicted the documented greedy baseline
NUM_RUNS    = 3   # episodes per task; average is reported in summary

ALL_TASKS  = ["easy", "medium", "hard"]
# One fixed seed per task, passed to env.reset() for every run of that task.
TASK_SEEDS = {
    "easy":   42,
    "medium":  7,
    "hard":   13,
}
# ──────────────────────────────────────────────────────────────────────────────


# System message sent with every chat-completion request (see get_model_action).
# It lists the four action types and instructs the model to reply with a bare
# JSON object holding "target_index" and "action_type" — the two keys that
# run_episode reads from the parsed reply. The literal is dedented and stripped
# so the indentation used here does not reach the model.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an AI agent tasked with optimizing a multi-qubit quantum circuit.
    You will be given the current circuit as a list of gates with their index,
    name, and target_qubits.

    You have 4 possible actions:
    Action 1: Cancel identical self-inverse gates (H, X, Y, Z, CNOT, SWAP) on
              the same qubits, not blocked by intermediate gates sharing those qubits.
    Action 2: Swap adjacent commuting gates (gates on entirely non-overlapping qubits).
    Action 3: Replace an H-X-H sequence on the same qubit with a Z gate.
    Action 4: Replace a CNOT(a,b)→CNOT(b,a)→CNOT(a,b) sequence with a single SWAP gate (3 alternating CNOTs collapse to 1 SWAP).

    You MUST output ONLY a valid JSON object with exactly two keys:
      "target_index" (integer) and "action_type" (integer 1-4).
    Example: {"target_index": 2, "action_type": 1}
    Do not output markdown, backticks, or any other text.
    """
).strip()


# ============================================================================
# Logging  (format required by hackathon platform output parser)
# ============================================================================

def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens an episode in the platform log.

    Args:
        task: Task label written as ``task=...``.
        env: Benchmark/environment label written as ``env=...``.
        model: Model identifier written as ``model=...``.
    """
    fields = ("[START]", f"task={task}", f"env={env}", f"model={model}")
    print(" ".join(fields), flush=True)


def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Print one ``[STEP]`` record on a single line.

    The raw model reply is logged as ``action`` and may span several lines
    (e.g. pretty-printed JSON). Line breaks in ``action`` and ``error`` are
    replaced by single spaces so each step stays one record for the
    line-oriented platform parser. Single-line values are printed unchanged.

    Args:
        step: 1-based step number within the episode.
        action: Text describing the action taken (the raw model reply).
        reward: Reward for this step, printed with two decimals.
        done: Whether the episode finished on this step; printed lowercase.
        error: Parse/validation error text, or ``None``/empty for ``null``.
    """
    action_val = " ".join(str(action).splitlines())
    error_val = " ".join(str(error).splitlines()) if error else ""
    if not error_val:
        error_val = "null"
    print(
        f"[STEP] step={step} action={action_val} reward={reward:.2f} "
        f"done={str(done).lower()} error={error_val}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes an episode in the platform log.

    Args:
        success: Episode outcome; printed as its lowercase string form.
        steps: Number of steps taken.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals.
    """
    formatted_rewards = [format(value, ".2f") for value in rewards]
    outcome = str(success).lower()
    line = (
        f"[END] success={outcome} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted_rewards)}"
    )
    print(line, flush=True)


# ============================================================================
# Prompt helpers
# ============================================================================

def build_user_prompt(
    step: int, circuit: list, last_reward: float, history: List[str]
) -> str:
    """Build the per-step user message describing the current state.

    The prompt is assembled line by line rather than by dedenting an indented
    f-string: interpolated multi-line blocks (several gates or several history
    entries) carry no indentation of their own, which left the dedent with an
    empty common margin and the template lines with stray leading spaces.

    Args:
        step: 1-based step number.
        circuit: Gates of the current circuit; each item must expose ``name``
            and ``target_qubits``. An empty circuit is rendered as
            ``Empty circuit``.
        last_reward: Reward from the previous step, shown with two decimals.
        history: Earlier step summaries; only the last four are included, or
            ``None`` when there are none.

    Returns:
        The prompt text with no leading indentation on any template line.
    """
    if circuit:
        circuit_lines = [
            f"Index {i}: {gate.name} on qubits {gate.target_qubits}"
            for i, gate in enumerate(circuit)
        ]
    else:
        circuit_lines = ["Empty circuit"]

    # Only the four most recent steps are shown.
    history_lines = list(history[-4:]) if history else ["None"]

    lines = [
        f"Step: {step}",
        "Current circuit:",
        *circuit_lines,
        f"Last reward: {last_reward:.2f}",
        "Previous steps:",
        *history_lines,
        'Send your next action as a JSON object with "target_index" and "action_type".',
    ]
    return "\n".join(lines)


def get_model_action(
    client: OpenAI,
    step: int,
    circuit: list,
    last_reward: float,
    history: List[str],
) -> str:
    """Ask the LLM for its next action and return the raw reply text.

    Sends SYSTEM_PROMPT plus the per-step user prompt as a non-streaming chat
    completion using MODEL_NAME, TEMPERATURE and MAX_TOKENS.

    Returns:
        The stripped reply, or ``"{}"`` when the reply is empty or the request
        fails for any reason (the failure is printed as a ``[DEBUG]`` line).
    """
    empty_action = "{}"
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": build_user_prompt(step, circuit, last_reward, history),
        },
    ]
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=conversation,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        reply = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        # Best-effort: any API/transport/shape error degrades to an empty action.
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return empty_action
    return reply or empty_action


# ============================================================================
# Single episode
# ============================================================================

async def run_episode(
    task_name: str,
    run_number: int,
    seed: int,
    env: QuantumOpenenvEnv,
    client: OpenAI,
) -> float:
    """
    Run one episode and return the score.

    Resets ``env`` with ``seed``, then for up to MAX_STEPS steps asks the LLM
    for an action, applies it, and logs a [STEP] line. The final observation
    is scored with the grader registered for the task in GRADERS (falling
    back to the "hard" grader for an unknown task label).

    Exactly one [START] and one [END] line are emitted per call: if the
    episode fails before [START] was logged (e.g. ``env.reset`` raises), a
    [START] line for ``task_name`` is still printed ahead of [END] so the
    records stay paired.

    Args:
        task_name: Requested task label; used when the observation metadata
            carries no recognised ``task`` value.
        run_number: Run index, used only in the [DEBUG] failure message.
        seed: Seed passed to ``env.reset``.
        env: Environment client to drive.
        client: OpenAI-compatible client used for the model calls.

    Returns:
        The grader's score, or the initial 0.01 if the episode raised before
        grading (the exception is printed as a [DEBUG] line, not re-raised).
    """
    history: List[str] = []
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01  # reported as-is when the episode aborts before grading
    success = False
    started = False  # becomes True once the [START] line has been printed

    try:
        result = await env.reset(seed=seed)
        circuit = result.observation.circuit
        last_reward = 0.0
        initial_gate_count = len(circuit)

        # Prefer the task label reported by the environment when it is valid.
        actual_task = (result.observation.metadata or {}).get("task", task_name)
        if actual_task not in ALL_TASKS:
            actual_task = task_name

        log_start(task=actual_task, env=BENCHMARK, model=MODEL_NAME)
        started = True

        loop = asyncio.get_running_loop()
        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # get_model_action performs a blocking HTTP request; run it in the
            # default executor so the event loop is not stalled meanwhile.
            message = await loop.run_in_executor(
                None, get_model_action, client, step, circuit, last_reward, history
            )

            try:
                # Tolerate replies wrapped in a markdown code fence.
                clean = message.replace("```json", "").replace("```", "").strip()
                parsed = json.loads(clean)
                target_index = int(parsed["target_index"])
                action_type  = int(parsed.get("action_type", 1))
                error = None
            except Exception as exc:
                # Unparseable reply: record the error and send a default action
                # so the episode still advances.
                error = str(exc)
                target_index = 0
                action_type  = 1

            result = await env.step(
                QuantumAction(target_index=target_index, action_type=action_type)
            )
            reward = result.reward or 0.0
            done   = result.done

            rewards.append(reward)
            steps_taken = step
            circuit     = result.observation.circuit
            last_reward = reward

            log_step(step=step, action=message, reward=reward, done=done, error=error)
            history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")

            if done:
                break

        # Store the starting gate count on the observation before grading.
        if not result.observation.metadata:
            result.observation.metadata = {}
        result.observation.metadata["initial_count"] = initial_gate_count

        grader  = GRADERS.get(actual_task, GRADERS["hard"])
        score   = grader(result.observation)
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as exc:
        print(f"[DEBUG] Task {task_name} run {run_number} error: {exc}", flush=True)

    finally:
        if not started:
            # Failure happened before [START]; keep START/END records paired.
            log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return score


# ============================================================================
# Main: all 3 tasks × NUM_RUNS episodes each
# ============================================================================

async def main() -> None:
    """
    Run every task in ALL_TASKS for NUM_RUNS episodes and print a summary.

    Each run starts a fresh environment container from IMAGE_NAME with
    QUANTUM_TASK set to the task, runs one episode with the task's fixed seed,
    and always attempts to close the environment afterwards (close errors are
    printed as [DEBUG] lines and ignored).

    The summary table reports the average, minimum and maximum score per task
    and marks a task PASS when its average reaches SUCCESS_SCORE_THRESHOLD.
    """
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    rule = "=" * 60

    # task → list of scores across runs
    scores_by_task: dict[str, List[float]] = {name: [] for name in ALL_TASKS}

    for task_name in ALL_TASKS:
        print(f"\n{rule}", flush=True)
        task_line = (
            f"  Task : {task_name.upper()}  (seed={TASK_SEEDS[task_name]}, "
            f"runs={NUM_RUNS}, temp={TEMPERATURE})"
        )
        print(task_line, flush=True)
        print(f"  Model: {MODEL_NAME}", flush=True)
        print(rule, flush=True)

        run = 0
        while run < NUM_RUNS:
            run += 1
            print(f"\n  --- Run {run}/{NUM_RUNS} ---", flush=True)

            env = await QuantumOpenenvEnv.from_docker_image(
                IMAGE_NAME,
                env_vars={"QUANTUM_TASK": task_name},
            )
            try:
                episode_score = await run_episode(
                    task_name=task_name,
                    run_number=run,
                    seed=TASK_SEEDS[task_name],
                    env=env,
                    client=llm_client,
                )
                scores_by_task[task_name].append(episode_score)
            finally:
                # Closing is best-effort; a failure must not mask the episode result.
                try:
                    await env.close()
                except Exception as close_err:
                    print(f"[DEBUG] env.close() error: {close_err}", flush=True)

    # ── Summary table ──────────────────────────────────────────────────────
    print(f"\n{rule}", flush=True)
    print("  BASELINE RESULTS SUMMARY", flush=True)
    print(f"  Model       : {MODEL_NAME}", flush=True)
    print(f"  Temperature : {TEMPERATURE}", flush=True)
    print(f"  Runs/task   : {NUM_RUNS}", flush=True)
    seeds_line = (
        f"  Seeds       : easy={TASK_SEEDS['easy']}  "
        f"medium={TASK_SEEDS['medium']}  hard={TASK_SEEDS['hard']}"
    )
    print(seeds_line, flush=True)
    print(rule, flush=True)
    print(f"  {'Task':<10} {'Avg Score':>10}  {'Min':>6}  {'Max':>6}  Result", flush=True)
    print("  " + "-" * 50, flush=True)
    for task_name in ALL_TASKS:
        task_scores = scores_by_task[task_name]
        if task_scores:
            avg = sum(task_scores) / len(task_scores)
            lowest = min(task_scores)
            highest = max(task_scores)
        else:
            # No completed runs: report zeros rather than dividing by zero.
            avg = lowest = highest = 0.0
        if avg >= SUCCESS_SCORE_THRESHOLD:
            status = "PASS ✓"
        else:
            status = "FAIL ✗"
        print(
            f"  {task_name:<10} {avg:>10.3f}  {lowest:>6.3f}  {highest:>6.3f}  {status}",
            flush=True,
        )
    print(f"{rule}\n", flush=True)


# Script entry point: run the whole baseline inside a new asyncio event loop.
if __name__ == "__main__":
    asyncio.run(main())