"""
Voice agent for Layer 2 conversations.

Uses Llama 3.1 8B Instruct to act as the customer support agent during
evaluation. Supports both local model and HF Inference API backends.
"""

from __future__ import annotations

import logging
import os
import time
from typing import Any

try:
    from huggingface_hub import InferenceClient
except ImportError:
    InferenceClient = None  # type: ignore

logger = logging.getLogger(__name__)


class HFAgent:
    """
    Voice agent powered by Llama 3.1 8B.

    Takes a system prompt from Layer 1 and generates responses
    in the customer support conversation.

    Supports two backends:
    - local: loads model in-process via transformers (pass local_model=...)
    - api: uses HF Inference API (pass hf_token=...)

    When ``local_model`` is given it always takes precedence and no API
    client is created.
    """

    DEFAULT_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct"

    # HTTP statuses that are treated as transient and therefore retried.
    _RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})
    # Substrings of the error text that mark a transient failure. Kept as a
    # fallback for exceptions that do not expose a structured status code.
    _RETRYABLE_MARKERS = (
        "500", "502", "503", "504", "429", "timeout", "Timeout", "Time-out",
    )

    def __init__(
        self,
        model_id: str | None = None,
        hf_token: str | None = None,
        max_tokens: int = 300,
        temperature: float = 0.3,
        local_model: Any = None,
    ):
        """
        Configure the agent and pick a backend.

        Args:
            model_id: Model identifier sent to the API backend; falls back
                to ``DEFAULT_MODEL`` when empty or None.
            hf_token: API token. When omitted, the ``HF_TOKEN`` environment
                variable is used.
            max_tokens: Generation length limit forwarded to the backend.
            temperature: Sampling temperature forwarded to the backend.
            local_model: Object exposing
                ``generate(messages, max_tokens=..., temperature=...)``.
                When provided, the API client is not created.
        """
        self.model_id = model_id or self.DEFAULT_MODEL
        self.max_tokens = max_tokens
        self.temperature = temperature
        self._local_model = local_model
        self._client: Any = None
        # Always define the attribute so it can be read regardless of which
        # backend is active.
        self.hf_token = hf_token or os.environ.get("HF_TOKEN")

        # InferenceClient is None when huggingface_hub is not installed.
        if local_model is None and self.hf_token and InferenceClient is not None:
            self._client = InferenceClient(token=self.hf_token)

    @property
    def is_llm_available(self) -> bool:
        """Return True when either a local model or an API client is set."""
        return self._local_model is not None or self._client is not None

    @staticmethod
    def _build_messages(
        system_prompt: str,
        conversation_history: list[dict[str, str]],
        observation: dict[str, Any],
    ) -> list[dict[str, str]]:
        """Translate the environment transcript into chat-format messages."""
        role_map = {"customer": "user", "agent": "assistant"}
        messages = [{"role": "system", "content": system_prompt}]

        # Entries whose role is neither "customer" nor "agent" are skipped.
        for msg in conversation_history:
            chat_role = role_map.get(msg["role"])
            if chat_role is not None:
                messages.append({"role": chat_role, "content": msg["content"]})

        # Add the latest customer message from observation
        customer_msg = observation.get("customer_message", "")
        if customer_msg:
            messages.append({"role": "user", "content": customer_msg})
        return messages

    @staticmethod
    def _status_code(exc: BaseException) -> int | None:
        """Return ``exc.response.status_code`` if present and an int, else None."""
        status = getattr(getattr(exc, "response", None), "status_code", None)
        return status if isinstance(status, int) else None

    def __call__(
        self,
        system_prompt: str,
        conversation_history: list[dict[str, str]],
        observation: dict[str, Any],
        max_retries: int = 4,
    ) -> str:
        """
        Generate an agent response.

        Compatible with ConversationEnvironment.run_episode(agent_fn=...).

        Args:
            system_prompt: Content of the leading system message.
            conversation_history: Prior turns; each entry needs ``role``
                ("customer" or "agent") and ``content`` keys.
            observation: Current observation; a non-empty
                ``customer_message`` value is appended as the final user
                message.
            max_retries: Number of retries after the first API attempt for
                transient failures. Negative values are treated as 0.
                Ignored by the local backend.

        Returns:
            The generated reply. For the API backend the text is stripped,
            and a response whose content is None yields an empty string.

        Raises:
            RuntimeError: If no backend is configured, or the API reports
                HTTP 402 / "Payment Required".
            Exception: Any other API error, re-raised unchanged once it is
                non-transient or the retries are exhausted.

        NOTE(review): the observation's customer message is appended even if
        conversation_history already ends with it - confirm against caller.
        """
        messages = self._build_messages(
            system_prompt, conversation_history, observation,
        )

        if self._local_model is not None:
            return self._local_model.generate(
                messages, max_tokens=self.max_tokens, temperature=self.temperature,
            )

        if self._client is None:
            raise RuntimeError(
                "No inference backend available. "
                "Pass local_model=... or set HF_TOKEN for API access."
            )

        # Always make at least one attempt, even for a negative max_retries.
        attempts = max(max_retries, 0) + 1
        for attempt in range(attempts):
            try:
                response = self._client.chat_completion(
                    model=self.model_id,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                )
            except Exception as e:
                err_str = str(e)
                status = self._status_code(e)
                if status == 402 or "402" in err_str or "Payment Required" in err_str:
                    raise RuntimeError(
                        "HF API credits depleted. "
                        "Get more credits at https://huggingface.co/settings/billing"
                    ) from e

                retryable = status in self._RETRYABLE_STATUS or any(
                    marker in err_str for marker in self._RETRYABLE_MARKERS
                )
                # Re-raise immediately on the last attempt: sleeping first
                # would only delay the failure.
                if not retryable or attempt == attempts - 1:
                    raise

                wait = 2 ** (attempt + 1)  # exponential backoff: 2, 4, 8, ...
                logger.warning(
                    "HF API error (attempt %d/%d), retrying in %ds: %s",
                    attempt + 1, attempts, wait, e,
                )
                time.sleep(wait)
                continue

            # Guard against a None content before stripping.
            content = response.choices[0].message.content
            return content.strip() if content else ""

        # Defensive: every iteration above either returns or raises.
        raise RuntimeError("HF API retry loop ended without a result")