"""LLM provider abstraction with OpenAI, Mock, and Anthropic (stub) implementations."""

from __future__ import annotations

import asyncio
import json
import time
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator

import structlog

from agent_bench.core.config import AppConfig, load_config
from agent_bench.core.types import (
    CompletionResponse,
    Message,
    Role,
    TokenUsage,
    ToolCall,
    ToolDefinition,
)

log = structlog.get_logger()


class ProviderTimeoutError(Exception):
    """Raised when the LLM provider times out."""


class ProviderRateLimitError(Exception):
    """Raised when the LLM provider returns a rate limit / quota error."""


# --- Pure formatting functions (used by providers and tests directly) ---


def format_tools_openai(tools: list[ToolDefinition]) -> list[dict]:
    """Format tool definitions into OpenAI function-calling schema."""
    return [
        {
            "type": "function",
            "function": {
                "name": t.name,
                "description": t.description,
                "parameters": t.parameters,
            },
        }
        for t in tools
    ]


def format_messages_openai(messages: list[Message]) -> list[dict]:
    """Format internal Message objects into OpenAI chat message dicts."""
    formatted = []
    for m in messages:
        msg: dict = {"role": m.role.value, "content": m.content}
        if m.tool_call_id:
            msg["tool_call_id"] = m.tool_call_id
        if m.tool_calls:
            msg["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {
                        "name": tc.name,
                        "arguments": json.dumps(tc.arguments),
                    },
                }
                for tc in m.tool_calls
            ]
        formatted.append(msg)
    return formatted


# --- Provider interface ---


class LLMProvider(ABC):
    """Async LLM provider interface."""

    @abstractmethod
    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse: ...

    @abstractmethod
    async def stream_complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> AsyncIterator[str]:
        """Yield content chunks as they arrive from the provider."""
        ...
        yield ""  # pragma: no cover

    @abstractmethod
    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]: ...

    async def health_check(self) -> bool:
        """Check if the upstream provider is reachable.

        Returns True if the provider can serve requests, False otherwise.
        Default implementation returns True (assume healthy). Providers
        should override this to perform a real connectivity check.
        """
        return True


# --- Implementations ---


class MockProvider(LLMProvider):
    """Deterministic provider for testing.

    Behavior:
    - If tools are provided AND no Role.TOOL messages exist -> returns tool_calls
    - If Role.TOOL messages exist OR no tools -> returns final text answer
    """

    def __init__(self) -> None:
        self.call_count = 0

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        self.call_count += 1
        has_tool_results = any(m.role == Role.TOOL for m in messages)

        if tools and not has_tool_results:
            return CompletionResponse(
                content="",
                tool_calls=[
                    ToolCall(
                        id=f"call_mock_{self.call_count}",
                        name=tools[0].name,
                        arguments={"query": "mock search query"},
                    )
                ],
                usage=TokenUsage(
                    input_tokens=150,
                    output_tokens=25,
                    estimated_cost_usd=0.0001,
                ),
                provider="mock",
                model="mock-1",
                latency_ms=1.0,
            )

        return CompletionResponse(
            content="Based on the documentation, path parameters in FastAPI are defined "
            "using curly braces in the path string. [source: fastapi_path_params.md]",
            tool_calls=[],
            usage=TokenUsage(
                input_tokens=200,
                output_tokens=50,
                estimated_cost_usd=0.0002,
            ),
            provider="mock",
            model="mock-1",
            latency_ms=2.0,
        )

    async def stream_complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> AsyncIterator[str]:
        """Yield mock response in exactly 3 deterministic chunks."""
        chunks = [
            "Based on the documentation, path parameters in FastAPI ",
            "are defined using curly braces in the path string. ",
            "[source: fastapi_path_params.md]",
        ]
        for chunk in chunks:
            yield chunk
            await asyncio.sleep(0.01)

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        return format_tools_openai(tools)


class OpenAIProvider(LLMProvider):
    """OpenAI API provider pinned to a dated gpt-4o-mini snapshot.

    The ``model`` parameter overrides the default pin (used by the
    calibration runner so a row config's ``model_id`` is what actually
    gets called — without an override, ``judge_id`` would be a label
    that disagrees with the API request, breaking provenance).
    """

    def __init__(
        self, config: AppConfig | None = None, *, model: str | None = None
    ) -> None:
        try:
            from openai import AsyncOpenAI
        except ImportError as e:
            raise ImportError("openai package required: pip install openai") from e

        import os

        self.config = config or load_config()
        api_key = os.environ.get("OPENAI_API_KEY", "")
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model or "gpt-4o-mini-2024-07-18"
        model_pricing = self.config.provider.models.get(self.model)
        self._input_cost = model_pricing.input_cost_per_mtok if model_pricing else 0.15
        self._output_cost = model_pricing.output_cost_per_mtok if model_pricing else 0.60

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        from openai import APITimeoutError, RateLimitError

        formatted_messages = format_messages_openai(messages)
        kwargs: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if tools:
            kwargs["tools"] = self.format_tools(tools)
            kwargs["tool_choice"] = "auto"

        retry_cfg = self.config.retry
        start = time.perf_counter()

        for attempt in range(retry_cfg.max_retries + 1):
            try:
                response = await self.client.chat.completions.create(**kwargs)
                break  # success
            except RateLimitError as e:
                if attempt == retry_cfg.max_retries:
                    log.error("provider_rate_limited",
                              attempts=attempt + 1, error=str(e))
                    raise ProviderRateLimitError(
                        f"Rate limited after {retry_cfg.max_retries} retries: {e}"
                    ) from e
                wait = min(
                    retry_cfg.base_delay * (2 ** attempt),
                    retry_cfg.max_delay,
                )
                log.warning("provider_retry",
                            attempt=attempt + 1, wait_seconds=wait, error=str(e))
                await asyncio.sleep(wait)
            except APITimeoutError as e:
                raise ProviderTimeoutError(f"OpenAI timed out: {e}") from e

        latency_ms = (time.perf_counter() - start) * 1000

        choice = response.choices[0]
        content = choice.message.content or ""
        tool_calls: list[ToolCall] = []

        if choice.message.tool_calls:
            for tc in choice.message.tool_calls:
                try:
                    args = json.loads(tc.function.arguments)
                except json.JSONDecodeError:
                    args = {}
                tool_calls.append(ToolCall(id=tc.id, name=tc.function.name, arguments=args))

        usage = response.usage
        input_tokens = usage.prompt_tokens if usage else 0
        output_tokens = usage.completion_tokens if usage else 0
        cost = (input_tokens * self._input_cost + output_tokens * self._output_cost) / 1_000_000

        return CompletionResponse(
            content=content,
            tool_calls=tool_calls,
            usage=TokenUsage(
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                estimated_cost_usd=cost,
            ),
            provider="openai",
            model=self.model,
            latency_ms=latency_ms,
        )

    async def stream_complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> AsyncIterator[str]:
        """Yield content chunks from OpenAI streaming API.

        Same retry/timeout handling as complete() — wraps the raw openai
        call so 429s and timeouts are translated consistently.
        """
        from openai import APITimeoutError, RateLimitError

        formatted_messages = format_messages_openai(messages)
        kwargs: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": True,
        }
        if tools:
            kwargs["tools"] = self.format_tools(tools)
            kwargs["tool_choice"] = "auto"

        retry_cfg = self.config.retry
        for attempt in range(retry_cfg.max_retries + 1):
            try:
                response = await self.client.chat.completions.create(**kwargs)
                break
            except RateLimitError as e:
                if attempt == retry_cfg.max_retries:
                    raise ProviderRateLimitError(
                        f"Rate limited after {retry_cfg.max_retries} retries: {e}"
                    ) from e
                wait = min(
                    retry_cfg.base_delay * (2 ** attempt),
                    retry_cfg.max_delay,
                )
                log.warning("provider_stream_retry",
                            attempt=attempt + 1, wait_seconds=wait)
                await asyncio.sleep(wait)
            except APITimeoutError as e:
                raise ProviderTimeoutError(f"OpenAI timed out: {e}") from e

        async for chunk in response:
            if chunk.choices and chunk.choices[0].delta.content:
                yield chunk.choices[0].delta.content

    async def health_check(self) -> bool:
        try:
            await self.client.models.retrieve(self.model)
            return True
        except Exception:
            return False

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        return format_tools_openai(tools)


def format_tools_anthropic(tools: list[ToolDefinition]) -> list[dict]:
    """Format tool definitions into Anthropic tool schema."""
    return [
        {
            "name": t.name,
            "description": t.description,
            "input_schema": t.parameters,
        }
        for t in tools
    ]


def format_messages_anthropic(
    messages: list[Message],
) -> tuple[str, list[dict]]:
    """Extract system prompt and format messages for Anthropic API.

    Returns (system_prompt, messages) where system is separated out
    because Anthropic uses a system= parameter, not a message role.
    """
    system_prompt = ""
    formatted = []

    for m in messages:
        if m.role == Role.SYSTEM:
            system_prompt = m.content
            continue

        if m.role == Role.TOOL:
            # Anthropic: tool results are user messages with tool_result blocks
            formatted.append({
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": m.tool_call_id,
                        "content": m.content,
                    }
                ],
            })
        elif m.role == Role.ASSISTANT and m.tool_calls:
            # Assistant message with tool_use content blocks
            content: list[dict] = []
            if m.content:
                content.append({"type": "text", "text": m.content})
            for tc in m.tool_calls:
                content.append({
                    "type": "tool_use",
                    "id": tc.id,
                    "name": tc.name,
                    "input": tc.arguments,
                })
            formatted.append({"role": "assistant", "content": content})
        else:
            formatted.append({
                "role": m.role.value,
                "content": m.content,
            })

    return system_prompt, formatted


class AnthropicProvider(LLMProvider):
    """Anthropic Claude provider.

    The ``model`` parameter overrides the default pin (used by the
    calibration runner so a row config's ``model_id`` is what actually
    gets called — without an override, ``judge_id`` would be a label
    that disagrees with the API request, breaking provenance).
    """

    def __init__(
        self, config: AppConfig | None = None, *, model: str | None = None
    ) -> None:
        try:
            from anthropic import AsyncAnthropic
        except ImportError as e:
            raise ImportError(
                "anthropic package required: pip install anthropic"
            ) from e

        import os

        self.config = config or load_config()
        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
        self.client = AsyncAnthropic(api_key=api_key)
        self.model = model or "claude-haiku-4-5-20251001"
        model_pricing = self.config.provider.models.get(self.model)
        self._input_cost = (
            model_pricing.input_cost_per_mtok if model_pricing else 0.80
        )
        self._output_cost = (
            model_pricing.output_cost_per_mtok if model_pricing else 4.0
        )

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        from anthropic import APITimeoutError, RateLimitError

        system_prompt, formatted_messages = format_messages_anthropic(messages)
        kwargs: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if system_prompt:
            kwargs["system"] = system_prompt
        if tools:
            kwargs["tools"] = self.format_tools(tools)

        retry_cfg = self.config.retry
        start = time.perf_counter()

        for attempt in range(retry_cfg.max_retries + 1):
            try:
                response = await self.client.messages.create(**kwargs)
                break
            except RateLimitError as e:
                if attempt == retry_cfg.max_retries:
                    log.error(
                        "provider_rate_limited",
                        attempts=attempt + 1,
                        error=str(e),
                    )
                    raise ProviderRateLimitError(
                        f"Rate limited after {retry_cfg.max_retries} retries: {e}"
                    ) from e
                wait = min(
                    retry_cfg.base_delay * (2 ** attempt),
                    retry_cfg.max_delay,
                )
                log.warning(
                    "provider_retry",
                    attempt=attempt + 1,
                    wait_seconds=wait,
                )
                await asyncio.sleep(wait)
            except APITimeoutError as e:
                raise ProviderTimeoutError(
                    f"Anthropic timed out: {e}"
                ) from e

        latency_ms = (time.perf_counter() - start) * 1000

        # Parse response content blocks
        content = ""
        tool_calls_out: list[ToolCall] = []
        for block in response.content:
            if block.type == "text":
                content += block.text
            elif block.type == "tool_use":
                tool_calls_out.append(
                    ToolCall(
                        id=block.id,
                        name=block.name,
                        arguments=block.input,
                    )
                )

        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        cost = (
            input_tokens * self._input_cost
            + output_tokens * self._output_cost
        ) / 1_000_000

        return CompletionResponse(
            content=content,
            tool_calls=tool_calls_out,
            usage=TokenUsage(
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                estimated_cost_usd=cost,
            ),
            provider="anthropic",
            model=self.model,
            latency_ms=latency_ms,
        )

    async def stream_complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> AsyncIterator[str]:
        """Yield content chunks from Anthropic streaming API."""
        from anthropic import APITimeoutError, RateLimitError

        system_prompt, formatted_messages = format_messages_anthropic(messages)
        kwargs: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if system_prompt:
            kwargs["system"] = system_prompt
        if tools:
            kwargs["tools"] = self.format_tools(tools)

        # Retry wraps the full stream lifecycle: .stream() returns a manager,
        # but the HTTP request fires in __aenter__. Both must be inside the
        # try/except to catch 429s and timeouts.
        retry_cfg = self.config.retry
        for attempt in range(retry_cfg.max_retries + 1):
            try:
                async with self.client.messages.stream(**kwargs) as s:
                    async for text in s.text_stream:
                        yield text
                return  # success — exit retry loop
            except RateLimitError as e:
                if attempt == retry_cfg.max_retries:
                    raise ProviderRateLimitError(
                        f"Rate limited after {retry_cfg.max_retries} retries: {e}"
                    ) from e
                wait = min(
                    retry_cfg.base_delay * (2 ** attempt),
                    retry_cfg.max_delay,
                )
                log.warning(
                    "provider_stream_retry",
                    attempt=attempt + 1,
                    wait_seconds=wait,
                )
                await asyncio.sleep(wait)
            except APITimeoutError as e:
                raise ProviderTimeoutError(
                    f"Anthropic timed out: {e}"
                ) from e

    async def health_check(self) -> bool:
        try:
            await self.client.models.retrieve(model_id=self.model)
            return True
        except Exception:
            return False

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        return format_tools_anthropic(tools)


class SelfHostedProvider(LLMProvider):
    """Provider targeting any OpenAI-compatible endpoint (vLLM, TGI, Ollama).

    Reads settings from config (provider.selfhosted.*) with env var fallback:
        MODAL_VLLM_URL   -> base_url
        SELFHOSTED_MODEL -> model_name
        MODAL_AUTH_TOKEN  -> api_key

    Tool-calling support is detected lazily on the first complete() call
    with tools. If the endpoint returns a 400 or the model ignores tools,
    subsequent calls fall back to prompt-based tool selection.
    """

    def __init__(self, config: AppConfig | None = None) -> None:
        import os

        import httpx as _httpx

        self.config = config or load_config()
        sh = self.config.provider.selfhosted
        self.base_url = (
            sh.base_url
            or os.environ.get("MODAL_VLLM_URL", "http://localhost:8001/v1")
        )
        self.model = (
            sh.model_name
            if sh.model_name != "mistralai/Mistral-7B-Instruct-v0.3"
            else os.environ.get("SELFHOSTED_MODEL", sh.model_name)
        )
        api_key = sh.api_key or os.environ.get("MODAL_AUTH_TOKEN", "")
        self._supports_tool_calling: bool | None = None  # detected lazily

        model_pricing = self.config.provider.models.get(self.model)
        self._input_cost = model_pricing.input_cost_per_mtok if model_pricing else 0.0
        self._output_cost = model_pricing.output_cost_per_mtok if model_pricing else 0.0

        self.client = _httpx.AsyncClient(
            base_url=self.base_url,
            timeout=sh.timeout_seconds,
            follow_redirects=True,
            headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
        )

    async def _detect_tool_calling(self) -> bool | None:
        """Probe the endpoint for OpenAI-format tool-calling support.

        Returns:
            True  — model responded with tool_calls (definitive: cache it)
            False — endpoint returned 400 (definitive: cache it)
            None  — transient failure (timeout, 5xx, connection error); do NOT cache
        """
        test_tool = {
            "type": "function",
            "function": {
                "name": "test_probe",
                "description": "Probe for tool support",
                "parameters": {
                    "type": "object",
                    "properties": {"x": {"type": "string"}},
                },
            },
        }
        try:
            resp = await self.client.post(
                "/chat/completions",
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "user", "content": "Call the test_probe tool with x='hello'"}
                    ],
                    "tools": [test_tool],
                    "tool_choice": "auto",
                    "max_tokens": 50,
                },
            )
            if resp.status_code == 400:
                log.info("selfhosted_tool_detect", result="unsupported (400)")
                return False
            if resp.status_code >= 500:
                log.warning("selfhosted_tool_detect", result="transient (5xx)")
                return None
            resp.raise_for_status()
            data = resp.json()
            has_tools = bool(
                data["choices"][0]["message"].get("tool_calls")
            )
            log.info("selfhosted_tool_detect", result="supported" if has_tools else "unsupported")
            return has_tools
        except Exception:
            log.warning("selfhosted_tool_detect", result="transient (error)")
            return None

    @staticmethod
    def _sanitize_messages(messages: list[dict]) -> list[dict]:
        """Convert tool-role messages and merge consecutive same-role messages.

        Many models (e.g. Mistral) require strictly alternating user/assistant
        messages. Tool results are converted to user messages and consecutive
        same-role messages are merged.
        """
        sanitized: list[dict] = []
        for m in messages:
            if m["role"] == "tool":
                role = "user"
                content = f"[Tool result]: {m['content']}"
            elif m["role"] == "assistant" and "tool_calls" in m:
                role = "assistant"
                content = m.get("content") or ""
            else:
                role = m["role"]
                content = m.get("content") or ""

            # Merge consecutive same-role messages
            if sanitized and sanitized[-1]["role"] == role and role != "system":
                sanitized[-1]["content"] += "\n\n" + content
            else:
                sanitized.append({"role": role, "content": content})

        # Merge consecutive same-role messages that resulted from dropping empty ones
        merged: list[dict] = []
        for m in sanitized:
            if not m["content"].strip() and m["role"] != "system":
                continue  # drop empty messages
            if merged and merged[-1]["role"] == m["role"] and m["role"] != "system":
                merged[-1]["content"] += "\n\n" + m["content"]
            else:
                merged.append(m)
        return merged

    @staticmethod
    def _tools_as_prompt(tools: list[ToolDefinition]) -> str:
        """Format tools as system prompt text for prompt-based fallback."""
        lines = ["You have access to the following tools:", ""]
        for t in tools:
            lines.append(f"- {t.name}: {t.description}")
            lines.append(f"  Parameters: {json.dumps(t.parameters)}")
        lines.extend([
            "",
            "To use a tool, respond with ONLY this JSON (no other text):",
            '{"tool_calls": [{"name": "tool_name", "arguments": {"param": "value"}}]}',
            "",
            "If you don't need a tool, respond normally with text.",
        ])
        return "\n".join(lines)

    @staticmethod
    def _parse_tool_calls_from_text(text: str) -> list[ToolCall]:
        """Parse tool calls from model text output (prompt-based fallback)."""
        import uuid

        try:
            data = json.loads(text.strip())
            if isinstance(data, dict) and "tool_calls" in data:
                calls = []
                for tc in data["tool_calls"]:
                    raw_args = tc.get("arguments", {})
                    if not isinstance(raw_args, dict):
                        raw_args = {}
                    calls.append(
                        ToolCall(
                            id=f"call_{uuid.uuid4().hex[:8]}",
                            name=tc["name"],
                            arguments=raw_args,
                        )
                    )
                return calls
        except (json.JSONDecodeError, KeyError, TypeError):
            pass
        return []

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        import httpx as _httpx

        # Lazy tool-calling detection on first call with tools
        if tools and self._supports_tool_calling is None:
            result = await self._detect_tool_calling()
            if result is not None:
                self._supports_tool_calling = result
            # If None (transient), leave as None so next call retries

        formatted_messages = format_messages_openai(messages)

        # Use native tools only when detection confirmed support.
        # When detection is None (transient failure), fall back to prompt-based
        # rather than risk a 400 with native tools on an unsupported endpoint.
        use_native_tools = tools and self._supports_tool_calling is True
        if tools and not use_native_tools:
            tool_prompt = self._tools_as_prompt(tools)
            # Merge tool instructions into existing system message (some models
            # like Mistral reject multiple system messages in their chat template)
            if formatted_messages and formatted_messages[0]["role"] == "system":
                formatted_messages[0]["content"] = (
                    tool_prompt + "\n\n" + formatted_messages[0]["content"]
                )
            else:
                formatted_messages = [
                    {"role": "system", "content": tool_prompt},
                    *formatted_messages,
                ]
        # Always sanitize for self-hosted: messages may contain tool/tool_calls
        # from earlier iterations even when current call has tools=None
        formatted_messages = self._sanitize_messages(formatted_messages)

        payload: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if use_native_tools and tools:
            payload["tools"] = self.format_tools(tools)
            payload["tool_choice"] = "auto"

        retry_cfg = self.config.retry
        start = time.perf_counter()

        for attempt in range(retry_cfg.max_retries + 1):
            try:
                resp = await self.client.post("/chat/completions", json=payload)
                if resp.status_code == 429:
                    if attempt == retry_cfg.max_retries:
                        raise ProviderRateLimitError(
                            f"Rate limited after {retry_cfg.max_retries} retries"
                        )
                    wait = min(
                        retry_cfg.base_delay * (2 ** attempt), retry_cfg.max_delay
                    )
                    log.warning(
                        "selfhosted_retry",
                        attempt=attempt + 1,
                        wait_seconds=wait,
                    )
                    await asyncio.sleep(wait)
                    continue
                if resp.status_code >= 400:
                    log.error("selfhosted_error", status=resp.status_code, body=resp.text[:500])
                resp.raise_for_status()
                break
            except _httpx.TimeoutException as e:
                raise ProviderTimeoutError(f"Self-hosted timed out: {e}") from e

        latency_ms = (time.perf_counter() - start) * 1000
        data = resp.json()

        choice = data["choices"][0]
        content = choice["message"].get("content") or ""
        tool_calls: list[ToolCall] = []

        if choice["message"].get("tool_calls"):
            # Native tool calling response
            for tc in choice["message"]["tool_calls"]:
                try:
                    args = json.loads(tc["function"]["arguments"])
                except (json.JSONDecodeError, KeyError):
                    args = {}
                tool_calls.append(
                    ToolCall(
                        id=tc["id"],
                        name=tc["function"]["name"],
                        arguments=args,
                    )
                )
        elif tools and not self._supports_tool_calling and content:
            # Prompt-based fallback: parse tool calls from text
            tool_calls = self._parse_tool_calls_from_text(content)
            if tool_calls:
                content = ""  # tool call replaces text content

        usage_data = data.get("usage", {})
        input_tokens = usage_data.get("prompt_tokens", 0)
        output_tokens = usage_data.get("completion_tokens", 0)
        cost = (
            input_tokens * self._input_cost + output_tokens * self._output_cost
        ) / 1_000_000

        return CompletionResponse(
            content=content,
            tool_calls=tool_calls,
            usage=TokenUsage(
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                estimated_cost_usd=cost,
            ),
            provider="selfhosted",
            model=self.model,
            latency_ms=latency_ms,
        )

    async def stream_complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> AsyncIterator[str]:
        import httpx as _httpx

        # Same tool-calling detection/fallback as complete()
        if tools and self._supports_tool_calling is None:
            result = await self._detect_tool_calling()
            if result is not None:
                self._supports_tool_calling = result

        formatted_messages = format_messages_openai(messages)
        use_native_tools = tools and self._supports_tool_calling is True
        if tools and not use_native_tools:
            tool_prompt = self._tools_as_prompt(tools)
            if formatted_messages and formatted_messages[0]["role"] == "system":
                formatted_messages[0]["content"] = (
                    tool_prompt + "\n\n" + formatted_messages[0]["content"]
                )
            else:
                formatted_messages = [
                    {"role": "system", "content": tool_prompt},
                    *formatted_messages,
                ]
        formatted_messages = self._sanitize_messages(formatted_messages)

        payload: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": True,
        }
        if use_native_tools and tools:
            payload["tools"] = self.format_tools(tools)
            payload["tool_choice"] = "auto"

        retry_cfg = self.config.retry
        for attempt in range(retry_cfg.max_retries + 1):
            try:
                async with self.client.stream(
                    "POST", "/chat/completions", json=payload
                ) as resp:
                    if resp.status_code == 429:
                        if attempt == retry_cfg.max_retries:
                            raise ProviderRateLimitError(
                                f"Rate limited after {retry_cfg.max_retries} retries"
                            )
                        wait = min(
                            retry_cfg.base_delay * (2 ** attempt),
                            retry_cfg.max_delay,
                        )
                        log.warning(
                            "selfhosted_stream_retry",
                            attempt=attempt + 1,
                            wait_seconds=wait,
                        )
                        await asyncio.sleep(wait)
                        continue
                    resp.raise_for_status()

                    async for line in resp.aiter_lines():
                        line = line.strip()
                        if not line or not line.startswith("data: "):
                            continue
                        data_str = line[len("data: "):]
                        if data_str == "[DONE]":
                            return
                        try:
                            chunk_data = json.loads(data_str)
                            delta = chunk_data["choices"][0].get("delta", {})
                            if delta.get("content"):
                                yield delta["content"]
                        except (json.JSONDecodeError, KeyError, IndexError):
                            continue
                return  # success — exit retry loop
            except _httpx.TimeoutException as e:
                raise ProviderTimeoutError(f"Self-hosted timed out: {e}") from e

    async def health_check(self) -> bool:
        try:
            resp = await self.client.get("/models", timeout=5.0)
            return resp.status_code == 200
        except Exception:
            return False

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        return format_tools_openai(tools)


def create_provider(config: AppConfig | None = None) -> LLMProvider:
    """Factory: create provider based on config."""
    if config is None:
        config = load_config()
    name = config.provider.default
    if name == "openai":
        return OpenAIProvider(config)
    elif name == "anthropic":
        return AnthropicProvider(config)
    elif name == "selfhosted":
        return SelfHostedProvider(config)
    elif name == "mock":
        return MockProvider()
    else:
        raise ValueError(f"Unknown provider: {name}")