"""
llama.cpp HTTP client wrapper for FormScout.

Wraps the llama.cpp server's /v1/chat/completions and /embedding endpoints.
Falls back gracefully when the server is unavailable.

Model: Qwen3-VL-8B-Instruct (Q4_K_M GGUF) for VLM inference.
Model: Qwen3-VL-Embedding-8B (Q4_K_M GGUF) for embeddings.
Params: 8B each (shared backbone).
License: Apache-2.0.
"""
from __future__ import annotations

import base64
import json
import logging
from pathlib import Path
from typing import Any

import requests

from formscout import config

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Timeout for the chat-completion requests made by LlamaCppClient.complete().
_TIMEOUT = 120  # seconds — VLM can be slow


class LlamaCppClient:
    """HTTP client for a llama.cpp server instance (chat completions).

    Requests go to the server's OpenAI-compatible ``/v1/chat/completions``
    endpoint. :meth:`complete` does not raise on failure; problems are
    reported through an ``"error"`` key in the returned dict.
    """

    # Strings at least this long are treated as base64 payloads without
    # touching the filesystem; shorter ones are first tried as file paths.
    _MAX_PATH_LEN = 4096

    def __init__(self, host: str | None = None, port: int | None = None):
        """
        Args:
            host: Server host; falls back to ``config.LLAMA_CPP_HOST`` when falsy.
            port: Server port; falls back to ``config.LLAMA_CPP_PORT_VLM`` when falsy.
        """
        self.host = host or config.LLAMA_CPP_HOST
        self.port = port or config.LLAMA_CPP_PORT_VLM
        self.base_url = f"http://{self.host}:{self.port}"

    @property
    def available(self) -> bool:
        """Return True when ``GET /health`` answers with HTTP 200."""
        try:
            r = requests.get(f"{self.base_url}/health", timeout=5)
        except requests.RequestException:
            # Any transport-level problem (refused, timeout, bad URL, ...)
            # means the server cannot be used right now.
            return False
        return r.status_code == 200

    def complete(
        self,
        prompt: str,
        images: list[str] | None = None,
        max_tokens: int = 512,
        temperature: float = 0.1,
        stop: list[str] | None = None,
    ) -> dict[str, Any]:
        """
        Send a chat-completion request (OpenAI-compatible /v1/chat/completions —
        required for multimodal: llama-server routes images through the mmproj
        only on this endpoint). Returns parsed JSON if the response is JSON,
        otherwise returns {"text": raw_text}.

        Args:
            prompt: The text prompt (system + user combined).
            images: Optional list of base64-encoded JPEGs, ``data:`` URLs, or
                file paths.
            max_tokens: Max generation tokens.
            temperature: Sampling temperature.
            stop: Stop sequences (default: none — JSON output must not be truncated).

        Returns:
            The model reply parsed as a JSON object, ``{"text": raw_text}`` when
            the reply is not a JSON object, or ``{"error": message, "text": ""}``
            when anything fails (unreadable image file, unreachable server,
            timeout, HTTP error, unexpected response shape).
        """
        try:
            # Image encoding happens inside the try so that an unreadable
            # file yields an error dict like every other failure.
            content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
            content.extend(
                {
                    "type": "image_url",
                    "image_url": {"url": self._image_to_data_url(img)},
                }
                for img in images or []
            )

            payload: dict[str, Any] = {
                "messages": [{"role": "user", "content": content}],
                "max_tokens": max_tokens,
                "temperature": temperature,
            }
            if stop:
                payload["stop"] = stop

            r = requests.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
                timeout=_TIMEOUT,
            )
            r.raise_for_status()
            result = r.json()
            # "content" may be null in the response; normalise to "".
            text = result["choices"][0]["message"]["content"] or ""
            return self._parse_json_reply(text)
        except requests.ConnectionError:
            logger.warning("llama.cpp server not reachable at %s", self.base_url)
            return {"error": "llama.cpp server not available", "text": ""}
        except requests.Timeout:
            logger.warning(
                "llama.cpp request to %s timed out after %ss", self.base_url, _TIMEOUT
            )
            return {"error": "llama.cpp server timeout", "text": ""}
        except Exception as e:
            # Boundary handler: callers rely on getting a dict back, so the
            # failure is logged and reported instead of raised.
            logger.warning("llama.cpp completion failed: %s", e)
            return {"error": str(e), "text": ""}

    @staticmethod
    def _image_to_data_url(img: str) -> str:
        """Turn a file path, raw base64 string, or ``data:`` URL into a data URL.

        Args:
            img: Path to an image file, a base64-encoded image, or a complete
                ``data:`` URL.

        Returns:
            A ``data:`` URL. Existing ``data:`` URLs are returned unchanged;
            everything else is labelled ``image/jpeg``.

        Raises:
            OSError: If ``img`` names an existing file that cannot be read.
        """
        if img.startswith("data:"):
            return img  # already a data URL — do not add a second prefix

        if len(img) < LlamaCppClient._MAX_PATH_LEN:
            try:
                is_file = Path(img).is_file()
            except (OSError, ValueError):
                # A base64 payload is not a valid path (e.g. a component
                # longer than the filesystem allows); treat it as data.
                is_file = False
            if is_file:
                with open(img, "rb") as f:
                    encoded = base64.b64encode(f.read()).decode("ascii")
                return f"data:image/jpeg;base64,{encoded}"

        return f"data:image/jpeg;base64,{img}"  # already base64

    @staticmethod
    def _parse_json_reply(text: str) -> dict[str, Any]:
        """Parse model output as JSON, tolerating markdown fences.

        Args:
            text: Raw model reply.

        Returns:
            The decoded JSON object when the reply (with an optional
            surrounding code fence removed) is a JSON object; otherwise
            ``{"text": text}`` with the reply untouched.
        """
        stripped = text.strip()
        if stripped.startswith("```"):
            if "\n" in stripped:
                # Drop the whole opening fence line, language tag included.
                stripped = stripped.split("\n", 1)[1]
            else:
                # Single-line fence such as ```json {...}```: drop the
                # backticks and an optional "json" tag.
                stripped = stripped[3:]
                if stripped[:4].lower() == "json":
                    stripped = stripped[4:]
            stripped = stripped.rsplit("```", 1)[0].strip()
        try:
            parsed = json.loads(stripped)
        except (json.JSONDecodeError, TypeError):
            return {"text": text}
        # Only JSON objects count as structured replies; arrays and scalars
        # are handed back as plain text.
        return parsed if isinstance(parsed, dict) else {"text": text}


class EmbeddingClient:
    """HTTP client for the llama.cpp embedding server."""

    def __init__(self, host: str | None = None, port: int | None = None):
        """
        Args:
            host: Server host; falls back to ``config.LLAMA_CPP_HOST`` when falsy.
            port: Server port; falls back to ``config.LLAMA_CPP_PORT_EMBED`` when falsy.
        """
        self.host = host or config.LLAMA_CPP_HOST
        self.port = port or config.LLAMA_CPP_PORT_EMBED
        self.base_url = f"http://{self.host}:{self.port}"

    @property
    def available(self) -> bool:
        """Return True when ``GET /health`` answers with HTTP 200."""
        try:
            r = requests.get(f"{self.base_url}/health", timeout=5)
        except requests.RequestException:
            # Any transport-level problem means the server cannot be used.
            return False
        return r.status_code == 200

    def embed(self, text: str) -> list[float] | None:
        """Get embedding vector for text. Returns None on failure.

        Args:
            text: The text to embed; sent as the ``content`` field of the
                request body.

        Returns:
            The embedding vector, or None when the request fails or the
            response carries no embedding.
        """
        try:
            r = requests.post(
                f"{self.base_url}/embedding",
                json={"content": text},
                timeout=30,
            )
            r.raise_for_status()
            return self._extract_embedding(r.json())
        except Exception as e:
            # Best-effort by contract: log the reason and signal failure
            # with None instead of raising.
            logger.warning("llama.cpp embedding request failed: %s", e)
            return None

    @staticmethod
    def _extract_embedding(data: Any) -> list[float] | None:
        """Pull the embedding vector out of a decoded ``/embedding`` response.

        Accepts both response shapes: a single object
        ``{"embedding": [...]}`` and a list of objects
        ``[{"index": 0, "embedding": [...]}]`` (the first entry is used,
        since only one text is sent per request). A vector wrapped in a
        single-row outer list (``[[...]]``) is unwrapped; a multi-row value
        is passed through unchanged because there is no single vector to pick.

        Returns:
            The embedding, or None when the payload has no usable
            ``"embedding"`` entry.
        """
        if isinstance(data, list):
            data = data[0] if data else None
        if not isinstance(data, dict):
            return None
        vector = data.get("embedding")
        if (
            isinstance(vector, list)
            and len(vector) == 1
            and isinstance(vector[0], list)
        ):
            vector = vector[0]
        return vector