"""Audit LLM client: Modal HTTP endpoint, local Ollama, or a deterministic mock fallback."""

from __future__ import annotations

import json
import os
import urllib.error
import urllib.request
from typing import Any

from merge import parse_llm_json
from prompts import FEW_SHOT_ASSISTANT, build_messages


def get_modal_url() -> str | None:
    """Return the configured Modal audit endpoint URL, if any.

    ``MODAL_AUDIT_URL`` is used when it holds a non-empty value; otherwise the
    value of ``MODAL_AUDIT_ENDPOINT`` is returned as-is (``None`` when unset).
    """
    preferred = os.environ.get("MODAL_AUDIT_URL")
    if preferred:
        return preferred
    # Fall back to the alternate variable name.
    return os.environ.get("MODAL_AUDIT_ENDPOINT")


def get_modal_timeout() -> float:
    """Return the Modal request timeout in seconds.

    Reads ``MODAL_AUDIT_TIMEOUT``. An unset, empty, or whitespace-only value
    falls back to the 300-second default.

    Raises:
        ValueError: If the variable is set to something that is not a
            positive, finite number. The message names the variable.
    """
    # A cold Modal container loads the quantized model (and on a fresh deploy
    # pulls the GGUF), which can push the first request past a minute. Generous default.
    raw = os.environ.get("MODAL_AUDIT_TIMEOUT", "").strip()
    if not raw:
        # e.g. `MODAL_AUDIT_TIMEOUT=` in a .env file: treat as unset, not as an error.
        return 300.0
    try:
        seconds = float(raw)
    except ValueError as exc:
        raise ValueError(
            f"MODAL_AUDIT_TIMEOUT must be a number of seconds, got {raw!r}"
        ) from exc
    # NaN fails both comparisons, so it is rejected together with inf, zero and
    # negatives -- none of which the socket layer accepts as a blocking timeout.
    if not 0 < seconds < float("inf"):
        raise ValueError(
            f"MODAL_AUDIT_TIMEOUT must be a positive, finite number of seconds, got {raw!r}"
        )
    return seconds


def get_ollama_model() -> str | None:
    """Return the Ollama model tag from ``OLLAMA_MODEL`` (e.g. 'gemma4:e4b').

    ``None`` means the variable is unset; a non-empty tag is what selects the
    local-LLM path when no Modal URL is configured.
    """
    model_tag = os.getenv("OLLAMA_MODEL")
    return model_tag


def get_ollama_url() -> str:
    """Return the base URL of the Ollama server.

    Reads ``OLLAMA_URL``, with surrounding whitespace stripped. An unset,
    empty, or whitespace-only value falls back to ``http://localhost:11434``
    so that a blank entry in an env file does not produce a scheme-less
    request URL.
    """
    configured = (os.environ.get("OLLAMA_URL") or "").strip()
    return configured or "http://localhost:11434"


def get_ollama_timeout() -> float:
    """Return the Ollama request timeout in seconds.

    Reads ``OLLAMA_TIMEOUT``. An unset, empty, or whitespace-only value falls
    back to the 300-second default.

    Raises:
        ValueError: If the variable is set to something that is not a
            positive, finite number. The message names the variable.
    """
    # Local CPU/Metal inference is slow: a full audit is ~2.5 min on an M1 with
    # gemma3:4b, and the first (cold) call adds model-load time. Generous default.
    raw = os.environ.get("OLLAMA_TIMEOUT", "").strip()
    if not raw:
        # e.g. `OLLAMA_TIMEOUT=` in a .env file: treat as unset, not as an error.
        return 300.0
    try:
        seconds = float(raw)
    except ValueError as exc:
        raise ValueError(
            f"OLLAMA_TIMEOUT must be a number of seconds, got {raw!r}"
        ) from exc
    # NaN fails both comparisons, so it is rejected together with inf, zero and
    # negatives -- none of which the socket layer accepts as a blocking timeout.
    if not 0 < seconds < float("inf"):
        raise ValueError(
            f"OLLAMA_TIMEOUT must be a positive, finite number of seconds, got {raw!r}"
        )
    return seconds


def backend_label() -> str:
    """Describe, for humans, which inference backend is currently configured.

    Precedence: a configured Modal URL first, then a configured Ollama model,
    and finally the mock (with a hint on how to enable a real backend).
    """
    if not get_modal_url():
        ollama_tag = get_ollama_model()
        if not ollama_tag:
            return "mock LLM (set MODAL_AUDIT_URL or OLLAMA_MODEL)"
        return f"local Ollama ({ollama_tag})"
    return "live Gemma 4 E4B on Modal"


def call_modal_audit(
    platform: str,
    goal: str,
    audience: str,
    post: str,
    *,
    timeout: float | None = None,
) -> dict[str, Any]:
    """Dispatch to a backend: Modal endpoint, local Ollama, or deterministic mock.

    Backend selection: a configured Modal URL wins; otherwise a configured
    Ollama model selects the local path; otherwise the mock response is used.

    Args:
        platform: Target platform of the post; forwarded to the backend.
        goal: Stated goal of the post; forwarded to the backend.
        audience: Intended audience; forwarded to the backend.
        post: The post text to audit.
        timeout: Request timeout in seconds. ``None`` means "use the selected
            backend's own default" (``get_modal_timeout()`` for Modal, the
            Ollama default for the local path). Ignored by the mock.

    Returns:
        The audit result. For Modal, this is the decoded JSON response, or,
        when the response is an object with a ``"raw"`` key, the result of
        ``parse_llm_json`` applied to that value.

    Raises:
        RuntimeError: If Modal answers with an HTTP error status (the message
            includes the status code and response body).

    Other ``urllib`` errors on the Modal path (e.g. ``URLError`` for an
    unreachable host) are not wrapped and propagate unchanged.
    """
    url = get_modal_url()
    if not url:
        if get_ollama_model():
            # Forward the caller's timeout; None lets the Ollama path pick its
            # own default instead of silently dropping an explicit value.
            return _call_ollama(platform, goal, audience, post, timeout=timeout)
        return _mock_llm_response(platform, goal, audience, post)

    # Resolve the Modal default only on the Modal path, so a malformed
    # MODAL_AUDIT_TIMEOUT cannot break the Ollama or mock backends.
    if timeout is None:
        timeout = get_modal_timeout()

    payload = json.dumps(
        {
            "platform": platform,
            "goal": goal,
            "audience": audience,
            "post": post,
        }
    ).encode("utf-8")
    # Modal serves a single fastapi_endpoint at the root of its URL — no path suffix.
    headers = {"Content-Type": "application/json"}
    token = os.environ.get("MODAL_AUDIT_TOKEN")
    if token:
        headers["X-Audit-Token"] = token
    req = urllib.request.Request(
        url.rstrip("/"),
        data=payload,
        headers=headers,
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Modal HTTP {exc.code}: {body}") from exc

    # Only a JSON object can carry the "raw" envelope; the isinstance guard
    # avoids a substring/element test when the response is a string or list.
    if isinstance(data, dict) and "raw" in data:
        return parse_llm_json(data["raw"])
    return data


def _ollama_chat(model: str, messages: list[dict[str, str]], timeout: float) -> str:
    """POST messages to Ollama's /api/chat with JSON-constrained output; return content.

    Args:
        model: Ollama model tag to run.
        messages: Chat messages, sent as the request's ``messages`` field.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        The ``message.content`` string of the response, or ``""`` when the
        response has no such string (missing, null, or unexpected shape).

    Raises:
        RuntimeError: On an HTTP error status, when the server cannot be
            reached, or when it does not answer within ``timeout``.
    """
    # Function-scope import: needed only to name the timeout exception on
    # interpreters where socket.timeout is not yet an alias of TimeoutError.
    import socket

    base_url = get_ollama_url()
    body = json.dumps(
        {
            "model": model,
            "messages": messages,
            "stream": False,
            "format": "json",  # constrain output to valid JSON
            "options": {"temperature": 0, "num_predict": 2048},
        }
    ).encode("utf-8")
    req = urllib.request.Request(
        base_url.rstrip("/") + "/api/chat",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Ollama HTTP {exc.code}: {detail}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Cannot reach Ollama at {base_url} — is it running? ({exc.reason})"
        ) from exc
    except (TimeoutError, socket.timeout) as exc:
        # With "stream": False the reply arrives only after generation ends, so
        # a slow model times out while waiting for the response. That timeout
        # is raised directly rather than wrapped in URLError.
        raise RuntimeError(
            f"Ollama at {base_url} did not respond within {timeout}s — "
            "increase OLLAMA_TIMEOUT for slow hardware or cold model loads"
        ) from exc

    # Tolerate a null/missing "message" or "content" instead of raising
    # AttributeError or returning None from a function annotated as -> str.
    message = data.get("message") if isinstance(data, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return content if isinstance(content, str) else ""


def _call_ollama(
    platform: str,
    goal: str,
    audience: str,
    post: str,
    *,
    timeout: float | None = None,
) -> dict[str, Any]:
    """Audit a post with the locally configured Ollama model.

    Builds the chat messages, asks the model once, and parses the reply. If the
    reply does not parse, the model is asked one more time with an extra
    "JSON only" instruction appended; a second parse failure propagates.
    ``timeout=None`` selects ``get_ollama_timeout()``. The first call may be
    slow while the model loads.
    """
    effective_timeout = get_ollama_timeout() if timeout is None else timeout
    model_tag = get_ollama_model()
    conversation = build_messages(platform, goal, audience, post)

    first_reply = _ollama_chat(model_tag, conversation, effective_timeout)
    try:
        parsed = parse_llm_json(first_reply)
    except (json.JSONDecodeError, ValueError):
        # Single retry with an explicit instruction (the original note says
        # this mirrors the Modal path; that is not verifiable from this file).
        json_only_nudge = {
            "role": "user",
            "content": "Return ONLY valid JSON matching the schema. No other text.",
        }
        second_reply = _ollama_chat(
            model_tag, conversation + [json_only_nudge], effective_timeout
        )
        return parse_llm_json(second_reply)
    return parsed


def _mock_llm_response(
    platform: str,
    goal: str,
    audience: str,
    post: str,
) -> dict[str, Any]:
    """Return a canned audit that depends only on the post text.

    Posts mentioning "link in bio" or "webinar" (case-insensitive) get the
    parsed few-shot example; every other post gets a fixed generic audit.
    A fresh dict is built on each call.
    """
    # The brief fields are accepted for signature parity but never consulted.
    del platform, goal, audience

    text = post.lower()
    if any(marker in text for marker in ("link in bio", "webinar")):
        return json.loads(FEW_SHOT_ASSISTANT)

    # (key, score, rationale) rows for the goal-alignment dimensions.
    dimension_rows = (
        ("hook", 2, "Opening lacks a clear stake or benefit."),
        ("clarity", 2, "Multiple topics mixed in one dump."),
        ("audienceFit", 3, "Jargon may fit insiders but structure is rough."),
        ("goalService", 2, "Does not clearly drive the stated goal actions."),
        ("cta", 2, "Call to action is logistics-only or missing deadline."),
    )
    # (code, message) rows; every mock warning shares severity and source.
    warning_rows = (
        ("MIXED_MESSAGES", "Artifact, task, and logistics appear mixed."),
        ("NO_CLEAR_CTA", "No explicit personal action with a deadline."),
    )

    brief_check = {
        "status": "ok",
        "inferred": {
            "goal": "Unclear from post — needs editor review",
            "audience": "In-context colleagues",
        },
        "gaps": [],
    }
    goal_alignment = {
        "overall": 40,
        "cappedBy": [],
        "dimensions": [
            {"key": key, "score": score, "rationale": rationale}
            for key, score, rationale in dimension_rows
        ],
        "summary": "Mock audit (set MODAL_AUDIT_URL for live Gemma 4 E4B). Post needs structure and a clearer CTA.",
    }
    warnings = [
        {"code": code, "severity": "warning", "source": "llm", "message": message}
        for code, message in warning_rows
    ]
    rewrite_hints = [
        "Lead with why the reader should act now.",
        "Separate artifact, task, and logistics into sections.",
        "Add one explicit CTA with a deadline.",
    ]
    return {
        "briefCheck": brief_check,
        "auditReport": {
            "goalAlignment": goal_alignment,
            "warnings": warnings,
            "rewriteHints": rewrite_hints,
        },
    }