"""Modal worker for Tiny Narrator's llama.cpp reader brain.

Deploy:
    modal deploy modal_workers/reader_brain.py

The deployed web server exposes llama.cpp's OpenAI-compatible API. Set:
    LLAMA_CPP_BASE_URL=https://<modal-app-url>/v1
    LLAMA_CPP_MODEL=narrator-brain
    LLAMA_CPP_TOKEN=<same value as Modal secret>

The server intentionally binds to 0.0.0.0 because Modal's web_server proxy
routes traffic to the container port.
"""

import os
from pathlib import Path
import subprocess
import shutil

import modal


# Name passed to modal.App below.
APP_NAME = "tiny-narrator-reader-brain"
# Model reference handed to llama-server's `-hf` option in reader_brain_server.
MODEL_REF = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF:Q4_K_M"
# Value for llama-server's `--alias` option; this is the LLAMA_CPP_MODEL value
# shown in the module docstring.
MODEL_ALIAS = "narrator-brain"
# Port used both for llama-server's `--port` and for modal.web_server.
SERVER_PORT = 8080
# Mount point of the cache volume inside the container; HF_HOME is placed
# beneath it in the image definition.
CACHE_DIR = "/cache"

app = modal.App(APP_NAME)
# Named volume mounted at CACHE_DIR; created on first use if it does not exist.
model_cache = modal.Volume.from_name("tiny-narrator-reader-brain-cache", create_if_missing=True)


def _secret_names() -> list[modal.Secret]:
    """Return the Modal secrets attached to the reader-brain function.

    Despite the name, the list holds ``modal.Secret`` objects rather than
    strings. The single secret is presumably the one that supplies
    ``LLAMA_CPP_TOKEN`` to the container -- verify in the Modal dashboard.
    """
    token_secret = modal.Secret.from_name("tiny-narrator-reader-brain-token")
    return [token_secret]


def _find_llama_server() -> str:
    """Locate the llama-server executable inside the container.

    ``PATH`` is consulted first (via ``shutil.which``), followed by a fixed
    list of well-known install locations. The first candidate that is a
    regular file wins.

    Returns:
        The path of the llama-server binary, as a string.

    Raises:
        FileNotFoundError: If no candidate is a file. The message lists the
            ``*llama*`` entries found in every searched directory, to help
            diagnose a changed image layout.
    """
    fallback_paths = (
        "/app/llama-server",
        "/usr/local/bin/llama-server",
        "/usr/bin/llama-server",
        "/bin/llama-server",
        "/llama-server",
    )
    on_path = shutil.which("llama-server")
    candidates = [on_path, *fallback_paths] if on_path else list(fallback_paths)
    for candidate in candidates:
        # is_file() rather than exists(): a directory that happens to be
        # called "llama-server" cannot be launched.
        if Path(candidate).is_file():
            return candidate

    # Derive the directories to report from the fallback list itself so the
    # diagnostic always covers every location that was probed (including "/").
    searched_dirs = dict.fromkeys(str(Path(entry).parent) for entry in fallback_paths)
    inspected = []
    for directory in searched_dirs:
        path = Path(directory)
        if path.is_dir():
            names = sorted(item.name for item in path.glob("*llama*"))
            inspected.append(f"{directory}: {names}")
    raise FileNotFoundError(f"llama-server binary was not found. Inspected: {'; '.join(inspected)}")


# Container image for the worker: the upstream llama.cpp CUDA 12 server image
# from GHCR, with Python 3.12 added through Modal's `add_python` option.
reader_brain_image = (
    modal.Image.from_registry(
        "ghcr.io/ggml-org/llama.cpp:server-cuda12",
        add_python="3.12",
    )
    # Clear the base image's ENTRYPOINT -- presumably so Modal can start its own
    # process instead of the image launching llama-server directly (confirm).
    .dockerfile_commands("ENTRYPOINT []")
    .env(
        {
            # Placed under CACHE_DIR, where the model_cache volume is mounted.
            # NOTE(review): confirm that llama-server's `-hf` download honours
            # HF_HOME in this image version; otherwise the model is not cached
            # on the volume and is re-downloaded on each cold start.
            "HF_HOME": f"{CACHE_DIR}/huggingface",
        }
    )
)


@app.function(
    image=reader_brain_image,
    gpu="T4",
    volumes={CACHE_DIR: model_cache},
    secrets=_secret_names(),
    timeout=900,
    scaledown_window=300,
    max_containers=1,
)
@modal.concurrent(max_inputs=20)
@modal.web_server(SERVER_PORT, startup_timeout=600)
def reader_brain_server():
    """Start llama-server in the background on SERVER_PORT.

    The process is spawned with ``subprocess.Popen`` and this function returns
    immediately without waiting for it. The server binds to 0.0.0.0 so that
    Modal's web_server proxy can route traffic to the container port.

    If the ``LLAMA_CPP_TOKEN`` environment variable is set, it is passed to
    llama-server as ``--api-key`` and masked in the startup log line. If it is
    missing or empty, the server is still started (as before) but a warning is
    logged, because requests will then not be authenticated by an API key.

    Raises:
        FileNotFoundError: Propagated from ``_find_llama_server`` when the
            binary cannot be located.
    """
    api_key = os.getenv("LLAMA_CPP_TOKEN", "")

    # NOTE(review): Modal admits up to 20 concurrent inputs while llama-server
    # runs with --parallel 1; presumably the extra requests wait inside
    # llama-server -- confirm this is the intended behaviour.
    base_command = [
        _find_llama_server(),
        "--host",
        "0.0.0.0",
        "--port",
        str(SERVER_PORT),
        "-hf",
        MODEL_REF,
        "--alias",
        MODEL_ALIAS,
        "--ctx-size",
        "4096",
        "--parallel",
        "1",
        "--reasoning",
        "off",
        "--n-gpu-layers",
        "999",
    ]

    if api_key:
        # Build the real and the loggable command side by side so the secret
        # never has to be located and masked after the fact.
        command = [*base_command, "--api-key", api_key]
        display_command = [*base_command, "--api-key", "***"]
    else:
        print(
            "[tiny-narrator-reader-brain] WARNING: LLAMA_CPP_TOKEN is not set; "
            "llama-server will start without an API key",
            flush=True,
        )
        command = base_command
        display_command = base_command

    print(f"[tiny-narrator-reader-brain] starting: {' '.join(display_command)}", flush=True)
    # Deliberately not waited on: the web server must keep running after this
    # function returns.
    subprocess.Popen(command)