"""Score the fine-tune on Modal: serve the GGUF with llama-server, run eval.py.

Mirrors the deployment exactly — same official llama.cpp server image and the same
`-hf REPO:QUANT` load the Space uses — then runs training/eval.py against its
OpenAI-compatible endpoint. Text-only by default; --vision loads the mmproj
projector and feeds each thread as a rendered screenshot instead of text (run
training/render_screenshots.py first). The GGUF download is cached in a Volume.

  modal run training/modal_eval.py                                  # the fine-tune
  modal run training/modal_eval.py --model-hf-repo unsloth/gemma-4-31B-it-GGUF  # baseline
  modal run training/modal_eval.py --model-file gemma-cal-e4b-Q4_K_M.gguf --vision
  modal run training/modal_eval.py --eval-path training/data/eval_unstructured.jsonl
"""
from __future__ import annotations

import os
import shutil
import subprocess
import time
import urllib.request
from pathlib import Path

import modal

# Two levels up from this file: the directory that contains this script's parent
# directory (the docstring invokes it as training/modal_eval.py, so the repo root).
REPO_ROOT = Path(__file__).resolve().parent.parent

# Same llama-server the Space runs; add Python + the (light) eval deps.
# The whole repo is copied to /root/repo in the image (evaluate() runs
# training/eval.py from there); VCS metadata, bytecode caches, local GGUF files,
# training outputs and the smcalflow cache are excluded from the copy.
image = (
    modal.Image.from_registry("ghcr.io/ggml-org/llama.cpp:server-cuda", add_python="3.11")
    .entrypoint([])  # clear the base image's llama-server ENTRYPOINT so Modal can run python
    .pip_install("requests", "pydantic>=2", "huggingface_hub", "python-dateutil>=2.9")
    .add_local_dir(
        str(REPO_ROOT), "/root/repo",
        ignore=[".git", "**/__pycache__", "**/*.gguf", "training/outputs",
                "training/data/.smcalflow_cache"],
    )
)
# Every function registered on this app uses the image above by default.
app = modal.App("imessage-cal-eval", image=image)
# Named volumes, created on first use. evaluate() mounts hf_cache at /cache/hf
# (Hugging Face download cache) and outputs_vol at /outputs.
hf_cache = modal.Volume.from_name("imessage-cal-hf-cache", create_if_missing=True)
outputs_vol = modal.Volume.from_name("imessage-cal-outputs", create_if_missing=True)


def _wait_for_health(proc, url, max_wait_s, poll_interval_s=2.0):
    """Poll `url` until it answers HTTP 200, `proc` exits, or `max_wait_s` elapses.

    The wait is bounded by wall-clock time (not by iteration count), so slow
    probes that run into the per-request timeout cannot stretch the total wait.

    Returns True once the endpoint reports 200, False if the process exited
    first or the deadline passed.
    """
    started = time.monotonic()
    while time.monotonic() - started < max_wait_s:
        if proc.poll() is not None:
            print("[eval] ERROR: llama-server exited early", flush=True)
            return False
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    elapsed = int(time.monotonic() - started)
                    print(f"[eval] llama-server ready after ~{elapsed}s", flush=True)
                    return True
        except Exception:  # noqa: BLE001  (refused / 503 while loading -> retry)
            pass
        # Sleep on every not-ready outcome so an unexpected non-200 reply
        # cannot turn this into a busy loop.
        time.sleep(poll_interval_s)
    return False


def _stop_server(proc, grace_s=10.0):
    """Terminate `proc` and reap it, escalating to kill after `grace_s` seconds."""
    if proc.poll() is not None:
        return  # already exited; nothing to stop
    proc.terminate()
    try:
        proc.wait(timeout=grace_s)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()


@app.function(
    gpu="A100-80GB",  # Modal GPU type (not the HF Spaces "a100-large" flavor)
    timeout=60 * 60,
    secrets=[modal.Secret.from_name("huggingface")],
    volumes={"/cache/hf": hf_cache, "/outputs": outputs_vol},
)
def evaluate(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf",
             model_quant: str = "Q4_K_M",
             model_file: str = "",
             minimal_prompt: bool = False,
             vision: bool = False,
             mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
             mmproj_file: str = "mmproj-F16.gguf",
             eval_path: str = "",
             title_polish: bool = False) -> str:
    """Serve a GGUF with llama-server on 127.0.0.1:8080 and run training/eval.py against it.

    Args:
        model_hf_repo: Hugging Face repo to load the model from.
        model_quant: Quant tag used for the `-hf REPO:QUANT` load when no
            `model_file` is given.
        model_file: Specific GGUF to serve via `-m`. An absolute path is used
            as-is; any other value is downloaded from `model_hf_repo`.
        minimal_prompt: Export MINIMAL_PROMPT=1 to the server and eval.py.
        vision: Download `mmproj_file` from `mmproj_repo`, pass it to the server
            as `--mmproj`, and export VISION=1 to eval.py.
        mmproj_repo: Repo holding the multimodal projector (used when `vision`).
        mmproj_file: Projector filename inside `mmproj_repo`.
        eval_path: If non-empty, exported as EVAL_PATH.
        title_polish: Export TITLE_POLISH=1 to eval.py and tag the label "+titles".

    Returns:
        The captured stdout of training/eval.py.

    Raises:
        RuntimeError: llama-server exited or did not report healthy in time.
    """
    workspace = "/root/repo"
    health_url = "http://127.0.0.1:8080/health"
    max_startup_wait_s = 900 * 2  # model download (~18.7GB first run) + load can take minutes

    env = {**os.environ, "HF_HOME": "/cache/hf"}
    if minimal_prompt:
        env["MINIMAL_PROMPT"] = "1"  # eval.py drops the system prompt for both models
    if eval_path:
        env["EVAL_PATH"] = eval_path  # e.g. training/data/eval_unstructured.jsonl
    ls = shutil.which("llama-server") or "/app/llama-server"
    # Join only non-empty entries: an empty LD_LIBRARY_PATH component (e.g. a
    # trailing ':') makes the loader search the current working directory.
    lib_dirs = (os.path.dirname(ls), "/app", env.get("LD_LIBRARY_PATH", ""))
    env["LD_LIBRARY_PATH"] = ":".join(d for d in lib_dirs if d)

    # Serve a specific file via -m: an absolute path reads straight off the outputs
    # volume (e.g. a staging GGUF the gate already deleted from HF), otherwise the
    # file is fetched from `model_hf_repo`. No file -> -hf REPO:QUANT.
    if model_file:
        if model_file.startswith("/"):
            path = model_file
            label = f"volume:{os.path.basename(model_file)}"
        else:
            from huggingface_hub import hf_hub_download
            path = hf_hub_download(model_hf_repo, model_file, cache_dir="/cache/hf")
            label = f"{model_hf_repo}/{model_file}"
        load_args = ["-m", path]
    else:
        label = f"{model_hf_repo}:{model_quant}"
        load_args = ["-hf", f"{model_hf_repo}:{model_quant}"]
    if vision:
        # The vision arm: load the projector so llama-server accepts image_url
        # content; eval.py (VISION=1) then sends each thread as a screenshot only.
        # Default projector mirrors the Space's Dockerfile: the E4B fine-tune pairs
        # with the BASE E4B's mmproj (unsloth repo), not the 31B one in ours.
        from huggingface_hub import hf_hub_download
        mmproj_path = hf_hub_download(mmproj_repo, mmproj_file, cache_dir="/cache/hf")
        load_args += ["--mmproj", mmproj_path]
        label += "+vision"
    print(f"[eval] launching {ls} ({label}, {'vision' if vision else 'text-only'})", flush=True)
    proc = subprocess.Popen(
        [ls, *load_args,
         "--host", "127.0.0.1", "--port", "8080", "-ngl", "999", "-c", "8192", "--jinja"],
        env=env,
    )
    # Everything after the launch runs under try/finally so the server is
    # stopped and reaped on every exit path, including startup failure.
    try:
        ready = _wait_for_health(proc, health_url, max_startup_wait_s)
        hf_cache.commit()  # persist the downloaded GGUF for next run
        if not ready:
            raise RuntimeError("llama-server never became healthy")

        if title_polish:
            label += "+titles"
        env2 = {**env, "INFERENCE_BASE_URL": "http://127.0.0.1:8080/v1", "MODEL_LABEL": label}
        if vision:
            env2["VISION"] = "1"
        if title_polish:
            env2["TITLE_POLISH"] = "1"
        result = subprocess.run(["python3", "training/eval.py"], cwd=workspace, env=env2,
                                capture_output=True, text=True)
        print(result.stdout, flush=True)
        if result.stderr:
            print("STDERR:", result.stderr[-3000:], flush=True)
        if result.returncode != 0:
            # Surface the failure without changing the return contract.
            print(f"[eval] WARNING: eval.py exited with code {result.returncode}", flush=True)
        return result.stdout
    finally:
        _stop_server(proc)


@app.local_entrypoint()
def main(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf", model_quant: str = "Q4_K_M",
         model_file: str = "", minimal_prompt: bool = False, vision: bool = False,
         mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
         mmproj_file: str = "mmproj-F16.gguf", eval_path: str = "",
         title_polish: bool = False):
    """Local entrypoint: forward every option to the remote `evaluate` and print its result."""
    # Options are passed through unchanged, one keyword per parameter of evaluate().
    remote_kwargs = {
        "model_hf_repo": model_hf_repo,
        "model_quant": model_quant,
        "model_file": model_file,
        "minimal_prompt": minimal_prompt,
        "vision": vision,
        "mmproj_repo": mmproj_repo,
        "mmproj_file": mmproj_file,
        "eval_path": eval_path,
        "title_polish": title_polish,
    }
    report = evaluate.remote(**remote_kwargs)
    print(report)