"""Score the fine-tune on Modal: serve the GGUF with llama-server, run eval.py. Mirrors the deployment exactly — same official llama.cpp server image and the same `-hf REPO:QUANT` load the Space uses — then runs training/eval.py against its OpenAI-compatible endpoint. Text-only by default; --vision loads the mmproj projector and feeds each thread as a rendered screenshot instead of text (run training/render_screenshots.py first). The GGUF download is cached in a Volume. modal run training/modal_eval.py # the fine-tune modal run training/modal_eval.py --model-hf-repo unsloth/gemma-4-31B-it-GGUF # baseline modal run training/modal_eval.py --model-file gemma-cal-e4b-Q4_K_M.gguf --vision modal run training/modal_eval.py --eval-path training/data/eval_unstructured.jsonl """ from __future__ import annotations import os import shutil import subprocess import time import urllib.request from pathlib import Path import modal REPO_ROOT = Path(__file__).resolve().parent.parent # Same llama-server the Space runs; add Python + the (light) eval deps. 
# Container image: the official llama.cpp CUDA server image, with Python and the
# packages training/eval.py needs layered on top, plus a copy of this repo.
image = (
    modal.Image.from_registry("ghcr.io/ggml-org/llama.cpp:server-cuda", add_python="3.11")
    .entrypoint([])  # clear the base image's llama-server ENTRYPOINT so Modal can run python
    .pip_install("requests", "pydantic>=2", "huggingface_hub", "python-dateutil>=2.9")
    .add_local_dir(
        str(REPO_ROOT),
        "/root/repo",
        ignore=[
            ".git",
            "**/__pycache__",
            "**/*.gguf",
            "training/outputs",
            "training/data/.smcalflow_cache",
        ],
    )
)

app = modal.App("imessage-cal-eval", image=image)
hf_cache = modal.Volume.from_name("imessage-cal-hf-cache", create_if_missing=True)
outputs_vol = modal.Volume.from_name("imessage-cal-outputs", create_if_missing=True)

_SERVER_HOST = "127.0.0.1"
_SERVER_PORT = "8080"
_SERVER_URL = f"http://{_SERVER_HOST}:{_SERVER_PORT}"
# Wall-clock budget for model download + load. Matches the nominal budget of the
# previous 900 x 2s poll loop, but is enforced on real elapsed time so slow health
# probes cannot stretch the wait past the function timeout.
_STARTUP_BUDGET_S = 900 * 2
_POLL_INTERVAL_S = 2
_SHUTDOWN_GRACE_S = 10


def _library_path(server_bin: str, current: str) -> str:
    """Return an LD_LIBRARY_PATH with the server's directory and /app prepended.

    Empty entries are dropped: an empty LD_LIBRARY_PATH component makes the
    dynamic loader search the current working directory.
    """
    entries = [os.path.dirname(server_bin), "/app", *current.split(os.pathsep)]
    return os.pathsep.join(entry for entry in entries if entry)


def _wait_until_healthy(
    proc: subprocess.Popen,
    health_url: str,
    budget_s: float = _STARTUP_BUDGET_S,
    poll_s: float = _POLL_INTERVAL_S,
) -> bool:
    """Poll `health_url` until it answers 200, `proc` exits, or `budget_s` elapses.

    Returns True once the server reports healthy, False otherwise.
    """
    started = time.monotonic()
    while time.monotonic() - started < budget_s:
        if proc.poll() is not None:
            print("[eval] ERROR: llama-server exited early", flush=True)
            return False
        try:
            with urllib.request.urlopen(health_url, timeout=5) as resp:
                if resp.status == 200:
                    elapsed = int(time.monotonic() - started)
                    print(f"[eval] llama-server ready after ~{elapsed}s", flush=True)
                    return True
        except Exception:  # noqa: BLE001 (503 while loading -> retry)
            pass
        # Sleep on every not-ready outcome, not only on exceptions, so the loop
        # can never busy-spin.
        time.sleep(poll_s)
    return False


def _stop_server(proc: subprocess.Popen, grace_s: float = _SHUTDOWN_GRACE_S) -> None:
    """Terminate `proc` and reap it, escalating to SIGKILL after `grace_s` seconds."""
    if proc.poll() is not None:
        return
    proc.terminate()
    try:
        proc.wait(timeout=grace_s)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()


@app.function(
    gpu="A100-80GB",  # Modal GPU type (not the HF Spaces "a100-large" flavor)
    timeout=60 * 60,
    secrets=[modal.Secret.from_name("huggingface")],
    volumes={"/cache/hf": hf_cache, "/outputs": outputs_vol},
)
def evaluate(
    model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf",
    model_quant: str = "Q4_K_M",
    model_file: str = "",
    minimal_prompt: bool = False,
    vision: bool = False,
    mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
    mmproj_file: str = "mmproj-F16.gguf",
    eval_path: str = "",
    title_polish: bool = False,
) -> str:
    """Serve a GGUF with llama-server and run training/eval.py against it.

    Args:
        model_hf_repo: Hugging Face repo to load from (via `-hf`, or as the
            download source when `model_file` is a relative name).
        model_quant: Quant tag used in `-hf REPO:QUANT` when no `model_file` is given.
        model_file: Specific GGUF to serve via `-m`. An absolute path is used
            as-is; any other value is downloaded from `model_hf_repo`.
        minimal_prompt: Export MINIMAL_PROMPT=1 to the server and eval processes.
        vision: Load the mmproj projector and export VISION=1 to eval.py.
        mmproj_repo: Repo holding the projector file (only used with `vision`).
        mmproj_file: Projector filename inside `mmproj_repo`.
        eval_path: If set, exported as EVAL_PATH.
        title_polish: Export TITLE_POLISH=1 to eval.py and tag the label "+titles".

    Returns:
        The captured stdout of training/eval.py.

    Raises:
        RuntimeError: llama-server exited or did not become healthy in time.
    """
    workspace = "/root/repo"
    env = {**os.environ, "HF_HOME": "/cache/hf"}
    if minimal_prompt:
        env["MINIMAL_PROMPT"] = "1"  # eval.py drops the system prompt for both models
    if eval_path:
        env["EVAL_PATH"] = eval_path  # e.g. training/data/eval_unstructured.jsonl
    ls = shutil.which("llama-server") or "/app/llama-server"
    env["LD_LIBRARY_PATH"] = _library_path(ls, env.get("LD_LIBRARY_PATH", ""))

    # Serve a specific file via -m: an absolute path reads straight off the outputs
    # volume (e.g. a staging GGUF the gate already deleted from HF), otherwise the
    # file is fetched from `model_hf_repo`. No file -> -hf REPO:QUANT.
    if model_file:
        if model_file.startswith("/"):
            path = model_file
            label = f"volume:{os.path.basename(model_file)}"
        else:
            from huggingface_hub import hf_hub_download

            path = hf_hub_download(model_hf_repo, model_file, cache_dir="/cache/hf")
            label = f"{model_hf_repo}/{model_file}"
        load_args = ["-m", path]
    else:
        label = f"{model_hf_repo}:{model_quant}"
        load_args = ["-hf", f"{model_hf_repo}:{model_quant}"]

    if vision:
        # The vision arm: load the projector so llama-server accepts image_url
        # content; eval.py (VISION=1) then sends each thread as a screenshot only.
        # Default projector mirrors the Space's Dockerfile: the E4B fine-tune pairs
        # with the BASE E4B's mmproj (unsloth repo), not the 31B one in ours.
        from huggingface_hub import hf_hub_download

        mmproj_path = hf_hub_download(mmproj_repo, mmproj_file, cache_dir="/cache/hf")
        load_args += ["--mmproj", mmproj_path]
        label += "+vision"

    print(f"[eval] launching {ls} ({label}, {'vision' if vision else 'text-only'})", flush=True)
    proc = subprocess.Popen(
        [
            ls, *load_args,
            "--host", _SERVER_HOST, "--port", _SERVER_PORT,
            "-ngl", "999", "-c", "8192", "--jinja",
        ],
        env=env,
    )
    # Everything after the launch sits in try/finally so the server is always
    # stopped and reaped, including when startup fails or the eval run raises.
    try:
        # Model download (~18.7GB first run) + load can take minutes.
        ready = _wait_until_healthy(proc, f"{_SERVER_URL}/health")
        hf_cache.commit()  # persist the downloaded GGUF for next run
        if not ready:
            raise RuntimeError("llama-server never became healthy")

        if title_polish:
            label += "+titles"
        eval_env = {**env, "INFERENCE_BASE_URL": f"{_SERVER_URL}/v1", "MODEL_LABEL": label}
        if vision:
            eval_env["VISION"] = "1"
        if title_polish:
            eval_env["TITLE_POLISH"] = "1"
        result = subprocess.run(
            ["python3", "training/eval.py"],
            cwd=workspace,
            env=eval_env,
            capture_output=True,
            text=True,
        )
    finally:
        _stop_server(proc)

    print(result.stdout, flush=True)
    if result.stderr:
        print("STDERR:", result.stderr[-3000:], flush=True)
    if result.returncode != 0:
        # Surface a failed eval explicitly; the return value is left unchanged
        # so existing callers keep working.
        print(f"[eval] ERROR: training/eval.py exited with code {result.returncode}", flush=True)
    return result.stdout


@app.local_entrypoint()
def main(
    model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf",
    model_quant: str = "Q4_K_M",
    model_file: str = "",
    minimal_prompt: bool = False,
    vision: bool = False,
    mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
    mmproj_file: str = "mmproj-F16.gguf",
    eval_path: str = "",
    title_polish: bool = False,
):
    """Run `evaluate` remotely with the given options and print its output."""
    print(
        evaluate.remote(
            model_hf_repo=model_hf_repo,
            model_quant=model_quant,
            model_file=model_file,
            minimal_prompt=minimal_prompt,
            vision=vision,
            mmproj_repo=mmproj_repo,
            mmproj_file=mmproj_file,
            eval_path=eval_path,
            title_polish=title_polish,
        )
    )