"""Score the fine-tune on Modal: serve the GGUF with llama-server, run eval.py.

Mirrors the deployment exactly — same official llama.cpp server image and the same
`-hf REPO:QUANT` load the Space uses — then runs training/eval.py against its
OpenAI-compatible endpoint. Text-only by default; --vision loads the mmproj
projector and feeds each thread as a rendered screenshot instead of text (run
training/render_screenshots.py first). The GGUF download is cached in a Volume.

    modal run training/modal_eval.py                                                # the fine-tune
    modal run training/modal_eval.py --model-hf-repo unsloth/gemma-4-31B-it-GGUF    # baseline
    modal run training/modal_eval.py --model-file gemma-cal-e4b-Q4_K_M.gguf --vision
    modal run training/modal_eval.py --eval-path training/data/eval_unstructured.jsonl
"""
| from __future__ import annotations |
|
|
| import os |
| import shutil |
| import subprocess |
| import time |
| import urllib.request |
| from pathlib import Path |
|
|
| import modal |
|
|
# Repository root: two directory levels above this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Container image, built step by step on top of the llama.cpp CUDA server image
# with a Python 3.11 interpreter added.
image = modal.Image.from_registry(
    "ghcr.io/ggml-org/llama.cpp:server-cuda",
    add_python="3.11",
)
# NOTE(review): presumably resets the base image's own entrypoint so the
# container does not start llama-server by itself — confirm against Modal docs.
image = image.entrypoint([])
# Python packages installed into the image (presumably what training/eval.py
# needs at runtime — verify against that script).
image = image.pip_install(
    "requests",
    "pydantic>=2",
    "huggingface_hub",
    "python-dateutil>=2.9",
)
# Copy the repository to /root/repo, leaving out VCS data, bytecode caches,
# GGUF weights, training outputs and the SMCalFlow cache directory.
image = image.add_local_dir(
    str(REPO_ROOT),
    "/root/repo",
    ignore=[
        ".git",
        "**/__pycache__",
        "**/*.gguf",
        "training/outputs",
        "training/data/.smcalflow_cache",
    ],
)

app = modal.App("imessage-cal-eval", image=image)

# Named Modal volumes, created on first use.
hf_cache = modal.Volume.from_name("imessage-cal-hf-cache", create_if_missing=True)
outputs_vol = modal.Volume.from_name("imessage-cal-outputs", create_if_missing=True)
|
|
|
|
@app.function(
    gpu="A100-80GB",
    timeout=60 * 60,
    secrets=[modal.Secret.from_name("huggingface")],
    volumes={"/cache/hf": hf_cache, "/outputs": outputs_vol},
)
def evaluate(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf",
             model_quant: str = "Q4_K_M",
             model_file: str = "",
             minimal_prompt: bool = False,
             vision: bool = False,
             mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
             mmproj_file: str = "mmproj-F16.gguf",
             eval_path: str = "",
             title_polish: bool = False) -> str:
    """Serve a GGUF with llama-server and run training/eval.py against it.

    Starts llama-server on 127.0.0.1:8080, polls its /health endpoint until it
    answers 200, then runs ``python3 training/eval.py`` in /root/repo with
    ``INFERENCE_BASE_URL`` pointing at the local server. The server process is
    always stopped before this function returns or raises.

    Args:
        model_hf_repo: Hugging Face repo. Used for the ``-hf REPO:QUANT`` load,
            or as the repo ``model_file`` is downloaded from.
        model_quant: Quantization tag appended to the repo for ``-hf``.
        model_file: Optional GGUF to load with ``-m`` instead of ``-hf``. A
            value starting with "/" is used as a path as-is; any other
            non-empty value is downloaded from ``model_hf_repo``.
        minimal_prompt: When true, exports ``MINIMAL_PROMPT=1`` to the child
            processes (its meaning is defined by eval.py).
        vision: When true, downloads the mmproj file, passes it via
            ``--mmproj`` and exports ``VISION=1`` to eval.py.
        mmproj_repo: Hugging Face repo holding the mmproj file.
        mmproj_file: File name of the mmproj projector inside ``mmproj_repo``.
        eval_path: When non-empty, exported as ``EVAL_PATH`` to the child
            processes.
        title_polish: When true, exports ``TITLE_POLISH=1`` to eval.py and
            appends "+titles" to the model label.

    Returns:
        The captured stdout of eval.py.

    Raises:
        RuntimeError: If llama-server exits early or never reports healthy.
    """
    # Imported here rather than at module level: the package is installed in
    # the container image, and this body only ever runs there.
    from huggingface_hub import hf_hub_download

    workspace = "/root/repo"
    env = {**os.environ, "HF_HOME": "/cache/hf"}
    if minimal_prompt:
        env["MINIMAL_PROMPT"] = "1"
    if eval_path:
        env["EVAL_PATH"] = eval_path

    server_bin = shutil.which("llama-server") or "/app/llama-server"
    # Put the binary's own directory and /app on the loader path
    # (presumably where the image keeps llama.cpp's shared libraries).
    env["LD_LIBRARY_PATH"] = (
        f"{os.path.dirname(server_bin)}:/app:" + env.get("LD_LIBRARY_PATH", "")
    )

    # Pick how the model is loaded: explicit file (-m) or repo:quant (-hf).
    if model_file:
        if model_file.startswith("/"):
            path = model_file
            label = f"volume:{os.path.basename(model_file)}"
        else:
            path = hf_hub_download(model_hf_repo, model_file, cache_dir="/cache/hf")
            label = f"{model_hf_repo}/{model_file}"
        load_args = ["-m", path]
    else:
        label = f"{model_hf_repo}:{model_quant}"
        load_args = ["-hf", f"{model_hf_repo}:{model_quant}"]

    if vision:
        mmproj_path = hf_hub_download(mmproj_repo, mmproj_file, cache_dir="/cache/hf")
        load_args += ["--mmproj", mmproj_path]
        label += "+vision"

    print(f"[eval] launching {server_bin} ({label}, {'vision' if vision else 'text-only'})",
          flush=True)
    proc = subprocess.Popen(
        [server_bin, *load_args,
         "--host", "127.0.0.1", "--port", "8080", "-ngl", "999", "-c", "8192", "--jinja"],
        env=env,
    )
    try:
        # Poll /health: up to 900 attempts with a 2 s pause between them.
        ready = False
        started = time.monotonic()
        for _ in range(900):
            if proc.poll() is not None:
                print("[eval] ERROR: llama-server exited early", flush=True)
                break
            try:
                with urllib.request.urlopen("http://127.0.0.1:8080/health", timeout=5) as resp:
                    healthy = resp.status == 200
            except Exception:
                # Deliberately broad: connection refused, HTTP errors while the
                # model is loading and timeouts all just mean "not ready yet".
                healthy = False
            if healthy:
                ready = True
                elapsed = int(time.monotonic() - started)
                print(f"[eval] llama-server ready after ~{elapsed}s", flush=True)
                break
            # Sleep on every unsuccessful attempt so a non-200 reply cannot
            # turn this loop into a busy spin.
            time.sleep(2)

        # Persist whatever was written under /cache/hf, even if startup failed.
        hf_cache.commit()
        if not ready:
            raise RuntimeError("llama-server never became healthy")

        if title_polish:
            label += "+titles"
        eval_env = {**env, "INFERENCE_BASE_URL": "http://127.0.0.1:8080/v1",
                    "MODEL_LABEL": label}
        if vision:
            eval_env["VISION"] = "1"
        if title_polish:
            eval_env["TITLE_POLISH"] = "1"

        result = subprocess.run(["python3", "training/eval.py"], cwd=workspace,
                                env=eval_env, capture_output=True, text=True)
        print(result.stdout, flush=True)
        if result.stderr:
            # Only the tail is shown to keep the log readable.
            print("STDERR:", result.stderr[-3000:], flush=True)
        if result.returncode != 0:
            print(f"[eval] WARNING: eval.py exited with code {result.returncode}",
                  flush=True)
        return result.stdout
    finally:
        # Always stop the server, including on errors, so it never outlives
        # this call while still holding the port and the GPU.
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
|
|
|
|
@app.local_entrypoint()
def main(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf", model_quant: str = "Q4_K_M",
         model_file: str = "", minimal_prompt: bool = False, vision: bool = False,
         mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
         mmproj_file: str = "mmproj-F16.gguf", eval_path: str = "",
         title_polish: bool = False):
    """Local entrypoint: forward every option to ``evaluate`` and print its result.

    All parameters are passed through unchanged, by keyword, to
    ``evaluate.remote``; the string it returns is printed.
    """
    options = {
        "model_hf_repo": model_hf_repo,
        "model_quant": model_quant,
        "model_file": model_file,
        "minimal_prompt": minimal_prompt,
        "vision": vision,
        "mmproj_repo": mmproj_repo,
        "mmproj_file": mmproj_file,
        "eval_path": eval_path,
        "title_polish": title_polish,
    }
    report = evaluate.remote(**options)
    print(report)
|
|