OffGridSchedula / training /modal_eval.py
ParetoOptimal's picture
Initial Commit
0366d65
Raw
History Blame Contribute Delete
6.46 kB
"""Score the fine-tune on Modal: serve the GGUF with llama-server, run eval.py.
Mirrors the deployment exactly — same official llama.cpp server image and the same
`-hf REPO:QUANT` load the Space uses — then runs training/eval.py against its
OpenAI-compatible endpoint. Text-only by default; --vision loads the mmproj
projector and feeds each thread as a rendered screenshot instead of text (run
training/render_screenshots.py first). The GGUF download is cached in a Volume.
modal run training/modal_eval.py # the fine-tune
modal run training/modal_eval.py --model-hf-repo unsloth/gemma-4-31B-it-GGUF # baseline
modal run training/modal_eval.py --model-file gemma-cal-e4b-Q4_K_M.gguf --vision
modal run training/modal_eval.py --eval-path training/data/eval_unstructured.jsonl
"""
from __future__ import annotations
import os
import shutil
import subprocess
import time
import urllib.request
from pathlib import Path
import modal
# Two levels up from this file, i.e. the directory that contains training/.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Paths excluded when the repository is copied into the image: VCS metadata,
# bytecode caches, local model weights, and training artefacts/caches.
_REPO_COPY_IGNORE = [
    ".git",
    "**/__pycache__",
    "**/*.gguf",
    "training/outputs",
    "training/data/.smcalflow_cache",
]

# Start from the official llama.cpp CUDA server image (the one the Space runs)
# and layer Python 3.11 plus the small set of packages eval needs on top.
image = (
    modal.Image.from_registry(
        "ghcr.io/ggml-org/llama.cpp:server-cuda",
        add_python="3.11",
    )
    # The base image's ENTRYPOINT is llama-server; reset it so Modal can run python.
    .entrypoint([])
    .pip_install("requests", "pydantic>=2", "huggingface_hub", "python-dateutil>=2.9")
    # Ship the repository itself so training/eval.py is available at /root/repo.
    .add_local_dir(str(REPO_ROOT), "/root/repo", ignore=_REPO_COPY_IGNORE)
)

app = modal.App("imessage-cal-eval", image=image)

# Persistent volumes: one caching Hugging Face downloads between runs, one
# holding training outputs (mounted by `evaluate`).
hf_cache = modal.Volume.from_name("imessage-cal-hf-cache", create_if_missing=True)
outputs_vol = modal.Volume.from_name("imessage-cal-outputs", create_if_missing=True)
@app.function(
    gpu="A100-80GB",  # Modal GPU type (not the HF Spaces "a100-large" flavor)
    timeout=60 * 60,
    secrets=[modal.Secret.from_name("huggingface")],
    volumes={"/cache/hf": hf_cache, "/outputs": outputs_vol},
)
def evaluate(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf",
             model_quant: str = "Q4_K_M",
             model_file: str = "",
             minimal_prompt: bool = False,
             vision: bool = False,
             mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
             mmproj_file: str = "mmproj-F16.gguf",
             eval_path: str = "",
             title_polish: bool = False) -> str:
    """Serve a GGUF with llama-server and run training/eval.py against it.

    Args:
        model_hf_repo: Hugging Face repo used for the ``-hf REPO:QUANT`` load,
            or the repo ``model_file`` is downloaded from.
        model_quant: Quant tag for the ``-hf`` load; unused when ``model_file``
            is given.
        model_file: Specific GGUF to serve via ``-m``. An absolute path is
            passed to llama-server as-is (e.g. a file on a mounted volume);
            anything else is downloaded from ``model_hf_repo``. Empty means
            use ``-hf``.
        minimal_prompt: Export ``MINIMAL_PROMPT=1`` to eval.py.
        vision: Download the mmproj projector, pass it with ``--mmproj`` and
            export ``VISION=1`` to eval.py.
        mmproj_repo: Repo the projector file is downloaded from.
        mmproj_file: Projector file name inside ``mmproj_repo``.
        eval_path: Exported to eval.py as ``EVAL_PATH`` when non-empty.
        title_polish: Export ``TITLE_POLISH=1`` and tag the label ``+titles``.

    Returns:
        The captured stdout of eval.py.

    Raises:
        RuntimeError: If llama-server exits early or never reports healthy.
    """
    workspace = "/root/repo"
    env = {**os.environ, "HF_HOME": "/cache/hf"}
    if minimal_prompt:
        env["MINIMAL_PROMPT"] = "1"  # eval.py drops the system prompt for both models
    if eval_path:
        env["EVAL_PATH"] = eval_path  # e.g. training/data/eval_unstructured.jsonl

    ls = shutil.which("llama-server") or "/app/llama-server"
    # Join only non-empty entries: an empty LD_LIBRARY_PATH element (such as a
    # trailing ':') makes the dynamic loader search the current directory.
    lib_dirs = [os.path.dirname(ls), "/app", env.get("LD_LIBRARY_PATH", "")]
    env["LD_LIBRARY_PATH"] = ":".join(d for d in lib_dirs if d)

    # Serve a specific file via -m: an absolute path reads straight off the outputs
    # volume (e.g. a staging GGUF the gate already deleted from HF), otherwise the
    # file is fetched from `model_hf_repo`. No file -> -hf REPO:QUANT.
    if model_file:
        if os.path.isabs(model_file):
            path = model_file
            label = f"volume:{os.path.basename(model_file)}"
        else:
            from huggingface_hub import hf_hub_download
            path = hf_hub_download(model_hf_repo, model_file, cache_dir="/cache/hf")
            label = f"{model_hf_repo}/{model_file}"
        load_args = ["-m", path]
    else:
        label = f"{model_hf_repo}:{model_quant}"
        load_args = ["-hf", f"{model_hf_repo}:{model_quant}"]

    if vision:
        # The vision arm: load the projector so llama-server accepts image_url
        # content; eval.py (VISION=1) then sends each thread as a screenshot only.
        # Default projector mirrors the Space's Dockerfile: the E4B fine-tune pairs
        # with the BASE E4B's mmproj (unsloth repo), not the 31B one in ours.
        from huggingface_hub import hf_hub_download
        mmproj_path = hf_hub_download(mmproj_repo, mmproj_file, cache_dir="/cache/hf")
        load_args += ["--mmproj", mmproj_path]
        label += "+vision"

    print(f"[eval] launching {ls} ({label}, {'vision' if vision else 'text-only'})", flush=True)
    proc = subprocess.Popen(
        [ls, *load_args,
         "--host", "127.0.0.1", "--port", "8080", "-ngl", "999", "-c", "8192", "--jinja"],
        env=env,
    )
    try:
        ready = False
        started = time.monotonic()
        for _ in range(900):  # model download (~18.7GB first run) + load can take minutes
            if proc.poll() is not None:
                print("[eval] ERROR: llama-server exited early", flush=True)
                break
            try:
                with urllib.request.urlopen("http://127.0.0.1:8080/health", timeout=5) as resp:
                    if resp.status == 200:
                        ready = True
                        elapsed = int(time.monotonic() - started)
                        print(f"[eval] llama-server ready after ~{elapsed}s", flush=True)
                        break
            except Exception:  # noqa: BLE001 (503 while loading -> retry)
                pass
            # Back off on every unsuccessful probe, not only on exceptions, so
            # a non-200 reply that does not raise cannot turn into a busy loop.
            time.sleep(2)

        hf_cache.commit()  # persist the downloaded GGUF for next run
        if not ready:
            raise RuntimeError("llama-server never became healthy")

        if title_polish:
            label += "+titles"
        env2 = {**env, "INFERENCE_BASE_URL": "http://127.0.0.1:8080/v1", "MODEL_LABEL": label}
        if vision:
            env2["VISION"] = "1"
        if title_polish:
            env2["TITLE_POLISH"] = "1"

        r = subprocess.run(["python3", "training/eval.py"], cwd=workspace, env=env2,
                           capture_output=True, text=True)
        print(r.stdout, flush=True)
        if r.stderr:
            print("STDERR:", r.stderr[-3000:], flush=True)
        if r.returncode != 0:
            # Surface the failure in the logs; the captured stdout is still
            # returned so the caller contract is unchanged.
            print(f"[eval] WARNING: eval.py exited with code {r.returncode}", flush=True)
        return r.stdout
    finally:
        # Always stop and reap the server, including when startup failed or
        # the eval step raised, so no llama-server process is left behind.
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
@app.local_entrypoint()
def main(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf", model_quant: str = "Q4_K_M",
         model_file: str = "", minimal_prompt: bool = False, vision: bool = False,
         mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
         mmproj_file: str = "mmproj-F16.gguf", eval_path: str = "",
         title_polish: bool = False):
    """Local entry point: forward every option to the remote `evaluate` and print its result."""
    # Each option maps one-to-one onto an `evaluate` keyword argument.
    forwarded = {
        "model_hf_repo": model_hf_repo,
        "model_quant": model_quant,
        "model_file": model_file,
        "minimal_prompt": minimal_prompt,
        "vision": vision,
        "mmproj_repo": mmproj_repo,
        "mmproj_file": mmproj_file,
        "eval_path": eval_path,
        "title_polish": title_polish,
    }
    report = evaluate.remote(**forwarded)
    print(report)