OffGridSchedula

Running

App Files Files Community

OffGridSchedula / training /modal_eval.py

ParetoOptimal

Initial Commit

0366d65 20 days ago

Raw

History Blame Contribute Delete

6.46 kB

	"""Score the fine-tune on Modal: serve the GGUF with llama-server, run eval.py.

	Mirrors the deployment exactly — same official llama.cpp server image and the same
	`-hf REPO:QUANT` load the Space uses — then runs training/eval.py against its
	OpenAI-compatible endpoint. Text-only by default; --vision loads the mmproj
	projector and feeds each thread as a rendered screenshot instead of text (run
	training/render_screenshots.py first). The GGUF download is cached in a Volume.

	modal run training/modal_eval.py # the fine-tune
	modal run training/modal_eval.py --model-hf-repo unsloth/gemma-4-31B-it-GGUF # baseline
	modal run training/modal_eval.py --model-file gemma-cal-e4b-Q4_K_M.gguf --vision
	modal run training/modal_eval.py --eval-path training/data/eval_unstructured.jsonl
	"""
	from __future__ import annotations

	import os
	import shutil
	import subprocess
	import time
	import urllib.request
	from pathlib import Path

	import modal

	# Repository root: two directory levels above this file.
	REPO_ROOT = Path(__file__).resolve().parent.parent

	# Same llama-server the Space runs; add Python + the (light) eval deps.
	image = (
	    modal.Image.from_registry("ghcr.io/ggml-org/llama.cpp:server-cuda", add_python="3.11")
	    .entrypoint([])  # clear the base image's llama-server ENTRYPOINT so Modal can run python
	    .pip_install("requests", "pydantic>=2", "huggingface_hub", "python-dateutil>=2.9")
	    .add_local_dir(
	        str(REPO_ROOT),
	        "/root/repo",
	        # Patterns are dockerignore-style: the `**/` prefix matches at any depth, so
	        # nested bytecode caches and nested GGUF weights are kept out of the upload
	        # too, not only the ones sitting directly in the repo root.
	        ignore=[
	            ".git",
	            "**/__pycache__",
	            "**/*.gguf",
	            "training/outputs",
	            "training/data/.smcalflow_cache",
	        ],
	    )
	)
	# The app every function below attaches to, plus two named volumes that are
	# created on first use: one for Hugging Face downloads, one for outputs.
	app = modal.App("imessage-cal-eval", image=image)
	hf_cache = modal.Volume.from_name("imessage-cal-hf-cache", create_if_missing=True)
	outputs_vol = modal.Volume.from_name("imessage-cal-outputs", create_if_missing=True)


	def _llama_library_path(server_binary: str, current: str) -> str:
	    """Return an LD_LIBRARY_PATH with llama-server's directory and /app first.

	    Empty entries are dropped: an empty element in LD_LIBRARY_PATH makes the
	    dynamic loader search the current working directory.
	    """
	    entries = [os.path.dirname(server_binary), "/app", *current.split(os.pathsep)]
	    return os.pathsep.join(entry for entry in entries if entry)


	def _wait_for_llama_server(proc: subprocess.Popen, health_url: str,
	                           attempts: int = 900, delay: float = 2.0) -> bool:
	    """Poll `health_url` until it answers HTTP 200.

	    Returns True once the server is healthy, False if `proc` exits first or
	    `attempts` polls pass without a 200.
	    """
	    started = time.monotonic()
	    for _ in range(attempts):  # model download (~18.7GB first run) + load can take minutes
	        if proc.poll() is not None:
	            print("[eval] ERROR: llama-server exited early", flush=True)
	            return False
	        try:
	            with urllib.request.urlopen(health_url, timeout=5) as resp:
	                if resp.status == 200:
	                    waited = int(time.monotonic() - started)
	                    print(f"[eval] llama-server ready after ~{waited}s", flush=True)
	                    return True
	        except Exception:  # noqa: BLE001 (503 while loading -> retry)
	            pass
	        # Sleep after every unsuccessful poll (not only after an exception) so an
	        # unexpected non-200 success response cannot turn this into a busy loop.
	        time.sleep(delay)
	    return False


	@app.function(
	    gpu="A100-80GB",  # Modal GPU type (not the HF Spaces "a100-large" flavor)
	    timeout=60 * 60,
	    secrets=[modal.Secret.from_name("huggingface")],
	    volumes={"/cache/hf": hf_cache, "/outputs": outputs_vol},
	)
	def evaluate(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf",
	             model_quant: str = "Q4_K_M",
	             model_file: str = "",
	             minimal_prompt: bool = False,
	             vision: bool = False,
	             mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
	             mmproj_file: str = "mmproj-F16.gguf",
	             eval_path: str = "",
	             title_polish: bool = False) -> str:
	    """Serve a GGUF with llama-server and run training/eval.py against it.

	    Args:
	        model_hf_repo: Hugging Face repo to load the model from.
	        model_quant: Quant tag used for `-hf REPO:QUANT` when no `model_file` is given.
	        model_file: Specific GGUF to serve with `-m`. An absolute path is used
	            as-is; any other value is downloaded from `model_hf_repo`.
	        minimal_prompt: Export MINIMAL_PROMPT=1 for eval.py.
	        vision: Load the mmproj projector and export VISION=1 for eval.py.
	        mmproj_repo: Repo holding the projector file (only used with `vision`).
	        mmproj_file: Projector filename inside `mmproj_repo`.
	        eval_path: If set, exported to eval.py as EVAL_PATH.
	        title_polish: Export TITLE_POLISH=1 for eval.py and tag the label.

	    Returns:
	        The captured stdout of training/eval.py.

	    Raises:
	        RuntimeError: llama-server exited early or never reported healthy.
	    """
	    from huggingface_hub import hf_hub_download

	    workspace = "/root/repo"
	    host, port = "127.0.0.1", "8080"
	    base_url = f"http://{host}:{port}"

	    env = {**os.environ, "HF_HOME": "/cache/hf"}
	    if minimal_prompt:
	        env["MINIMAL_PROMPT"] = "1"  # eval.py drops the system prompt for both models
	    if eval_path:
	        env["EVAL_PATH"] = eval_path  # e.g. training/data/eval_unstructured.jsonl
	    ls = shutil.which("llama-server") or "/app/llama-server"
	    env["LD_LIBRARY_PATH"] = _llama_library_path(ls, env.get("LD_LIBRARY_PATH", ""))

	    # Serve a specific file via -m: an absolute path reads straight off the outputs
	    # volume (e.g. a staging GGUF the gate already deleted from HF), otherwise the
	    # file is fetched from `model_hf_repo`. No file -> -hf REPO:QUANT.
	    if model_file:
	        if model_file.startswith("/"):
	            path = model_file
	            label = f"volume:{os.path.basename(model_file)}"
	        else:
	            path = hf_hub_download(model_hf_repo, model_file, cache_dir="/cache/hf")
	            label = f"{model_hf_repo}/{model_file}"
	        load_args = ["-m", path]
	    else:
	        label = f"{model_hf_repo}:{model_quant}"
	        load_args = ["-hf", f"{model_hf_repo}:{model_quant}"]
	    if vision:
	        # The vision arm: load the projector so llama-server accepts image_url
	        # content; eval.py (VISION=1) then sends each thread as a screenshot only.
	        # Default projector mirrors the Space's Dockerfile: the E4B fine-tune pairs
	        # with the BASE E4B's mmproj (unsloth repo), not the 31B one in ours.
	        mmproj_path = hf_hub_download(mmproj_repo, mmproj_file, cache_dir="/cache/hf")
	        load_args += ["--mmproj", mmproj_path]
	        label += "+vision"

	    print(f"[eval] launching {ls} ({label}, {'vision' if vision else 'text-only'})", flush=True)
	    proc = subprocess.Popen(
	        [ls, *load_args,
	         "--host", host, "--port", port, "-ngl", "999", "-c", "8192", "--jinja"],
	        env=env,
	    )
	    try:
	        ready = _wait_for_llama_server(proc, f"{base_url}/health")
	        hf_cache.commit()  # persist the downloaded GGUF for next run
	        if not ready:
	            raise RuntimeError("llama-server never became healthy")

	        env2 = {**env, "INFERENCE_BASE_URL": f"{base_url}/v1"}
	        if vision:
	            env2["VISION"] = "1"
	        if title_polish:
	            label += "+titles"
	            env2["TITLE_POLISH"] = "1"
	        env2["MODEL_LABEL"] = label

	        r = subprocess.run(["python3", "training/eval.py"], cwd=workspace, env=env2,
	                           capture_output=True, text=True)
	        print(r.stdout, flush=True)
	        if r.stderr:
	            print("STDERR:", r.stderr[-3000:], flush=True)
	        if r.returncode != 0:
	            # Surface a failed eval run instead of letting it pass silently; the
	            # captured stdout is still returned so the caller's contract is unchanged.
	            print(f"[eval] ERROR: eval.py exited with code {r.returncode}", flush=True)
	        return r.stdout
	    finally:
	        # Always stop llama-server, including when startup failed or eval raised.
	        if proc.poll() is None:
	            proc.terminate()
	            try:
	                proc.wait(timeout=10)
	            except subprocess.TimeoutExpired:
	                proc.kill()
	                proc.wait()


	@app.local_entrypoint()
	def main(model_hf_repo: str = "ParetoOptimal/gemma-4-cal-gguf", model_quant: str = "Q4_K_M",
	         model_file: str = "", minimal_prompt: bool = False, vision: bool = False,
	         mmproj_repo: str = "unsloth/gemma-4-E4B-it-GGUF",
	         mmproj_file: str = "mmproj-F16.gguf", eval_path: str = "",
	         title_polish: bool = False):
	    """Local entrypoint: pass every option through to `evaluate` remotely and print the result."""
	    # Each option is forwarded under its own name, unchanged.
	    options = {
	        "model_hf_repo": model_hf_repo,
	        "model_quant": model_quant,
	        "model_file": model_file,
	        "minimal_prompt": minimal_prompt,
	        "vision": vision,
	        "mmproj_repo": mmproj_repo,
	        "mmproj_file": mmproj_file,
	        "eval_path": eval_path,
	        "title_polish": title_polish,
	    }
	    report = evaluate.remote(**options)
	    print(report)