Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 11 days ago

4.95 kB

	"""Scale-to-zero Modal GPU endpoint serving the ScrubData v6 fine-tune via Ollama.

	Mirrors Ollama's HTTP API so the repo's existing planner
	(`scrubdata.model_planner.make_local_ollama_planner`) works UNCHANGED against the
	public Modal URL: it POSTs {URL}/api/chat and reads response["message"]["content"].

	Design:
	- GGUF (ricalanis/scrubdata-qwen3-4b-v6-q8, Q8_0) is downloaded INTO the image at
	BUILD time and the non-thinking Modelfile is written, so cold start does not
	re-download (only `ollama serve` boot + model load to GPU on first request).
	- `ollama create scrubdata-ft -f /Modelfile` runs at container start (fast: GGUF
	is already on local disk).
	- scale-to-zero via scaledown_window=300 -> $0 when idle; GPU cost only during use.

	Deploy:
	uv run modal deploy scripts/modal_serve.py
	# -> public URL of the web_server (Ollama port 11434)
	"""

	import os

	import modal

	HF_REPO = "ricalanis/scrubdata-qwen3-4b-v6-q8"
	GGUF_FILE = "scrubdata-qwen3-4b-v6.Q8_0.gguf"
	GGUF_PATH = f"/models/{GGUF_FILE}"
	MODELFILE_PATH = "/models/Modelfile"
	OLLAMA_PORT = 11434

	# Non-thinking template — identical to notebooks/Modelfile. The Q8 GGUF must use the
	# bare im_start/im_end chat template (no Qwen3 thinking/tools wrapper) or it burns the
	# budget "thinking"; format=json in the API call grammar-constrains away the
	# <tool_call> degeneration loop on long prompts.
	MODELFILE = """FROM /models/scrubdata-qwen3-4b-v6.Q8_0.gguf
	TEMPLATE \"\"\"{{- if .System }}<\|im_start\|>system
	{{ .System }}<\|im_end\|>
	{{ end }}{{- range .Messages }}<\|im_start\|>{{ .Role }}
	{{ .Content }}<\|im_end\|>
	{{ end }}<\|im_start\|>assistant
	\"\"\"
	PARAMETER stop "<\|im_end\|>"
	PARAMETER temperature 0
	PARAMETER repeat_penalty 1
	PARAMETER top_k 20
	PARAMETER top_p 0.95
	"""


	def _bake_model():
	"""Build-time: pull GGUF from HF onto the image disk and write the Modelfile."""
	import os
	from huggingface_hub import hf_hub_download

	os.makedirs("/models", exist_ok=True)
	path = hf_hub_download(repo_id=HF_REPO, filename=GGUF_FILE, local_dir="/models")
	# hf_hub_download may symlink into a cache; ensure the literal path exists.
	if path != GGUF_PATH and not os.path.exists(GGUF_PATH):
	os.symlink(path, GGUF_PATH)
	with open(MODELFILE_PATH, "w") as f:
	f.write(MODELFILE)
	print(f"baked {GGUF_PATH} ({os.path.getsize(path) / 1e9:.2f} GB)", flush=True)


	image = (
	modal.Image.debian_slim(python_version="3.11")
	.apt_install("curl", "zstd")
	# Pin Ollama 0.21.2: the REPAIRED Q8 GGUF was verified on 0.21.2 with format=json;
	# 0.30.7+ silently IGNORES format=json for this model and the planner degenerates
	# into <tool_call> loops (see eval/sc_rerank.py).
	.run_commands("curl -fsSL https://ollama.com/install.sh \| OLLAMA_VERSION=0.21.2 sh")
	.pip_install("huggingface_hub")
	.run_function(_bake_model)
	)

	app = modal.App("scrubdata-serve", image=image)


	@app.function(
	gpu="A100", # 40GB A100: ~2x prefill of A10G on our heavy 9k-token prompt
	# (~95s -> ~50s/clean); model is ~4.7GB Q8 so 40GB is ample.
	# scale-to-zero keeps idle cost $0; ~$0.05/clean active.
	# warm-container floor: default 0 (scale-to-zero). Set SCRUBDATA_MIN_CONTAINERS=1
	# before `modal deploy` to bake a warm floor, OR flip at runtime without redeploying
	# via scripts/modal_warm.py on\|off (update_autoscaler). 1 warm A100 ~= $2.10/hr.
	min_containers=int(os.environ.get("SCRUBDATA_MIN_CONTAINERS", "0")),
	scaledown_window=300, # scale-to-zero ~5 min after last request -> $0 idle
	timeout=600,
	)
	@modal.concurrent(max_inputs=10)
	@modal.web_server(port=OLLAMA_PORT, startup_timeout=300)
	def serve():
	import subprocess
	import time
	import urllib.request

	env = {
	"OLLAMA_HOST": f"0.0.0.0:{OLLAMA_PORT}",
	"OLLAMA_MODELS": "/root/.ollama/models",
	# Disable flash attention: the CUDA FA kernel path produced different decode
	# numerics than the CPU/desktop-GPU reference and let the <tool_call> token
	# leak past the format=json grammar constraint. Off => matches the verified
	# 0.21.2 reference behavior.
	"OLLAMA_FLASH_ATTENTION": "0",
	"OLLAMA_KV_CACHE_TYPE": "f16",
	}
	import os
	full_env = {os.environ, env}

	subprocess.Popen(["ollama", "serve"], env=full_env)

	# wait for the daemon
	for _ in range(60):
	try:
	urllib.request.urlopen(f"http://localhost:{OLLAMA_PORT}/api/tags", timeout=2)
	break
	except Exception:
	time.sleep(0.5)

	# create the named model from the baked GGUF (fast: local file)
	subprocess.run(["ollama", "create", "scrubdata-ft", "-f", MODELFILE_PATH],
	env=full_env, check=True)
	print("scrubdata-ft created; serving", flush=True)
	# web_server keeps the process alive; ollama serve is already in the background.