"""Serve the ScrubData v6 fine-tune through Ollama on a scale-to-zero Modal GPU endpoint.

The endpoint mirrors Ollama's HTTP API, so the repo's existing planner
(`scrubdata.model_planner.make_local_ollama_planner`) works UNCHANGED against the
public Modal URL: it POSTs {URL}/api/chat and reads response["message"]["content"].

Design:
  - The GGUF (ricalanis/scrubdata-qwen3-4b-v6-q8, Q8_0) is downloaded INTO the image
    at BUILD time and the non-thinking Modelfile is written next to it, so a cold
    start does not re-download anything (it only pays for `ollama serve` boot plus
    the model load to GPU on the first request).
  - `ollama create scrubdata-ft -f /models/Modelfile` runs at container start
    (fast: the GGUF is already on local disk).
  - Scale-to-zero via scaledown_window=300 -> $0 when idle; GPU cost only during use.

Deploy:
    uv run modal deploy scripts/modal_serve.py
    # -> public URL of the web_server (Ollama port 11434)
"""

import os

import modal

# Hugging Face source of the quantized weights.
HF_REPO = "ricalanis/scrubdata-qwen3-4b-v6-q8"
GGUF_FILE = "scrubdata-qwen3-4b-v6.Q8_0.gguf"

# Locations inside the image where the weights and the Modelfile are baked.
GGUF_PATH = "/models/" + GGUF_FILE
MODELFILE_PATH = "/models/Modelfile"

# Port the Ollama daemon listens on and that Modal exposes publicly.
OLLAMA_PORT = 11434

# Non-thinking template -- identical to notebooks/Modelfile. The Q8 GGUF must use the
# bare im_start/im_end chat template (no Qwen3 thinking/tools wrapper) or it burns the
# budget "thinking"; format=json in the API call grammar-constrains away the
# degeneration loop on long prompts.
# Ollama Modelfile baked into the image. The FROM line is derived from GGUF_PATH so the
# Modelfile cannot drift from the location the weights are actually downloaded to.
# NOTE(review): the line breaks inside TEMPLATE follow the standard ChatML layout
# (role on the <|im_start|> line, content on the next) -- confirm against
# notebooks/Modelfile.
MODELFILE = f"FROM {GGUF_PATH}\n" + """TEMPLATE \"\"\"{{- if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{- range .Messages }}<|im_start|>{{ .Role }}
{{ .Content }}<|im_end|>
{{ end }}<|im_start|>assistant
\"\"\"
PARAMETER stop "<|im_end|>"
PARAMETER temperature 0
PARAMETER repeat_penalty 1
PARAMETER top_k 20
PARAMETER top_p 0.95
"""


def _bake_model():
    """Build-time: pull the GGUF from HF onto the image disk and write the Modelfile.

    Raises:
        OSError: if the weights are not reachable at GGUF_PATH after the download
            (surfaced by the final size check) or the Modelfile cannot be written.
    """
    import os

    from huggingface_hub import hf_hub_download

    os.makedirs("/models", exist_ok=True)
    path = hf_hub_download(repo_id=HF_REPO, filename=GGUF_FILE, local_dir="/models")

    # hf_hub_download may symlink into a cache; ensure the literal path exists.
    if path != GGUF_PATH and not os.path.exists(GGUF_PATH):
        # os.path.exists() is False for a dangling symlink, which would make
        # os.symlink() fail with FileExistsError -- clear the stale link first.
        if os.path.islink(GGUF_PATH):
            os.remove(GGUF_PATH)
        os.symlink(path, GGUF_PATH)

    with open(MODELFILE_PATH, "w", encoding="utf-8") as f:
        f.write(MODELFILE)

    # Size is read through GGUF_PATH (the path the Modelfile references), so a broken
    # bake fails the image build here instead of at container start.
    print(f"baked {GGUF_PATH} ({os.path.getsize(GGUF_PATH) / 1e9:.2f} GB)", flush=True)


image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("curl", "zstd")
    # Pin Ollama 0.21.2: the REPAIRED Q8 GGUF was verified on 0.21.2 with format=json;
    # 0.30.7+ silently IGNORES format=json for this model and the planner degenerates
    # into loops (see eval/sc_rerank.py).
    .run_commands("curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.21.2 sh")
    .pip_install("huggingface_hub")
    .run_function(_bake_model)
)

app = modal.App("scrubdata-serve", image=image)


def _wait_for_ollama(proc, url, attempts=60, delay=0.5):
    """Poll *url* until the Ollama daemon answers, failing loudly if it never does.

    Args:
        proc: the `subprocess.Popen` handle of `ollama serve`.
        url: HTTP endpoint to probe.
        attempts: maximum number of probes.
        delay: seconds to sleep after a failed probe.

    Raises:
        RuntimeError: if the daemon process exits, or no probe succeeds.
    """
    import http.client
    import time
    import urllib.request

    for _ in range(attempts):
        exit_code = proc.poll()
        if exit_code is not None:
            # The daemon died; further probing can never succeed.
            raise RuntimeError(f"`ollama serve` exited early with code {exit_code}")
        try:
            # Context manager closes the HTTP response instead of leaking it.
            with urllib.request.urlopen(url, timeout=2):
                return
        except (OSError, http.client.HTTPException):
            # URLError/HTTPError/timeouts are OSError subclasses: not ready yet.
            time.sleep(delay)
    raise RuntimeError(f"Ollama daemon did not become ready at {url} after {attempts} probes")


@app.function(
    gpu="A100",  # 40GB A100: ~2x prefill of A10G on our heavy 9k-token prompt
    # (~95s -> ~50s/clean); model is ~4.7GB Q8 so 40GB is ample.
    # scale-to-zero keeps idle cost $0; ~$0.05/clean active.
    # warm-container floor: default 0 (scale-to-zero). Set SCRUBDATA_MIN_CONTAINERS=1
    # before `modal deploy` to bake a warm floor, OR flip at runtime without redeploying
    # via scripts/modal_warm.py on|off (update_autoscaler). 1 warm A100 ~= $2.10/hr.
    min_containers=int(os.environ.get("SCRUBDATA_MIN_CONTAINERS", "0")),
    scaledown_window=300,  # scale-to-zero ~5 min after last request -> $0 idle
    timeout=600,
)
@modal.concurrent(max_inputs=10)
@modal.web_server(port=OLLAMA_PORT, startup_timeout=300)
def serve():
    """Start `ollama serve` in the background and register the `scrubdata-ft` model.

    Raises:
        RuntimeError: if the Ollama daemon exits or never answers its readiness probe.
        subprocess.CalledProcessError: if `ollama create` fails.
    """
    import os
    import subprocess

    env = {
        "OLLAMA_HOST": f"0.0.0.0:{OLLAMA_PORT}",
        "OLLAMA_MODELS": "/root/.ollama/models",
        # Disable flash attention: the CUDA FA kernel path produced different decode
        # numerics than the CPU/desktop-GPU reference and let an unwanted token leak
        # past the format=json grammar constraint. Off => matches the verified
        # 0.21.2 reference behavior.
        "OLLAMA_FLASH_ATTENTION": "0",
        "OLLAMA_KV_CACHE_TYPE": "f16",
    }
    full_env = {**os.environ, **env}

    daemon = subprocess.Popen(["ollama", "serve"], env=full_env)

    # Wait for the daemon; raise a clear error instead of falling through to a
    # confusing `ollama create` failure when it never comes up.
    _wait_for_ollama(daemon, f"http://localhost:{OLLAMA_PORT}/api/tags")

    # Create the named model from the baked GGUF (fast: local file).
    subprocess.run(
        ["ollama", "create", "scrubdata-ft", "-f", MODELFILE_PATH],
        env=full_env,
        check=True,
    )
    print("scrubdata-ft created; serving", flush=True)
    # web_server keeps the process alive; ollama serve is already in the background.