# blessingmwiti's picture
# Reduce Modal verifier credit usage
# cd451e7
import json
import modal
# Modal application object; every function and class below registers on it.
app = modal.App("split-brain-verifier")
# Named Modal volume used to hold the model weights; created if it does not
# exist yet (create_if_missing=True).
model_volume = modal.Volume.from_name("qwen-14b-volume", create_if_missing=True)
# Path at which model_volume is mounted in the containers that request it.
MODEL_DIR = "/models"
# GGUF file that download_model() fetches and Verifier loads from MODEL_DIR.
MODEL_FILENAME = "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
# Hugging Face repository that MODEL_FILENAME is downloaded from.
MODEL_REPO = "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF"
# Minimal image for the one-off download job: Python 3.11 plus the
# Hugging Face Hub client, nothing else.
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface-hub")
)
@app.function(
    image=download_image,
    volumes={MODEL_DIR: model_volume},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=3600,
)
def download_model():
    """Download the GGUF weights into the shared volume.

    Fetches MODEL_FILENAME from MODEL_REPO into MODEL_DIR (where
    model_volume is mounted), commits the volume, and prints the
    destination path.
    """
    # Imported here because huggingface_hub is installed in download_image,
    # not necessarily in the environment that imports this module.
    from huggingface_hub import hf_hub_download

    download_args = {
        "repo_id": MODEL_REPO,
        "filename": MODEL_FILENAME,
        "local_dir": MODEL_DIR,
    }
    hf_hub_download(**download_args)

    # Commit so the downloaded file is persisted to the volume.
    model_volume.commit()
    print(f"Downloaded to {MODEL_DIR}/{MODEL_FILENAME}")
# Image used by the Verifier class: a llama.cpp checkout built with CMake,
# plus the llama-cpp-python bindings and the web/validation packages.
#
# NOTE(review): no CUDA-related flags are passed to either the cmake build or
# the llama-cpp-python install below — confirm that the installed llama_cpp
# actually offloads layers to the GPU that Verifier requests.
# NOTE(review): nothing visible in this file invokes the /llama.cpp build;
# Verifier only imports the llama_cpp Python package. Confirm whether the
# clone/build steps are still required.
llama_image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("build-essential", "cmake", "git", "libgomp1")
    .run_commands(
        # Shallow clone: the build only needs the working tree, so skipping
        # the full history keeps the image build smaller and faster.
        "git clone --depth 1 https://github.com/ggerganov/llama.cpp /llama.cpp",
        "cd /llama.cpp && cmake -B build -DLLAMA_CURL=OFF && cmake --build build --config Release -j$(nproc)",
        "cd /llama.cpp && pip install -e .",
    )
    .pip_install("llama-cpp-python==0.3.4", "fastapi", "uvicorn", "pydantic")
)
# Verdicts the verifier model is allowed to return. A tuple (not a set) so the
# membership test below cannot raise on an unhashable "verdict" value.
_VALID_VERDICTS = ("PASS", "FIX", "REWRITE")


def _decode_verdict(raw) -> dict:
    """Turn the model's raw reply into a verdict dict, failing open to PASS.

    Args:
        raw: Text content of the model reply; may be None or empty.

    Returns:
        The parsed JSON object when it is a dict whose "verdict" is one of
        PASS/FIX/REWRITE. Otherwise a {"verdict": "PASS", "reason": ...}
        fallback, so an unusable reply never blocks the drafted code.
    """
    text = (raw or "").strip()

    # Try the reply as-is first; if that fails, try the outermost {...} span so
    # replies wrapped in markdown fences or surrounding prose are still usable.
    candidates = [text]
    first, last = text.find("{"), text.rfind("}")
    if 0 <= first < last:
        candidates.append(text[first : last + 1])

    parsed = None
    for candidate in candidates:
        try:
            loaded = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (a bare string, list, number...)
        # has no "verdict" key to read, so it is treated as unparseable.
        if isinstance(loaded, dict):
            parsed = loaded
            break

    if parsed is None:
        return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}
    if parsed.get("verdict") not in _VALID_VERDICTS:
        return {"verdict": "PASS", "reason": "Verifier returned an unknown verdict."}
    return parsed


@app.cls(
    image=llama_image,
    gpu="A10G",
    volumes={MODEL_DIR: model_volume},
    scaledown_window=60,
)
@modal.concurrent(max_inputs=2)
class Verifier:
    """GPU-backed verifier that reviews drafted code with the GGUF model."""

    @modal.enter()
    def load_model(self):
        """Load the GGUF model from the mounted volume when the container starts."""
        import threading

        from llama_cpp import Llama

        self.llm = Llama(
            model_path=f"{MODEL_DIR}/{MODEL_FILENAME}",
            n_gpu_layers=-1,  # -1 requests offloading every layer to the GPU
            n_ctx=8192,
            n_batch=512,
            verbose=False,
        )
        # The class accepts two concurrent inputs (see @modal.concurrent above)
        # but they share this single Llama instance; the lock makes sure only
        # one inference call touches it at a time.
        self._llm_lock = threading.Lock()

    @modal.method()
    def verify(self, prompt: str, draft_code: str, language: str = "python") -> dict:
        """Ask the model to review drafted code and return its verdict.

        Args:
            prompt: The original request the draft was written for.
            draft_code: The drafted code to review.
            language: Language name used in the prompt and the code fence.

        Returns:
            A dict with a "verdict" of "PASS", "FIX" or "REWRITE". When the
            model reply is empty, not a JSON object, or carries an unknown
            verdict, a PASS dict with an explanatory "reason" is returned.
        """
        system = f"""You are a code verifier. A smaller model drafted the following {language} code.
Your job:
1. Check for bugs, logic errors, type errors, off-by-one errors, and security issues.
2. If the code is correct, respond with exactly: {{"verdict": "PASS"}}
3. If fixable, respond with: {{"verdict": "FIX", "corrected_code": "<fixed code here>", "reason": "<one line>"}}
4. If fundamentally wrong, respond with: {{"verdict": "REWRITE", "corrected_code": "<rewritten code>", "reason": "<one line>"}}
Respond ONLY with valid JSON. No markdown, no explanation outside the JSON."""
        user = f"Original prompt:\n{prompt}\n\nDrafted code:\n```{language}\n{draft_code}\n```"

        with self._llm_lock:
            response = self.llm.create_chat_completion(
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=1024,
                temperature=0.1,
            )

        # The content may be missing/None; _decode_verdict treats that as an
        # unparseable reply instead of raising.
        raw = response["choices"][0]["message"]["content"]
        return _decode_verdict(raw)
# Lightweight image for the HTTP front end: only the web stack, no model
# runtime, since inference is delegated to the Verifier class.
api_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("fastapi", "uvicorn", "pydantic")
)
@app.function(
    scaledown_window=30,
    image=api_image,
)
@modal.asgi_app()
def verifier_endpoint():
    """Build and return the FastAPI app that exposes the verifier over HTTP.

    Routes:
        POST /verify  - forwards the request fields to Verifier.verify and
                        returns its result.
        GET  /health  - returns {"ok": True}.
    """
    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel

    # Request body accepted by POST /verify.
    class VerifyRequest(BaseModel):
        prompt: str
        draft_code: str
        language: str = "python"

    api = FastAPI()

    # CORS is configured with wildcards for origins, methods and headers.
    cors_options = {
        "allow_origins": ["*"],
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    api.add_middleware(CORSMiddleware, **cors_options)

    @api.post("/verify")
    async def verify(req: VerifyRequest):
        # Delegate to the Verifier class and await its result.
        remote_verify = Verifier().verify
        verdict = await remote_verify.remote.aio(
            req.prompt, req.draft_code, req.language
        )
        return verdict

    @api.get("/health")
    async def health():
        # Answers locally; does not call the Verifier class.
        return {"ok": True}

    return api
@app.function()
def warm_once():
    """Send one throwaway request to the verifier.

    Run this manually before a live demo if you want lower first-hit
    latency; the result of the call is discarded.
    """
    sample_request = ("test", "print('hello')", "python")
    verifier = Verifier()
    verifier.verify.remote(*sample_request)