import json
import modal
# --- Model location ---------------------------------------------------------
# Path at which the model volume is mounted inside containers.
MODEL_DIR = "/models"
# Hugging Face repository and the single GGUF file taken from it.
MODEL_REPO = "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF"
MODEL_FILENAME = "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"

# --- Modal resources --------------------------------------------------------
app = modal.App("split-brain-verifier")

# Named volume that stores the model file; created if it does not exist yet.
model_volume = modal.Volume.from_name(
    "qwen-14b-volume",
    create_if_missing=True,
)

# Minimal image used by the download job: Python 3.11 plus huggingface-hub.
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface-hub")
)
@app.function(
    image=download_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
    # presumably supplies the Hugging Face access token — verify the secret's contents
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model():
    """Download the model file from Hugging Face into the mounted volume.

    Fetches ``MODEL_FILENAME`` from ``MODEL_REPO`` into ``MODEL_DIR``,
    commits the volume, and prints the destination path.
    """
    # Imported inside the function rather than at module level, so the
    # package only has to exist where this function actually runs.
    from huggingface_hub import hf_hub_download

    fetch_args = {
        "repo_id": MODEL_REPO,
        "filename": MODEL_FILENAME,
        "local_dir": MODEL_DIR,
    }
    hf_hub_download(**fetch_args)

    # Commit the volume after the download has written into it.
    model_volume.commit()
    print(f"Downloaded to {MODEL_DIR}/{MODEL_FILENAME}")
# Container image for the verifier class: Debian slim with Python 3.11, a
# source build of llama.cpp, and the llama-cpp-python bindings.
#
# NOTE(review): no CUDA toolkit and no CUDA build flag (e.g. CMAKE_ARGS) appear
# anywhere in this definition. Confirm that the llama-cpp-python package
# installed below can actually offload layers to the GPU; if it is a CPU-only
# build, `n_gpu_layers=-1` in the model loader would have no effect.
llama_image = (
    modal.Image.debian_slim(python_version="3.11")
    # Compiler toolchain, CMake and git for the builds below; libgomp1 is the
    # GNU OpenMP runtime.
    .apt_install("build-essential", "cmake", "git", "libgomp1")
    .run_commands(
        # NOTE(review): clones the default branch with no pinned tag or commit,
        # so two image builds may pick up different llama.cpp revisions.
        "git clone https://github.com/ggerganov/llama.cpp /llama.cpp",
        # Configure (curl support disabled) and compile using all cores.
        "cd /llama.cpp && cmake -B build -DLLAMA_CURL=OFF && cmake --build build --config Release -j$(nproc)",
        # Editable install of whatever Python project the checkout defines.
        "cd /llama.cpp && pip install -e .",
    )
    # NOTE(review): nothing visible in this file uses the /llama.cpp build
    # above; the Python bindings are installed separately here — confirm
    # whether the source build is still needed.
    .pip_install("llama-cpp-python==0.3.4", "fastapi", "uvicorn", "pydantic")
)
# Verdict strings the verifier model is allowed to return.
_VERIFIER_VERDICTS = frozenset({"PASS", "FIX", "REWRITE"})


def _verifier_fallback(reason: str) -> dict:
    """Return the fail-open PASS result used when the model reply is unusable."""
    return {"verdict": "PASS", "reason": reason}


def _parse_verifier_reply(raw: str) -> dict:
    """Convert the model's raw text reply into a verdict dict.

    The reply is decoded as JSON. If that fails, decoding is retried on the
    span from the first ``{`` to the last ``}``, so replies wrapped in
    Markdown fences or surrounded by stray prose are still understood.

    Args:
        raw: Text produced by the model, already stripped.

    Returns:
        The decoded JSON object when it is a dict whose ``"verdict"`` is one
        of PASS / FIX / REWRITE; otherwise a PASS fallback carrying a
        ``"reason"`` that says what went wrong.
    """
    candidates = [raw]
    first_brace, last_brace = raw.find("{"), raw.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        inner = raw[first_brace : last_brace + 1]
        if inner != raw:
            candidates.append(inner)

    for text in candidates:
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            continue
        break
    else:
        return _verifier_fallback("Verifier response could not be parsed.")

    # Valid JSON need not be an object, and "verdict" need not be a string:
    # check both before the membership test so that a list, string or null
    # reply cannot raise AttributeError or TypeError.
    verdict = parsed.get("verdict") if isinstance(parsed, dict) else None
    if not isinstance(verdict, str) or verdict not in _VERIFIER_VERDICTS:
        return _verifier_fallback("Verifier returned an unknown verdict.")
    return parsed


@app.cls(
    image=llama_image,
    gpu="A10G",
    volumes={MODEL_DIR: model_volume},
    scaledown_window=60,
)
@modal.concurrent(max_inputs=2)
class Verifier:
    """Model-backed reviewer for code drafted by a smaller model.

    The model is loaded once when the container starts and reused for every
    ``verify`` call handled by that container.
    """

    @modal.enter()
    def load_model(self):
        """Load the GGUF model from the mounted volume and create the inference lock."""
        import threading

        from llama_cpp import Llama

        # The class accepts up to two concurrent inputs, but they all share
        # the single Llama instance below. Inference is serialised with this
        # lock so concurrent calls cannot interleave on the same model state.
        self._llm_lock = threading.Lock()
        self.llm = Llama(
            model_path=f"{MODEL_DIR}/{MODEL_FILENAME}",
            n_gpu_layers=-1,
            n_ctx=8192,
            n_batch=512,
            verbose=False,
        )

    @modal.method()
    def verify(self, prompt: str, draft_code: str, language: str = "python") -> dict:
        """Ask the model to review ``draft_code`` and return its verdict.

        Args:
            prompt: The original task description the draft was written for.
            draft_code: The code to review.
            language: Language name inserted into the instructions and used
                as the code-fence tag.

        Returns:
            A dict with a ``"verdict"`` of ``"PASS"``, ``"FIX"`` or
            ``"REWRITE"``, plus whatever other keys the model supplied
            (the instructions ask for ``"corrected_code"`` and ``"reason"``
            on FIX / REWRITE). If the reply cannot be decoded or carries an
            unknown verdict, a PASS result with an explanatory ``"reason"``
            is returned instead.
        """
        system = f"""You are a code verifier. A smaller model drafted the following {language} code.
Your job:
1. Check for bugs, logic errors, type errors, off-by-one errors, and security issues.
2. If the code is correct, respond with exactly: {{"verdict": "PASS"}}
3. If fixable, respond with: {{"verdict": "FIX", "corrected_code": "<fixed code here>", "reason": "<one line>"}}
4. If fundamentally wrong, respond with: {{"verdict": "REWRITE", "corrected_code": "<rewritten code>", "reason": "<one line>"}}
Respond ONLY with valid JSON. No markdown, no explanation outside the JSON."""
        user = f"Original prompt:\n{prompt}\n\nDrafted code:\n```{language}\n{draft_code}\n```"

        with self._llm_lock:
            response = self.llm.create_chat_completion(
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=1024,
                temperature=0.1,
            )

        # Treat missing content as an empty reply so it takes the normal
        # "could not be parsed" path instead of crashing on None.strip().
        content = response["choices"][0]["message"].get("content")
        raw = (content or "").strip()
        return _parse_verifier_reply(raw)
# Image for the HTTP front end: Python 3.11 with the web framework, ASGI
# server and validation packages.
api_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("fastapi", "uvicorn", "pydantic")
)
@app.function(
    image=api_image,
    scaledown_window=30,
)
@modal.asgi_app()
def verifier_endpoint():
    """Build and return the FastAPI application exposed by this function.

    Routes:
        POST /verify: passes ``prompt``, ``draft_code`` and ``language`` from
            the JSON body to ``Verifier.verify`` and returns its result.
        GET /health: returns ``{"ok": True}``.
    """
    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel

    # Request body for POST /verify; ``language`` is optional.
    class VerifyRequest(BaseModel):
        prompt: str
        draft_code: str
        language: str = "python"

    api = FastAPI()

    # NOTE(review): every origin, method and header is allowed and no
    # authentication is visible here — confirm this is intended.
    cors_options = {
        "allow_origins": ["*"],
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    api.add_middleware(CORSMiddleware, **cors_options)

    @api.post("/verify")
    async def verify(req: VerifyRequest):
        # Await the remote call through its async interface.
        remote_verify = Verifier().verify.remote
        return await remote_verify.aio(req.prompt, req.draft_code, req.language)

    @api.get("/health")
    async def health():
        # Fixed response; does not call the verifier.
        return {"ok": True}

    return api
@app.function()
def warm_once():
    """Send one trivial request to the verifier to warm it up.

    Intended to be run manually before a live demo to reduce the latency of
    the first real request. The verifier's reply is discarded.
    """
    warmup_args = ("test", "print('hello')", "python")
    Verifier().verify.remote(*warmup_args)
|