| import json |
|
|
| import modal |
|
|
|
|
# Modal application object; the functions and the class below register on it.
app = modal.App("split-brain-verifier")


# Named Modal volume that stores the model weights. It is created on first use
# if it does not exist yet.
model_volume = modal.Volume.from_name("qwen-14b-volume", create_if_missing=True)


# Path at which the volume is mounted inside the containers that use it.
MODEL_DIR = "/models"
# GGUF file that is downloaded from MODEL_REPO and later loaded by the verifier.
MODEL_FILENAME = "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
# Hugging Face repository id passed to hf_hub_download.
MODEL_REPO = "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF"
|
|
# Minimal image for the download job: Python 3.11 plus the Hugging Face Hub
# client, which is the only third-party package download_model imports.
download_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "huggingface-hub"
)
|
|
|
|
@app.function(
    image=download_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model():
    """Download the GGUF weights into the mounted model volume.

    Fetches MODEL_FILENAME from MODEL_REPO into MODEL_DIR, commits the volume
    so the written file is persisted, and prints the resulting path.
    """
    # Imported lazily: the package is installed in download_image, not
    # necessarily in the environment that imports this module.
    from huggingface_hub import hf_hub_download

    hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME, local_dir=MODEL_DIR)

    # Persist the volume contents before reporting success.
    model_volume.commit()
    print(f"Downloaded to {MODEL_DIR}/{MODEL_FILENAME}")
|
|
|
|
# Container image for the GPU verifier. It installs a C/C++ toolchain, clones
# and builds llama.cpp under /llama.cpp, pip-installs that checkout in editable
# mode, and finally installs llama-cpp-python plus the web stack from PyPI.
#
# NOTE(review): debian_slim presumably ships no CUDA toolkit and neither the
# cmake invocation nor the pip install passes a CUDA flag, so the resulting
# llama-cpp-python build may be CPU-only even though Verifier requests an A10G
# and n_gpu_layers=-1 — confirm that GPU offload actually happens.
# NOTE(review): the git clone is unpinned, so rebuilding the image can pick up a
# different llama.cpp revision. Nothing in this file references the /llama.cpp
# build directly — verify it is still needed.
llama_image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("build-essential", "cmake", "git", "libgomp1")
    .run_commands(
        "git clone https://github.com/ggerganov/llama.cpp /llama.cpp",
        "cd /llama.cpp && cmake -B build -DLLAMA_CURL=OFF && cmake --build build --config Release -j$(nproc)",
        "cd /llama.cpp && pip install -e .",
    )
    .pip_install("llama-cpp-python==0.3.4", "fastapi", "uvicorn", "pydantic")
)
|
|
|
|
@app.cls(
    image=llama_image,
    gpu="A10G",
    volumes={MODEL_DIR: model_volume},
    scaledown_window=60,
)
@modal.concurrent(max_inputs=2)
class Verifier:
    """Reviews drafted code with a GGUF model loaded through llama-cpp-python.

    The model is loaded once per container in ``load_model``. ``verify`` asks
    it for a JSON verdict and always returns a dict whose ``"verdict"`` is one
    of ``"PASS"``, ``"FIX"`` or ``"REWRITE"``.

    NOTE(review): the verifier fails open. When the model output cannot be
    interpreted, ``verify`` returns ``"PASS"`` together with a ``"reason"``
    explaining why. Callers that need fail-closed behavior should inspect
    ``"reason"``.
    """

    @modal.enter()
    def load_model(self):
        """Load the model from the mounted volume and set up the inference lock."""
        import threading

        from llama_cpp import Llama

        # max_inputs=2 lets two requests run in the same container, but they
        # share this single Llama instance. Inference is serialized so two
        # threads never drive the same llama context at once.
        self._inference_lock = threading.Lock()

        self.llm = Llama(
            model_path=f"{MODEL_DIR}/{MODEL_FILENAME}",
            n_gpu_layers=-1,
            n_ctx=8192,
            n_batch=512,
            verbose=False,
        )

    def _parse_verdict(self, raw) -> dict:
        """Turn the raw model reply into a verdict dict.

        Args:
            raw: Text returned by the model; may be ``None`` or empty.

        Returns:
            The decoded JSON object when it carries a known verdict, otherwise
            a ``"PASS"`` fallback dict whose ``"reason"`` says what went wrong.
        """
        unparsed = {"verdict": "PASS", "reason": "Verifier response could not be parsed."}
        text = (raw or "").strip()

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Models often wrap the JSON in a markdown fence or add a sentence
            # around it despite the instructions; retry on the outermost
            # {...} span before giving up.
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                return unparsed
            try:
                parsed = json.loads(text[start:end + 1])
            except json.JSONDecodeError:
                return unparsed

        # Valid JSON is not necessarily an object (e.g. a list or a string),
        # and the verdict value may be unhashable, so test against a tuple.
        if not isinstance(parsed, dict) or parsed.get("verdict") not in ("PASS", "FIX", "REWRITE"):
            return {"verdict": "PASS", "reason": "Verifier returned an unknown verdict."}
        return parsed

    @modal.method()
    def verify(self, prompt: str, draft_code: str, language: str = "python") -> dict:
        """Ask the model to review ``draft_code`` and return its verdict.

        Args:
            prompt: The original request the draft was written for.
            draft_code: The drafted code to check.
            language: Language name used in the instructions and code fence.

        Returns:
            A dict with ``"verdict"`` set to ``"PASS"``, ``"FIX"`` or
            ``"REWRITE"``. ``"FIX"`` and ``"REWRITE"`` replies are expected to
            also carry ``"corrected_code"`` and ``"reason"`` as produced by the
            model; fallback ``"PASS"`` results carry a ``"reason"``.
        """
        system = f"""You are a code verifier. A smaller model drafted the following {language} code.
Your job:
1. Check for bugs, logic errors, type errors, off-by-one errors, and security issues.
2. If the code is correct, respond with exactly: {{"verdict": "PASS"}}
3. If fixable, respond with: {{"verdict": "FIX", "corrected_code": "<fixed code here>", "reason": "<one line>"}}
4. If fundamentally wrong, respond with: {{"verdict": "REWRITE", "corrected_code": "<rewritten code>", "reason": "<one line>"}}
Respond ONLY with valid JSON. No markdown, no explanation outside the JSON."""

        user = f"Original prompt:\n{prompt}\n\nDrafted code:\n```{language}\n{draft_code}\n```"

        # Only one request at a time may use the shared model.
        with self._inference_lock:
            response = self.llm.create_chat_completion(
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=1024,
                temperature=0.1,
            )

        # The content field can be missing or None; the parser treats that as
        # an unparseable reply instead of crashing.
        raw = response["choices"][0]["message"].get("content")
        return self._parse_verdict(raw)
|
|
|
|
# Lightweight image for the HTTP front end: only the web stack is installed.
# verifier_endpoint imports fastapi and pydantic and reaches the model through
# a remote call to Verifier, so no model runtime is installed here.
api_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "fastapi", "uvicorn", "pydantic"
)
|
|
|
|
@app.function(
    image=api_image,
    scaledown_window=30,
)
@modal.asgi_app()
def verifier_endpoint():
    """Build and return the FastAPI application served by Modal.

    Routes:
        POST /verify: forwards the request fields to ``Verifier.verify`` and
            returns its result.
        GET /health: returns ``{"ok": True}``.

    NOTE(review): CORS is fully open (any origin, method and header) and no
    authentication is visible here — confirm that is intended outside demos.
    """
    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel

    class VerifyRequest(BaseModel):
        # Request body of POST /verify; fields mirror Verifier.verify's arguments.
        prompt: str
        draft_code: str
        language: str = "python"

    api = FastAPI()
    api.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    @api.get("/health")
    async def health():
        return {"ok": True}

    @api.post("/verify")
    async def verify(req: VerifyRequest):
        # .remote.aio awaits the remote call without blocking the event loop.
        verifier = Verifier()
        verdict = await verifier.verify.remote.aio(
            req.prompt, req.draft_code, req.language
        )
        return verdict

    return api
|
|
|
|
@app.function()
def warm_once():
    """Send one throwaway request to the verifier.

    Run this manually before a live demo when lower first-hit latency is
    wanted; the result of the call is discarded.
    """
    sample_prompt, sample_code, sample_language = "test", "print('hello')", "python"
    Verifier().verify.remote(sample_prompt, sample_code, sample_language)
|
|