"""Modal app that serves a Qwen2.5-Coder GGUF model as a code verifier.

The module defines:

* ``download_model``    - fetches the GGUF file into a Modal volume.
* ``Verifier``          - GPU class that loads the model once per container and
                          reviews drafted code, returning a JSON verdict.
* ``verifier_endpoint`` - FastAPI app exposing ``POST /verify`` and
                          ``GET /health``.
* ``warm_once``         - helper that sends one request to warm a container.
"""

import json

import modal

app = modal.App("split-brain-verifier")

# Volume that persists the downloaded model between container starts.
model_volume = modal.Volume.from_name("qwen-14b-volume", create_if_missing=True)

MODEL_DIR = "/models"
MODEL_FILENAME = "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
MODEL_REPO = "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF"

# Verdicts the verifier model is allowed to return. A tuple (not a set) so the
# membership test also works when the model emits an unhashable "verdict".
_VALID_VERDICTS = ("PASS", "FIX", "REWRITE")

download_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "huggingface-hub"
)


@app.function(
    image=download_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model():
    """Download the GGUF model file into the shared volume and commit it."""
    from huggingface_hub import hf_hub_download

    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILENAME,
        local_dir=MODEL_DIR,
    )
    # Persist the written file so other containers mounting the volume see it.
    model_volume.commit()
    print(f"Downloaded to {MODEL_DIR}/{MODEL_FILENAME}")


# NOTE(review): the llama.cpp checkout below is unpinned (whatever the default
# branch is at image-build time), and llama-cpp-python is installed from PyPI
# without any GPU build flags. Confirm that ``n_gpu_layers=-1`` really offloads
# to the A10G with this image.
llama_image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("build-essential", "cmake", "git", "libgomp1")
    .run_commands(
        "git clone https://github.com/ggerganov/llama.cpp /llama.cpp",
        "cd /llama.cpp && cmake -B build -DLLAMA_CURL=OFF && cmake --build build --config Release -j$(nproc)",
        "cd /llama.cpp && pip install -e .",
    )
    .pip_install("llama-cpp-python==0.3.4", "fastapi", "uvicorn", "pydantic")
)


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present.

    Models often wrap JSON in a fenced block despite being told not to. Only
    the outermost fence is removed; fences inside the payload are untouched.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (which may carry a language tag).
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def _parse_verifier_response(raw: str) -> dict:
    """Turn the model's raw reply into a verdict dict.

    Returns the parsed JSON object when it is a dict whose ``"verdict"`` is one
    of ``PASS``, ``FIX`` or ``REWRITE``. Anything else (invalid JSON, a JSON
    value that is not an object, an unknown verdict) falls back to a ``PASS``
    verdict with an explanatory ``"reason"`` - the verifier is best-effort and
    never blocks the caller because of its own malformed output.
    """
    try:
        parsed = json.loads(_strip_code_fence(raw))
    except json.JSONDecodeError:
        return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}

    # json.loads may legally return a str/list/number; only objects are usable.
    if not isinstance(parsed, dict):
        return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}

    if parsed.get("verdict") not in _VALID_VERDICTS:
        return {"verdict": "PASS", "reason": "Verifier returned an unknown verdict."}

    return parsed


# NOTE(review): max_inputs=2 lets two verify() calls run against the same
# ``self.llm`` instance in one container. Confirm that is safe for
# llama-cpp-python, or serialize the calls.
@app.cls(
    image=llama_image,
    gpu="A10G",
    volumes={MODEL_DIR: model_volume},
    scaledown_window=60,
)
@modal.concurrent(max_inputs=2)
class Verifier:
    """Loads the GGUF model once per container and verifies drafted code."""

    @modal.enter()
    def load_model(self):
        """Load the model from the mounted volume when the container starts."""
        from llama_cpp import Llama

        self.llm = Llama(
            model_path=f"{MODEL_DIR}/{MODEL_FILENAME}",
            n_gpu_layers=-1,
            n_ctx=8192,
            n_batch=512,
            verbose=False,
        )

    @modal.method()
    def verify(self, prompt: str, draft_code: str, language: str = "python") -> dict:
        """Ask the model to review *draft_code* and return its verdict.

        Args:
            prompt: The original request the draft was written for.
            draft_code: The drafted code to review.
            language: Language name used in the prompt and the code fence.

        Returns:
            A dict with a ``"verdict"`` of ``PASS``, ``FIX`` or ``REWRITE``,
            plus whatever other keys the model supplied. If the reply cannot
            be interpreted, a ``PASS`` verdict with a ``"reason"`` is returned.
        """
        # The prompt text is kept exactly as it was; it is split across
        # literals only for readability.
        system = (
            "You are a code verifier. "
            f"A smaller model drafted the following {language} code. "
            "Your job: "
            "1. Check for bugs, logic errors, type errors, off-by-one errors, and security issues. "
            '2. If the code is correct, respond with exactly: {"verdict": "PASS"} '
            '3. If fixable, respond with: {"verdict": "FIX", "corrected_code": "", "reason": ""} '
            "4. \n"
            'If fundamentally wrong, respond with: {"verdict": "REWRITE", "corrected_code": "", "reason": ""} '
            "Respond ONLY with valid JSON. No markdown, no explanation outside the JSON."
        )
        user = f"Original prompt:\n{prompt}\n\nDrafted code:\n```{language}\n{draft_code}\n```"

        response = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            max_tokens=1024,
            temperature=0.1,
        )

        # Content can be None (e.g. an empty completion); treat it as empty.
        content = response["choices"][0]["message"]["content"]
        return _parse_verifier_response(content or "")


api_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "fastapi", "uvicorn", "pydantic"
)


@app.function(
    image=api_image,
    scaledown_window=30,
)
@modal.asgi_app()
def verifier_endpoint():
    """Build the FastAPI app that fronts the ``Verifier`` class."""
    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel

    web_app = FastAPI()
    # CORS is wide open: any origin, method and header is accepted.
    web_app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    class VerifyRequest(BaseModel):
        prompt: str
        draft_code: str
        language: str = "python"

    @web_app.post("/verify")
    async def verify(req: VerifyRequest):
        # Forward to the GPU class and relay its verdict dict unchanged.
        return await Verifier().verify.remote.aio(req.prompt, req.draft_code, req.language)

    @web_app.get("/health")
    async def health():
        return {"ok": True}

    return web_app


@app.function()
def warm_once():
    """Manually warm the verifier before a live demo if you want lower first-hit latency."""
    Verifier().verify.remote("test", "print('hello')", "python")