Spaces:

build-small-hackathon
/

split-brain-copilot

Running

File size: 4,300 Bytes

import json

import modal


# Modal application object; every function and class below registers itself
# on it through the @app.function / @app.cls decorators.
app = modal.App("split-brain-verifier")

# Named Modal volume that stores the model weights. It is created on first
# reference (create_if_missing=True) and is mounted at MODEL_DIR by both
# download_model (which writes the file) and Verifier (which reads it).
model_volume = modal.Volume.from_name("qwen-14b-volume", create_if_missing=True)

# Mount point of model_volume inside the containers.
MODEL_DIR = "/models"
# GGUF file that download_model fetches and Verifier.load_model opens.
MODEL_FILENAME = "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
# Hugging Face repository the GGUF file is downloaded from.
MODEL_REPO = "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF"

# Container image used only by download_model: Python 3.11 plus the
# huggingface-hub client, with no inference dependencies.
download_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "huggingface-hub"
)


@app.function(
    image=download_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model():
    """Fetch the GGUF weights from Hugging Face into the shared volume.

    Downloads MODEL_FILENAME from MODEL_REPO into MODEL_DIR (where
    model_volume is mounted), commits the volume, and prints the
    destination path.
    """
    from huggingface_hub import hf_hub_download

    download_kwargs = {
        "repo_id": MODEL_REPO,
        "filename": MODEL_FILENAME,
        "local_dir": MODEL_DIR,
    }
    hf_hub_download(**download_kwargs)

    # Commit so the freshly written file is saved to the volume.
    model_volume.commit()

    destination = f"{MODEL_DIR}/{MODEL_FILENAME}"
    print(f"Downloaded to {destination}")


# Container image for the GPU-backed Verifier class: a Debian slim base with
# a C/C++ toolchain, a from-source llama.cpp checkout, and the
# llama-cpp-python bindings that Verifier.load_model imports.
#
# NOTE(review): neither the cmake invocation nor the llama-cpp-python install
# below passes a GPU backend flag, and the base image is plain debian_slim.
# This presumably yields a CPU-only build, in which case n_gpu_layers=-1 in
# Verifier.load_model would not offload anything to the A10G — confirm.
# NOTE(review): the git clone is unpinned (no tag/commit), so image rebuilds
# are not reproducible — consider pinning.
# NOTE(review): Verifier only imports `llama_cpp`; the /llama.cpp build may be
# unused at runtime — verify before removing.
llama_image = (
    modal.Image.debian_slim(python_version="3.11")
    # Toolchain needed to compile native code during the image build.
    .apt_install("build-essential", "cmake", "git", "libgomp1")
    .run_commands(
        # Fetch the llama.cpp sources (default branch, full history).
        "git clone https://github.com/ggerganov/llama.cpp /llama.cpp",
        # Configure with curl support disabled, then build with all cores.
        "cd /llama.cpp && cmake -B build -DLLAMA_CURL=OFF && cmake --build build --config Release -j$(nproc)",
        # Editable install of the Python package defined at the repo root.
        "cd /llama.cpp && pip install -e .",
    )
    # Python bindings used for inference, plus web packages.
    .pip_install("llama-cpp-python==0.3.4", "fastapi", "uvicorn", "pydantic")
)


@app.cls(
    image=llama_image,
    gpu="A10G",
    volumes={MODEL_DIR: model_volume},
    scaledown_window=60,
)
@modal.concurrent(max_inputs=2)
class Verifier:
    """Modal class that reviews drafted code with a local GGUF model.

    The model is loaded once per container in ``load_model`` and reused by
    every ``verify`` call. Because ``max_inputs=2`` lets two inputs run in
    the same container at once, access to the single shared ``Llama``
    instance is serialized with a lock.
    """

    # Verdicts the caller is allowed to receive. A tuple (not a set) so that
    # an unhashable value coming back from the model cannot raise TypeError.
    _VALID_VERDICTS = ("PASS", "FIX", "REWRITE")

    @modal.enter()
    def load_model(self):
        """Load the GGUF weights from the mounted volume when the container starts."""
        import threading

        from llama_cpp import Llama

        # One Llama instance is shared by all concurrent inputs; llama-cpp-python
        # does not advertise it as safe for simultaneous calls, so inference is
        # guarded by this lock.
        self._llm_lock = threading.Lock()
        self.llm = Llama(
            model_path=f"{MODEL_DIR}/{MODEL_FILENAME}",
            n_gpu_layers=-1,
            n_ctx=8192,
            n_batch=512,
            verbose=False,
        )

    def _parse_verdict(self, raw: str) -> dict:
        """Turn the model's raw reply into a validated verdict dict.

        Accepts bare JSON, JSON wrapped in a Markdown code fence, and JSON
        surrounded by stray prose. Anything that still cannot be read as an
        object with a known ``verdict`` falls back to a ``PASS`` verdict with
        an explanatory ``reason`` (the verifier deliberately fails open).
        """
        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line (including any language tag) and
            # everything from the closing fence onwards.
            _, _, text = text.partition("\n")
            text = text.rsplit("```", 1)[0].strip()

        candidates = [text]
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            # Second chance: the outermost {...} span inside surrounding prose.
            candidates.append(text[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            break
        else:
            return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}

        # Valid JSON that is not an object (e.g. "PASS" or a list) has no
        # .get(); treat it the same as an unrecognized verdict.
        if not isinstance(parsed, dict) or parsed.get("verdict") not in self._VALID_VERDICTS:
            return {"verdict": "PASS", "reason": "Verifier returned an unknown verdict."}
        return parsed

    @modal.method()
    def verify(self, prompt: str, draft_code: str, language: str = "python") -> dict:
        """Ask the model to review ``draft_code`` and return its verdict.

        Args:
            prompt: The original request the draft was written for.
            draft_code: The code produced by the drafting model.
            language: Language name used in the instructions and code fence.

        Returns:
            A dict whose ``verdict`` is ``"PASS"``, ``"FIX"`` or ``"REWRITE"``.
            Other keys (``corrected_code``, ``reason``) are whatever the model
            supplied. Unreadable or unrecognized replies yield a ``PASS``
            verdict with a ``reason`` describing the problem.
        """
        system = f"""You are a code verifier. A smaller model drafted the following {language} code.
Your job:
1. Check for bugs, logic errors, type errors, off-by-one errors, and security issues.
2. If the code is correct, respond with exactly: {{"verdict": "PASS"}}
3. If fixable, respond with: {{"verdict": "FIX", "corrected_code": "<fixed code here>", "reason": "<one line>"}}
4. If fundamentally wrong, respond with: {{"verdict": "REWRITE", "corrected_code": "<rewritten code>", "reason": "<one line>"}}
Respond ONLY with valid JSON. No markdown, no explanation outside the JSON."""

        user = f"Original prompt:\n{prompt}\n\nDrafted code:\n```{language}\n{draft_code}\n```"

        # Only the inference call needs the lock; prompt building and parsing
        # touch no shared state.
        with self._llm_lock:
            response = self.llm.create_chat_completion(
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=1024,
                temperature=0.1,
            )

        # Guard against a missing/None content field so .strip() cannot fail.
        content = response["choices"][0]["message"]["content"] or ""
        return self._parse_verdict(content)


# Container image for the HTTP front end (verifier_endpoint): Python 3.11 with
# the web stack only; it installs no model or inference packages.
api_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "fastapi", "uvicorn", "pydantic"
)


@app.function(
    image=api_image,
    scaledown_window=30,
)
@modal.asgi_app()
def verifier_endpoint():
    """Build and return the FastAPI app that fronts the Verifier class.

    Routes:
        POST /verify: forwards prompt, draft_code and language to
            Verifier.verify and returns its result.
        GET /health: returns {"ok": True}.
    """
    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel

    # Request body for POST /verify; `language` is optional.
    class VerifyRequest(BaseModel):
        prompt: str
        draft_code: str
        language: str = "python"

    # CORS is left fully open: any origin, method and header is accepted.
    cors_settings = {
        "allow_origins": ["*"],
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }

    web_api = FastAPI()
    web_api.add_middleware(CORSMiddleware, **cors_settings)

    @web_api.post("/verify")
    async def verify(req: VerifyRequest):
        # Hand the request to the GPU class and await its verdict.
        call_args = (req.prompt, req.draft_code, req.language)
        verdict = await Verifier().verify.remote.aio(*call_args)
        return verdict

    @web_api.get("/health")
    async def health():
        status = {"ok": True}
        return status

    return web_api


@app.function()
def warm_once():
    """Send one throwaway verification request to the Verifier.

    Intended to be run by hand ahead of a live demo so the first real
    request does not pay the start-up latency.
    """
    probe_prompt, probe_code, probe_language = "test", "print('hello')", "python"
    verifier = Verifier()
    verifier.verify.remote(probe_prompt, probe_code, probe_language)