File size: 4,300 Bytes
053ee0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd451e7
053ee0d
cd451e7
053ee0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd451e7
053ee0d
 
 
 
 
 
 
 
 
 
 
 
 
ea20131
 
 
053ee0d
 
 
ea20131
cd451e7
053ee0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea20131
053ee0d
 
 
 
 
 
 
 
cd451e7
 
 
053ee0d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import json

import modal


# Model location: the Hugging Face repo and file to fetch, and the path at
# which the model volume is mounted inside containers.
MODEL_REPO = "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF"
MODEL_FILENAME = "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
MODEL_DIR = "/models"

# Modal application that the functions and the class below register on.
app = modal.App("split-brain-verifier")

# Named volume that stores the downloaded weights; it is created if it does
# not exist yet.
model_volume = modal.Volume.from_name(
    "qwen-14b-volume",
    create_if_missing=True,
)

# Image for the download job: Python 3.11 plus the Hugging Face Hub client.
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface-hub")
)


@app.function(
    image=download_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
    # presumably supplies the Hugging Face token via environment -- verify
    # against the secret's definition.
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model():
    """Download the GGUF weights into the mounted model volume.

    Fetches ``MODEL_FILENAME`` from ``MODEL_REPO`` into ``MODEL_DIR``, commits
    the volume, and prints the destination path.
    """
    import huggingface_hub

    fetch_options = {
        "repo_id": MODEL_REPO,
        "filename": MODEL_FILENAME,
        "local_dir": MODEL_DIR,
    }
    huggingface_hub.hf_hub_download(**fetch_options)

    # Commit the volume after the download has written into MODEL_DIR.
    model_volume.commit()
    print(f"Downloaded to {MODEL_DIR}/{MODEL_FILENAME}")


# Shell steps that clone llama.cpp, build it with CMake, and pip-install the
# checkout in editable mode.
# NOTE(review): no code in this file references /llama.cpp after the image is
# built; inference goes through the separately pip-installed llama-cpp-python.
# Confirm whether this source build is still needed.
_LLAMA_CPP_BUILD_STEPS = (
    "git clone https://github.com/ggerganov/llama.cpp /llama.cpp",
    "cd /llama.cpp && cmake -B build -DLLAMA_CURL=OFF && cmake --build build --config Release -j$(nproc)",
    "cd /llama.cpp && pip install -e .",
)

# Image used by the Verifier class.
# NOTE(review): the base image is debian_slim and neither the CMake flags nor
# the pip install enable CUDA, so llama-cpp-python is presumably a CPU-only
# build even though the class requests a GPU -- confirm that layers are
# actually offloaded at runtime.
llama_image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("build-essential", "cmake", "git", "libgomp1")
    .run_commands(*_LLAMA_CPP_BUILD_STEPS)
    .pip_install("llama-cpp-python==0.3.4", "fastapi", "uvicorn", "pydantic")
)


# Verdicts callers may receive. A tuple (not a set) so that membership tests
# never need to hash a malformed, unhashable "verdict" value such as a list.
_VALID_VERDICTS = ("PASS", "FIX", "REWRITE")


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    if not text.startswith("```"):
        return text
    header_end = text.find("\n")
    if header_end == -1:
        return text
    # Drop the opening fence line (which may carry a language tag like "json").
    inner = text[header_end + 1:].rstrip()
    if inner.endswith("```"):
        inner = inner[:-3]
    return inner.strip()


def _parse_verifier_reply(raw):
    """Convert the model's raw reply into a verdict dict.

    The verifier is deliberately fail-open: any reply that cannot be
    interpreted is reported as ``PASS`` with a ``reason`` explaining why.

    Args:
        raw: The message content returned by the model (may be ``None``).

    Returns:
        The parsed JSON object when it is a dict with a known verdict (and,
        for FIX/REWRITE, a string ``corrected_code``); otherwise a PASS
        fallback dict.
    """
    if not isinstance(raw, str):
        return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}

    # Models often wrap JSON in a ```json fence despite being told not to.
    text = _strip_code_fence(raw.strip())
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}

    # Valid JSON is not necessarily an object (e.g. a bare string or a list).
    if not isinstance(parsed, dict):
        return {"verdict": "PASS", "reason": "Verifier response could not be parsed."}

    verdict = parsed.get("verdict")
    if verdict not in _VALID_VERDICTS:
        return {"verdict": "PASS", "reason": "Verifier returned an unknown verdict."}

    # FIX and REWRITE are only actionable when they carry replacement code.
    if verdict != "PASS" and not isinstance(parsed.get("corrected_code"), str):
        return {"verdict": "PASS", "reason": "Verifier returned no corrected code."}
    return parsed


@app.cls(
    image=llama_image,
    gpu="A10G",
    volumes={MODEL_DIR: model_volume},
    scaledown_window=60,
)
@modal.concurrent(max_inputs=2)
class Verifier:
    """Modal class that loads the GGUF model once per container and reviews drafted code."""

    @modal.enter()
    def load_model(self):
        """Load the model from the mounted volume when the container starts."""
        import threading

        from llama_cpp import Llama

        # max_inputs=2 allows two verify() calls in the same container at once.
        # A single Llama instance is not documented as safe for concurrent
        # generation, so calls into it are serialized with this lock.
        self._generation_lock = threading.Lock()
        self.llm = Llama(
            model_path=f"{MODEL_DIR}/{MODEL_FILENAME}",
            n_gpu_layers=-1,
            n_ctx=8192,
            n_batch=512,
            verbose=False,
        )

    @modal.method()
    def verify(self, prompt: str, draft_code: str, language: str = "python") -> dict:
        """Ask the model to review *draft_code* and return its verdict.

        Args:
            prompt: The original request the draft was written for.
            draft_code: The code produced by the drafting model.
            language: Language name used in the instructions and the code fence.

        Returns:
            A dict with ``"verdict"`` set to ``"PASS"``, ``"FIX"`` or
            ``"REWRITE"``. FIX and REWRITE results carry ``"corrected_code"``;
            a ``"reason"`` is included when the model or a fallback supplies
            one. Replies that cannot be interpreted yield a PASS fallback.
        """
        system = f"""You are a code verifier. A smaller model drafted the following {language} code.
Your job:
1. Check for bugs, logic errors, type errors, off-by-one errors, and security issues.
2. If the code is correct, respond with exactly: {{"verdict": "PASS"}}
3. If fixable, respond with: {{"verdict": "FIX", "corrected_code": "<fixed code here>", "reason": "<one line>"}}
4. If fundamentally wrong, respond with: {{"verdict": "REWRITE", "corrected_code": "<rewritten code>", "reason": "<one line>"}}
Respond ONLY with valid JSON. No markdown, no explanation outside the JSON."""

        user = f"Original prompt:\n{prompt}\n\nDrafted code:\n```{language}\n{draft_code}\n```"

        with self._generation_lock:
            response = self.llm.create_chat_completion(
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                # A reply cut off at this limit will not be valid JSON and
                # therefore falls back to PASS in the parser.
                max_tokens=1024,
                temperature=0.1,
            )
        return _parse_verifier_reply(response["choices"][0]["message"]["content"])


# Image for the HTTP front end: Python 3.11 with the web stack only.
api_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("fastapi", "uvicorn", "pydantic")
)


@app.function(
    image=api_image,
    scaledown_window=30,
)
@modal.asgi_app()
def verifier_endpoint():
    """Build and return the FastAPI application that fronts ``Verifier``.

    Routes:
        POST /verify: forwards ``prompt``, ``draft_code`` and ``language`` from
            the JSON body to ``Verifier.verify`` and returns its result.
        GET /health: returns ``{"ok": True}``.
    """
    import fastapi
    import pydantic
    from fastapi.middleware import cors

    class VerifyRequest(pydantic.BaseModel):
        # JSON body accepted by POST /verify; language defaults to "python".
        prompt: str
        draft_code: str
        language: str = "python"

    api = fastapi.FastAPI()

    # CORS is fully open: every origin, method and header is allowed.
    api.add_middleware(
        cors.CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    @api.post("/verify")
    async def verify(req: VerifyRequest):
        # Await the remote call so the event loop is not blocked while the
        # verifier runs.
        verdict = await Verifier().verify.remote.aio(
            req.prompt,
            req.draft_code,
            req.language,
        )
        return verdict

    @api.get("/health")
    async def health():
        return {"ok": True}

    return api


@app.function()
def warm_once():
    """Send one throwaway request to the verifier, e.g. before a live demo.

    The result is discarded; the call exists only so that the first real
    request does not pay the cold-start cost.
    NOTE(review): the Verifier class sets scaledown_window=60, so this
    presumably only helps if real traffic follows shortly after -- confirm.
    """
    verifier = Verifier()
    verifier.verify.remote(
        prompt="test",
        draft_code="print('hello')",
        language="python",
    )