# NOTE(review): removed web-scrape artifacts (file-size banner, commit hashes,
# and viewer gutter line numbers) that preceded the code and made this file
# invalid Python. No source content was lost — the artifacts carried none.
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import llama_cpp

app = FastAPI()

# --- HIGHEST OPTIMIZATION FOR 1.5B ---
# Load the 4-bit quantized Qwen2.5-Coder 1.5B model once at import time so
# every request reuses the same in-memory model.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
    filename="*q4_k_m.gguf",          # select the Q4_K_M quantization from the repo
    n_ctx=1024,                       # small context window keeps the KV cache tiny
    n_threads=2,                      # Match your physical cores
    n_batch=512,
    flash_attn=True,
    # FIX: was `n_mlock=True` — not a real parameter. Llama.__init__ accepts
    # **kwargs, so the typo was silently ignored and the model was never
    # pinned in RAM. `use_mlock` is the actual option.
    use_mlock=True,                   # Keep model in RAM for consistent speed
    type_k=llama_cpp.GGML_TYPE_Q4_0,  # 4-bit KV cache key type for faster processing
    verbose=False,
)

class EvalRequest(BaseModel):
    """Request body for POST /evaluate: the task and the code to grade."""

    # Natural-language description of the exercise the submitted code solves.
    task_description: str
    # The Python source code to be evaluated against the task description.
    python_code: str

@app.get("/")
async def health_check():
    """Liveness probe: reports that the service (and its model) is up."""
    payload = {
        "status": "Online",
        "message": "Optimized 1.5B Evaluator Ready",
    }
    return payload

@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Grade ``request.python_code`` against ``request.task_description``.

    Returns ``{"evaluation": <str>}`` where the string is the model's JSON
    object of the shape ``{"score": int, "feedback": str, "improvements": list}``.
    """
    # Minimalist prompt for faster processing
    prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"

    system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        max_tokens=400,
        temperature=0.1,
        repeat_penalty=1.1,
        # FIX: removed stop=["}"] — stop strings are NOT included in the
        # returned text, so the response was always truncated before its
        # closing brace (invalid JSON), and the first "}" of any nested
        # object (e.g. inside "improvements") ended generation early.
        # response_format below already constrains output to one JSON object.
        response_format={"type": "json_object"}
    )

    return {"evaluation": response['choices'][0]['message']['content']}