# Hugging Face Space: optimized Qwen2.5-Coder-1.5B code evaluator (FastAPI).
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import llama_cpp
app = FastAPI()

# --- Highest-optimization settings for the 1.5B model ---
# NOTE(review): loading happens at import time; the first start downloads the
# GGUF file from the Hugging Face Hub, so startup may be slow.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
    filename="*q4_k_m.gguf",          # pick the Q4_K_M quantization variant
    n_ctx=1024,                       # small context window => less KV-cache memory
    n_threads=2,                      # match your physical cores
    n_batch=512,
    flash_attn=True,
    # BUG FIX: the llama-cpp-python keyword is `use_mlock`, not `n_mlock`.
    # The old name fell into Llama's ignored **kwargs, so the model was never
    # actually locked in RAM despite the comment's intent.
    use_mlock=True,                   # keep model in RAM for consistent speed
    type_k=llama_cpp.GGML_TYPE_Q4_0,  # 4-bit KV-cache keys for faster processing
    verbose=False,
)
class EvalRequest(BaseModel):
    """Request body for POST /evaluate."""
    task_description: str  # what the submitted code is supposed to accomplish
    python_code: str  # the candidate Python source to evaluate
@app.get("/")
async def health_check():
    """Liveness probe: confirm the evaluator service is up and the model is loaded."""
    payload = {
        "status": "Online",
        "message": "Optimized 1.5B Evaluator Ready",
    }
    return payload
@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Evaluate submitted Python code against a task description.

    Returns {"evaluation": <raw model output>}; the model is instructed to emit
    a JSON object of the form {"score": int, "feedback": str, "improvements": list}.
    """
    # Minimalist prompt for faster processing.
    prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"
    system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=400,
        temperature=0.1,
        repeat_penalty=1.1,
        # BUG FIX: removed stop=["}"]. Stop sequences are excluded from the
        # returned text, so the output always lost its closing brace (invalid
        # JSON), and any "}" inside a feedback string cut generation short.
        # response_format already constrains decoding to a complete JSON object.
        response_format={"type": "json_object"},
    )
    return {"evaluation": response["choices"][0]["message"]["content"]}