from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import llama_cpp

app = FastAPI()

# --- HIGHEST OPTIMIZATION FOR 1.5B ---
# Small instruct model quantized to Q4_K_M with a 4-bit KV cache: tuned for
# low-latency CPU inference on a 2-core box.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
    filename="*q4_k_m.gguf",
    n_ctx=1024,
    n_threads=2,  # Match your physical cores
    n_batch=512,
    flash_attn=True,  # Required for the quantized KV cache (type_k below)
    # FIX: was `n_mlock=True` — not a real parameter; Llama.__init__ swallows
    # unknown kwargs silently, so the model was never mlock'd. The correct
    # keyword is `use_mlock` (keep model in RAM for consistent speed).
    use_mlock=True,
    type_k=llama_cpp.GGML_TYPE_Q4_0,  # 4-bit KV Cache for faster processing
    verbose=False,
)


class EvalRequest(BaseModel):
    """Request payload: the assignment text and the student's submission."""

    task_description: str
    python_code: str


@app.get("/")
async def health_check():
    """Liveness probe — reports that the model is loaded and serving."""
    return {"status": "Online", "message": "Optimized 1.5B Evaluator Ready"}


@app.post("/evaluate")
def evaluate_code(request: EvalRequest):
    """Score `request.python_code` against `request.task_description`.

    Returns {"evaluation": <raw JSON string emitted by the model>}.

    FIX: declared as plain `def`, not `async def` — `create_chat_completion`
    is a blocking, CPU-bound call; as a sync handler FastAPI runs it in the
    threadpool instead of stalling the event loop for every other request.
    """
    # Minimalist prompt for faster processing
    prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"
    system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=400,
        temperature=0.1,
        repeat_penalty=1.1,
        # FIX: removed stop=["}"] — stop strings are *excluded* from the
        # returned text, so the content always arrived without its closing
        # brace (invalid JSON), and generation also truncated at the first
        # '}' inside the feedback string. The json_object response_format
        # already constrains generation to a complete JSON object.
        response_format={"type": "json_object"},
    )
    return {"evaluation": response["choices"][0]["message"]["content"]}