# Hugging Face Space: Qwen2.5-Coder-1.5B code-evaluation API (Space status shown as "Sleeping")
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import llama_cpp  # module import kept: GGML_TYPE_* constants are referenced below

# FastAPI application instance; route handlers are registered on it below.
app = FastAPI()
# --- HIGHEST OPTIMIZATION FOR 1.5B ---
# Loads a 4-bit-quantized Qwen2.5-Coder-1.5B-Instruct GGUF from the HF Hub.
# Module-level side effect: downloads/mmaps the model once at import time.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
    filename="*q4_k_m.gguf",          # glob: pick the Q4_K_M quantization
    n_ctx=1024,                       # small context window keeps RAM low
    n_threads=2,                      # match your physical cores
    n_batch=512,
    flash_attn=True,
    # BUG FIX: the original passed n_mlock=True, which is not a llama-cpp-python
    # parameter (it was silently swallowed by **kwargs). The real kwarg is
    # use_mlock — keep the model resident in RAM for consistent speed.
    use_mlock=True,
    type_k=llama_cpp.GGML_TYPE_Q4_0,  # 4-bit K cache for faster processing
    verbose=False,
)
class EvalRequest(BaseModel):
    """Request payload for code evaluation.

    Attributes:
        task_description: Plain-text statement of what the code should do.
        python_code: The candidate Python source to be evaluated.
    """

    task_description: str
    python_code: str
# BUG FIX: the handler was defined but never registered as a route, making it
# unreachable. Registered at "/" (path is a reasonable guess — confirm against
# the original deployment if a different path was intended).
@app.get("/")
async def health_check():
    """Liveness probe: report that the service and model are loaded."""
    return {"status": "Online", "message": "Optimized 1.5B Evaluator Ready"}
# BUG FIX: the handler was defined but never registered as a route, making it
# unreachable. Registered at "/evaluate" (path name is a reasonable guess —
# confirm against the original clients).
@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Score a Python submission against its task description with the LLM.

    Returns:
        {"evaluation": <raw model output>} — expected to be a JSON object
        string with keys "score", "feedback", "improvements".
    """
    # Minimalist prompt for faster processing
    prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"
    system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=400,
        temperature=0.1,
        repeat_penalty=1.1,
        # BUG FIX: the original passed stop=["}"], intending to halt when the
        # JSON closed. Stop sequences are EXCLUDED from the returned text, so
        # the content always lost its final "}" — and generation stopped at the
        # FIRST "}", even one inside a nested object — guaranteeing invalid
        # JSON. response_format below already constrains output to one JSON
        # object, so no stop sequence is needed.
        response_format={"type": "json_object"},
    )
    return {"evaluation": response['choices'][0]['message']['content']}