"""FastAPI service that scores student Python code with a local 1.5B LLM."""

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# --- OPTIMIZED 1.5B INITIALIZATION ---
# Loaded once at module import so every request shares the same model instance.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",  # Upgraded to 1.5B
    filename="*q4_k_m.gguf",   # Q4 quantization is the sweet spot for speed vs. logic
    n_ctx=1024,                # REDUCED: halving context from 2048 speeds up prompt processing
    n_threads=2,               # CRITICAL: set to physical CPU cores; over-allocating threads slows it down
    n_batch=512,
    flash_attn=True,           # SPEED BOOST: Flash Attention for faster memory access
)


class EvalRequest(BaseModel):
    # Request body for POST /evaluate.
    task_description: str  # what the student was asked to implement
    python_code: str       # the student's submission, evaluated as-is


@app.get("/")
async def health_check():
    """Liveness probe: confirms the service (and loaded model) is up."""
    return {"status": "Online", "message": "1.5B AI Code Evaluator is running!"}


@app.post("/evaluate")
def evaluate_code(request: EvalRequest):
    """Score `request.python_code` against `request.task_description`.

    Returns {"evaluation": <JSON string>} where the string is the model's
    JSON object with "score", "feedback" and "improvements" keys (content
    is model-generated and not re-validated here).

    NOTE: declared as a plain `def` (not `async def`) on purpose —
    `llm.create_chat_completion` is a blocking, CPU-bound call. In an
    `async def` endpoint it would stall the whole event loop for the
    duration of inference; as a sync endpoint FastAPI runs it in its
    threadpool, keeping `/` and other requests responsive.
    """
    prompt = f"{request.task_description}\n\nStudent Code:\n{request.python_code}"

    # 1.5B is smart enough that we can make the prompt shorter (saving time)
    system_prompt = """You are an encouraging Python tutor. Evaluate the code.
RULES: Score 0-100 integer. Praise effort first. Give 1-2 friendly tips.
Output ONLY a valid JSON object matching this schema:
{"score": 0 to 100 based on correctness of code with respect to task description, "feedback": "Great job...", "improvements": ["Tip 1"]}"""

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=512,    # REDUCED: the JSON is short; capping saves generation time
        temperature=0.1,   # near-deterministic scoring
        response_format={"type": "json_object"},  # constrain output to valid JSON
    )
    return {"evaluation": response['choices'][0]['message']['content']}