"""FastAPI service that scores student Python code with a local 1.5B LLM."""

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# --- OPTIMIZED 1.5B INITIALIZATION ---
# Loaded once at module import so every request shares the same model instance.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",  # Upgraded to 1.5B
    filename="*q4_k_m.gguf",   # Q4 quantization is the sweet spot for speed vs. logic
    n_ctx=1024,                # REDUCED: halving context from 2048 speeds up prompt processing
    n_threads=2,               # CRITICAL: set to physical CPU cores; over-allocating threads slows it down
    n_batch=512,
    flash_attn=True,           # SPEED BOOST: Flash Attention for faster memory access
)


class EvalRequest(BaseModel):
    # Request body for POST /evaluate.
    task_description: str  # what the student was asked to implement
    python_code: str       # the student's submission, evaluated as-is


@app.get("/")
async def health_check():
    """Liveness probe: confirms the service (and loaded model) is up."""
    return {"status": "Online", "message": "1.5B AI Code Evaluator is running!"}


@app.post("/evaluate")
def evaluate_code(request: EvalRequest):
    """Score `request.python_code` against `request.task_description`.

    Returns {"evaluation": <JSON string>} where the string is the model's
    JSON object with "score", "feedback" and "improvements" keys (content
    is model-generated and not re-validated here).

    NOTE: declared as a plain `def` (not `async def`) on purpose —
    `llm.create_chat_completion` is a blocking, CPU-bound call. In an
    `async def` endpoint it would stall the whole event loop for the
    duration of inference; as a sync endpoint FastAPI runs it in its
    threadpool, keeping `/` and other requests responsive.
    """
    prompt = f"{request.task_description}\n\nStudent Code:\n{request.python_code}"

    # 1.5B is smart enough that we can make the prompt shorter (saving time)
    system_prompt = """You are an encouraging Python tutor. Evaluate the code.
RULES: Score 0-100 integer. Praise effort first. Give 1-2 friendly tips.
Output ONLY a valid JSON object matching this schema:
{"score": 0 to 100 based on correctness of code with respect to task description, "feedback": "Great job...", "improvements": ["Tip 1"]}"""

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=512,    # REDUCED: the JSON is short; capping saves generation time
        temperature=0.1,   # near-deterministic scoring
        response_format={"type": "json_object"},  # constrain output to valid JSON
    )
    return {"evaluation": response['choices'][0]['message']['content']}