# Source: Hugging Face Space by nryadav18 — "Update app.py" (commit a0b3172, verified)
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
app = FastAPI()

# --- OPTIMIZED 1.5B INITIALIZATION ---
# Loads the quantized Qwen2.5-Coder model once at module import time; the first
# run downloads the matching GGUF file from the Hugging Face Hub.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",  # upgraded from a smaller model to 1.5B
    filename="*q4_k_m.gguf",  # Q4_K_M quantization: the sweet spot for speed vs. logic
    n_ctx=1024,  # REDUCED: halving context from 2048 massively speeds up prompt processing
    n_threads=2,  # CRITICAL: match physical CPU cores; over-allocating threads actively slows inference
    n_batch=512,  # prompt-evaluation batch size
    flash_attn=True  # NOTE(review): flash attention enabled for speed — benefit on CPU backends varies, confirm
)
class EvalRequest(BaseModel):
    """Request body for POST /evaluate."""

    # Plain-language description of the assignment the submitted code should solve.
    task_description: str
    # The student's Python source code, as a single string.
    python_code: str
@app.get("/")
async def health_check():
    """Liveness probe: report that the evaluator service is up."""
    status_payload = {
        "status": "Online",
        "message": "1.5B AI Code Evaluator is running!",
    }
    return status_payload
@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Grade a student's Python submission against its task description.

    Builds a JSON-constrained chat request for the local 1.5B model and
    returns the model's raw JSON string under the "evaluation" key.
    """
    user_prompt = f"{request.task_description}\n\nStudent Code:\n{request.python_code}"
    # 1.5B is smart enough that we can make the prompt shorter (saving time)
    system_prompt = """You are an encouraging Python tutor. Evaluate the code.
RULES: Score 0-100 integer. Praise effort first. Give 1-2 friendly tips.
Output ONLY a valid JSON object matching this schema:
{"score": 0 to 100 based on correctness of code with respect to task description, "feedback": "Great job...", "improvements": ["Tip 1"]}"""
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = llm.create_chat_completion(
        messages=chat_messages,
        max_tokens=512,  # the JSON payload is short; capping saves generation time
        temperature=0.1,
        response_format={"type": "json_object"},  # constrain decoding to valid JSON
    )
    return {"evaluation": completion["choices"][0]["message"]["content"]}