Spaces:
Sleeping
Sleeping
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# --- OPTIMIZED 1.5B INITIALIZATION ---
# The model is loaded once at module import time: import blocks until the GGUF
# file is resolved/downloaded, but requests never pay the load cost afterwards.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",  # upgraded to 1.5B
    filename="*q4_k_m.gguf",  # Q4_K_M quantization: sweet spot for speed vs. logic
    n_ctx=1024,       # REDUCED: halving context from 2048 speeds up prompt processing
    n_threads=2,      # CRITICAL: set to physical CPU cores; over-allocating threads slows inference
    n_batch=512,
    flash_attn=True,  # SPEED BOOST: Flash Attention for faster memory access
)
| class EvalRequest(BaseModel): | |
| task_description: str | |
| python_code: str | |
# NOTE(review): no route decorator was present in the source — without one this
# handler is never registered with FastAPI. The "/" path is an assumption; confirm
# against whatever probes/clients call this service.
@app.get("/")
async def health_check():
    """Liveness probe: reports that the service and its model are loaded."""
    return {"status": "Online", "message": "1.5B AI Code Evaluator is running!"}
# NOTE(review): no route decorator was present in the source — without one this
# handler is never registered. "/evaluate" + POST is an assumption (the handler
# takes a request body); confirm the path against the service's clients.
@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Score a student's Python code against a task description using the local LLM.

    Returns ``{"evaluation": <raw model output>}`` — the model is instructed to
    emit a JSON object, but the string is passed through unparsed; the caller is
    responsible for decoding it.
    """
    prompt = f"{request.task_description}\n\nStudent Code:\n{request.python_code}"

    # 1.5B is smart enough that we can keep the prompt short (saving time).
    system_prompt = """You are an encouraging Python tutor. Evaluate the code.
RULES: Score 0-100 integer. Praise effort first. Give 1-2 friendly tips.
Output ONLY a valid JSON object matching this schema:
{"score": 0 to 100 based on correctness of code with respect to task description, "feedback": "Great job...", "improvements": ["Tip 1"]}"""

    # NOTE(review): create_chat_completion is a blocking, CPU-bound call inside an
    # async handler — it stalls the event loop for the whole generation. Consider a
    # plain `def` handler (FastAPI runs it in a threadpool) — confirm the intended
    # concurrency model before changing.
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=512,   # REDUCED: the JSON is short; capping saves generation time
        temperature=0.1,  # near-deterministic output for consistent scoring
        response_format={"type": "json_object"},  # constrain decoding to valid JSON
    )
    return {"evaluation": response['choices'][0]['message']['content']}