nryadav18 commited on
Commit
07fd26a
·
verified ·
1 Parent(s): b3b1b91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -18
app.py CHANGED
@@ -1,21 +1,17 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
- import llama_cpp
5
 
6
  app = FastAPI()
7
 
8
- # --- HIGHEST OPTIMIZATION FOR 1.5B ---
9
  llm = Llama.from_pretrained(
10
- repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
11
- filename="*q4_k_m.gguf",
12
- n_ctx=1024,
13
- n_threads=2, # Match your physical cores
14
  n_batch=512,
15
- flash_attn=True,
16
- n_mlock=True, # Keep model in RAM for consistent speed
17
- type_k=llama_cpp.GGML_TYPE_Q4_0, # 4-bit KV Cache for faster processing
18
- verbose=False
19
  )
20
 
21
  class EvalRequest(BaseModel):
@@ -24,25 +20,26 @@ class EvalRequest(BaseModel):
24
 
25
  @app.get("/")
26
  async def health_check():
27
- return {"status": "Online", "message": "Optimized 1.5B Evaluator Ready"}
28
 
29
  @app.post("/evaluate")
30
  async def evaluate_code(request: EvalRequest):
31
- # Minimalist prompt for faster processing
32
- prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"
33
 
34
- system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'
 
 
 
 
35
 
36
  response = llm.create_chat_completion(
37
  messages=[
38
  {"role": "system", "content": system_prompt},
39
  {"role": "user", "content": prompt}
40
  ],
41
- max_tokens=400,
42
  temperature=0.1,
43
- repeat_penalty=1.1,
44
- stop=["}"], # STOP IMMEDIATELY when JSON closes
45
  response_format={"type": "json_object"}
46
  )
47
 
48
- return {"evaluation": response['choices'][0]['message']['content']}
 
1
# FastAPI service that grades student Python code with a local
# llama.cpp-hosted Qwen2.5-Coder model.
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# --- OPTIMIZED 1.5B INITIALIZATION ---
# The model is loaded once at module import; every request handler below
# shares this single instance.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",  # 1.5B instruct model, GGUF format
    filename="*q4_k_m.gguf",  # glob pattern: select the Q4_K_M quantization from the repo
    n_ctx=1024,               # context window; smaller ctx reduces prompt-processing time
    n_threads=2,              # CPU threads — presumably matches the host's physical cores; confirm on deploy
    n_batch=512,              # prompt evaluation batch size
    flash_attn=True           # enable flash attention (effective on supported backends only)
)
16
 
17
  class EvalRequest(BaseModel):
 
20
 
21
@app.get("/")
async def health_check():
    """Liveness probe: report that the service is up and ready."""
    payload = {
        "status": "Online",
        "message": "1.5B AI Code Evaluator is running!",
    }
    return payload
24
 
25
@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Evaluate a student's Python submission with the local LLM.

    Args:
        request: EvalRequest carrying ``task_description`` and ``python_code``.

    Returns:
        ``{"evaluation": <str>}`` — the raw model output, which the model is
        instructed (and constrained via ``response_format``) to emit as a JSON
        object with ``score``, ``feedback`` and ``improvements`` fields.
    """
    import asyncio
    from functools import partial

    prompt = f"{request.task_description}\n\nStudent Code:\n{request.python_code}"

    # 1.5B is smart enough that we can make the prompt shorter (saving time)
    system_prompt = """You are an encouraging Python tutor. Evaluate the code.
RULES: Score 0-100 integer. Praise effort first. Give 1-2 friendly tips.
Output ONLY a valid JSON object matching this schema:
{"score": 85, "feedback": "Great job...", "improvements": ["Tip 1"]}"""

    # FIX: llama.cpp inference is CPU-bound and blocking. Calling it directly
    # inside an async route freezes the event loop for the full generation,
    # stalling every concurrent request (including the "/" health check).
    # Offload it to the default thread-pool executor instead.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        partial(
            llm.create_chat_completion,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=200,   # the JSON reply is short; capping saves generation time
            temperature=0.1,  # near-deterministic grading
            response_format={"type": "json_object"},
        ),
    )

    return {"evaluation": response['choices'][0]['message']['content']}