nryadav18 commited on
Commit
07fd26a
·
verified ·
1 Parent(s): b3b1b91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -18
app.py CHANGED
@@ -1,21 +1,17 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
- import llama_cpp
5
 
6
  app = FastAPI()
7
 
8
- # --- HIGHEST OPTIMIZATION FOR 1.5B ---
9
  llm = Llama.from_pretrained(
10
- repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
11
- filename="*q4_k_m.gguf",
12
- n_ctx=1024,
13
- n_threads=2, # Match your physical cores
14
  n_batch=512,
15
- flash_attn=True,
16
- n_mlock=True, # Keep model in RAM for consistent speed
17
- type_k=llama_cpp.GGML_TYPE_Q4_0, # 4-bit KV Cache for faster processing
18
- verbose=False
19
  )
20
 
21
  class EvalRequest(BaseModel):
@@ -24,25 +20,26 @@ class EvalRequest(BaseModel):
24
 
25
  @app.get("/")
26
  async def health_check():
27
- return {"status": "Online", "message": "Optimized 1.5B Evaluator Ready"}
28
 
29
  @app.post("/evaluate")
30
  async def evaluate_code(request: EvalRequest):
31
- # Minimalist prompt for faster processing
32
- prompt = f"TASK: {request.task_description}\n\nCODE:\n{request.python_code}\n\nEVALUATE:"
33
 
34
- system_prompt = 'You are a Python tutor. Output ONLY JSON: {"score": int, "feedback": str, "improvements": list}'
 
 
 
 
35
 
36
  response = llm.create_chat_completion(
37
  messages=[
38
  {"role": "system", "content": system_prompt},
39
  {"role": "user", "content": prompt}
40
  ],
41
- max_tokens=400,
42
  temperature=0.1,
43
- repeat_penalty=1.1,
44
- stop=["}"], # STOP IMMEDIATELY when JSON closes
45
  response_format={"type": "json_object"}
46
  )
47
 
48
- return {"evaluation": response['choices'][0]['message']['content']}
 
1
# FastAPI service that grades student Python code with a local
# llama.cpp-hosted Qwen2.5-Coder model.
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# --- OPTIMIZED 1.5B INITIALIZATION ---
# The model is loaded once at module import; every request handler below
# shares this single instance.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",  # 1.5B instruct model, GGUF format
    filename="*q4_k_m.gguf",  # glob pattern: select the Q4_K_M quantization from the repo
    n_ctx=1024,               # context window; smaller ctx reduces prompt-processing time
    n_threads=2,              # CPU threads — presumably matches the host's physical cores; confirm on deploy
    n_batch=512,              # prompt evaluation batch size
    flash_attn=True           # enable flash attention (effective on supported backends only)
)
16
 
17
  class EvalRequest(BaseModel):
 
20
 
21
@app.get("/")
async def health_check():
    """Liveness probe: report that the service is up and ready."""
    payload = {
        "status": "Online",
        "message": "1.5B AI Code Evaluator is running!",
    }
    return payload
24
 
25
@app.post("/evaluate")
async def evaluate_code(request: EvalRequest):
    """Evaluate a student's Python submission with the local LLM.

    Args:
        request: EvalRequest carrying ``task_description`` and ``python_code``.

    Returns:
        ``{"evaluation": <str>}`` — the raw model output, which the model is
        instructed (and constrained via ``response_format``) to emit as a JSON
        object with ``score``, ``feedback`` and ``improvements`` fields.
    """
    import asyncio
    from functools import partial

    prompt = f"{request.task_description}\n\nStudent Code:\n{request.python_code}"

    # 1.5B is smart enough that we can make the prompt shorter (saving time)
    system_prompt = """You are an encouraging Python tutor. Evaluate the code.
RULES: Score 0-100 integer. Praise effort first. Give 1-2 friendly tips.
Output ONLY a valid JSON object matching this schema:
{"score": 85, "feedback": "Great job...", "improvements": ["Tip 1"]}"""

    # FIX: llama.cpp inference is CPU-bound and blocking. Calling it directly
    # inside an async route freezes the event loop for the full generation,
    # stalling every concurrent request (including the "/" health check).
    # Offload it to the default thread-pool executor instead.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        partial(
            llm.create_chat_completion,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=200,   # the JSON reply is short; capping saves generation time
            temperature=0.1,  # near-deterministic grading
            response_format={"type": "json_object"},
        ),
    )

    return {"evaluation": response['choices'][0]['message']['content']}