Spaces:

scriptsledge
/

clarity-backend

Running

scriptsledge committed on Dec 20, 2025

Commit

1102a35

verified ·

1 Parent(s): 916f543

perf: switch to Qwen 2.5 Coder 1.5B for ultra-fast inference on 2 vCPU hardware

Files changed (1) hide show

model_service.py CHANGED Viewed

@@ -3,10 +3,10 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 # --- Configuration ---
-# Using the 4-bit quantized version of Qwen 2.5 Coder 3B
-# This fits comfortably in RAM and is much faster on CPU than the 7B version
-REPO_ID = "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"
-FILENAME = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
 print(f"Initializing Clarity AI Engine (llama.cpp)...")
 print(f"Target Model: {REPO_ID} [{FILENAME}]")
@@ -167,7 +167,7 @@ def correct_code_with_ai(code: str) -> dict:
         # llama-cpp-python chat completion
         response = llm.create_chat_completion(
             messages=messages,
-            max_tokens=1024, # Reduced to prevent CPU timeouts on free tier
             temperature=0.1, # Lower temperature for stricter adherence
         )

 from huggingface_hub import hf_hub_download
 # --- Configuration ---
+# Using the 4-bit quantized version of Qwen 2.5 Coder 1.5B
+# This is the fastest option for 2 vCPU hardware while maintaining good coding intelligence.
+REPO_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
+FILENAME = "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
 print(f"Initializing Clarity AI Engine (llama.cpp)...")
 print(f"Target Model: {REPO_ID} [{FILENAME}]")
         # llama-cpp-python chat completion
         response = llm.create_chat_completion(
             messages=messages,
+            max_tokens=1024, # Optimized for 1.5B speed
             temperature=0.1, # Lower temperature for stricter adherence
         )