scriptsledge committed on
Commit
f7181e6
·
verified ·
1 Parent(s): 1102a35

perf: switch to 1.5B Q2_K quantization for lowest possible latency on CPU

Browse files
Files changed (1) hide show
  1. model_service.py +3 -3
model_service.py CHANGED
@@ -3,10 +3,10 @@ from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
- # Using the 4-bit quantized version of Qwen 2.5 Coder 1.5B
7
- # This is the fastest option for 2 vCPU hardware while maintaining good coding intelligence.
8
  REPO_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
9
- FILENAME = "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
10
 
11
  print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
  print(f"Target Model: {REPO_ID} [{FILENAME}]")
 
3
  from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
+ # Using the ultra-compressed 2-bit version of Qwen 2.5 Coder 1.5B
7
+ # This is extremely fast and has very low memory usage, though intelligence may vary.
8
  REPO_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
9
+ FILENAME = "qwen2.5-coder-1.5b-instruct-q2_k.gguf"
10
 
11
  print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
  print(f"Target Model: {REPO_ID} [{FILENAME}]")