scriptsledge committed on
Commit
f7181e6
·
verified ·
1 Parent(s): 1102a35

perf: switch to 1.5B Q2_K quantization for lowest possible latency on CPU

Browse files
Files changed (1) hide show
  1. model_service.py +3 -3
model_service.py CHANGED
@@ -3,10 +3,10 @@ from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
- # Using the 4-bit quantized version of Qwen 2.5 Coder 1.5B
7
- # This is the fastest option for 2 vCPU hardware while maintaining good coding intelligence.
8
  REPO_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
9
- FILENAME = "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"
10
 
11
  print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
  print(f"Target Model: {REPO_ID} [{FILENAME}]")
 
3
  from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
+ # Using the ultra-compressed 2-bit version of Qwen 2.5 Coder 1.5B
7
+ # This is extremely fast and has very low memory usage, though intelligence may vary.
8
  REPO_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF"
9
+ FILENAME = "qwen2.5-coder-1.5b-instruct-q2_k.gguf"
10
 
11
  print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
  print(f"Target Model: {REPO_ID} [{FILENAME}]")