Spaces:
Running
Running
feat: Add ERNIE 21B IQ2_XS variant - more stable quantization
Browse files- Added ernie_21b_thinking_q2 model with IQ2_XS (2-bit) quantization
- Keeps original TQ1_0 variant for comparison/testing
- IQ2_XS should fix generation failures caused by experimental TQ1_0
- Same repo (unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF), different quant file
app.py
CHANGED
|
@@ -218,6 +218,20 @@ AVAILABLE_MODELS = {
|
|
| 218 |
"repeat_penalty": 1.15,
|
| 219 |
},
|
| 220 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
}
|
| 222 |
|
| 223 |
DEFAULT_MODEL_KEY = "qwen3_600m_q4"
|
|
@@ -272,6 +286,8 @@ def load_model(model_key: str = None) -> Tuple[Llama, str]:
|
|
| 272 |
n_gpu_layers=0, # CPU only
|
| 273 |
verbose=False,
|
| 274 |
seed=1337,
|
|
|
|
|
|
|
| 275 |
)
|
| 276 |
|
| 277 |
current_model_key = model_key
|
|
|
|
| 218 |
"repeat_penalty": 1.15,
|
| 219 |
},
|
| 220 |
},
|
| 221 |
+
"ernie_21b_thinking_q2": {
|
| 222 |
+
"name": "ERNIE-4.5 21B Thinking Q2 (128K Context)",
|
| 223 |
+
"repo_id": "unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF",
|
| 224 |
+
"filename": "*IQ2_XS.gguf",
|
| 225 |
+
"max_context": 131072,
|
| 226 |
+
"default_temperature": 0.6,
|
| 227 |
+
"supports_toggle": False,
|
| 228 |
+
"inference_settings": {
|
| 229 |
+
"temperature": 0.3,
|
| 230 |
+
"top_p": 0.9,
|
| 231 |
+
"top_k": 30,
|
| 232 |
+
"repeat_penalty": 1.15,
|
| 233 |
+
},
|
| 234 |
+
},
|
| 235 |
}
|
| 236 |
|
| 237 |
DEFAULT_MODEL_KEY = "qwen3_600m_q4"
|
|
|
|
| 286 |
n_gpu_layers=0, # CPU only
|
| 287 |
verbose=False,
|
| 288 |
seed=1337,
|
| 289 |
+
type_v=2,  # NOTE(review): llama-cpp-python kwarg is `type_v`, not `v_type` — unknown kwargs are silently ignored
|
| 290 |
+
type_k=2,  # NOTE(review): kwarg is `type_k`, not `k_type`; 2 = GGML_TYPE_Q4_0 KV cache — confirm intent
|
| 291 |
)
|
| 292 |
|
| 293 |
current_model_key = model_key
|