Luigi committed on
Commit
813e74d
·
1 Parent(s): 3893e85

feat: Add ERNIE 21B IQ2_XS variant - more stable quantization

Browse files

- Added ernie_21b_thinking_q2 model with IQ2_XS (2-bit) quantization
- Keeps original TQ1_0 variant for comparison/testing
- IQ2_XS should fix generation failures caused by experimental TQ1_0
- Same repo (unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF), different quant file

Files changed (1) hide show
  1. app.py +16 -0
app.py CHANGED
@@ -218,6 +218,20 @@ AVAILABLE_MODELS = {
218
  "repeat_penalty": 1.15,
219
  },
220
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  }
222
 
223
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
@@ -272,6 +286,8 @@ def load_model(model_key: str = None) -> Tuple[Llama, str]:
272
  n_gpu_layers=0, # CPU only
273
  verbose=False,
274
  seed=1337,
 
 
275
  )
276
 
277
  current_model_key = model_key
 
218
  "repeat_penalty": 1.15,
219
  },
220
  },
221
+ "ernie_21b_thinking_q2": {
222
+ "name": "ERNIE-4.5 21B Thinking Q2 (128K Context)",
223
+ "repo_id": "unsloth/ERNIE-4.5-21B-A3B-Thinking-GGUF",
224
+ "filename": "*IQ2_XS.gguf",
225
+ "max_context": 131072,
226
+ "default_temperature": 0.6,
227
+ "supports_toggle": False,
228
+ "inference_settings": {
229
+ "temperature": 0.3,
230
+ "top_p": 0.9,
231
+ "top_k": 30,
232
+ "repeat_penalty": 1.15,
233
+ },
234
+ },
235
  }
236
 
237
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
 
286
  n_gpu_layers=0, # CPU only
287
  verbose=False,
288
  seed=1337,
289
+ v_type=2,
290
+ k_type=2,
291
  )
292
 
293
  current_model_key = model_key