Spaces:
Starting
Starting
Update app.py
Browse files
app.py
CHANGED
|
@@ -1286,7 +1286,7 @@ class ZeroEngine:
|
|
| 1286 |
if is_cached:
|
| 1287 |
model_cache.preload_cache(path)
|
| 1288 |
|
| 1289 |
-
#
|
| 1290 |
init_params = {
|
| 1291 |
"model_path": path,
|
| 1292 |
"n_ctx": optimal_ctx,
|
|
@@ -1298,6 +1298,14 @@ class ZeroEngine:
|
|
| 1298 |
"n_gpu_layers": 0,
|
| 1299 |
"verbose": False,
|
| 1300 |
"seed": -1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1301 |
}
|
| 1302 |
|
| 1303 |
# Remove None values to avoid llama.cpp errors
|
|
|
|
| 1286 |
if is_cached:
|
| 1287 |
model_cache.preload_cache(path)
|
| 1288 |
|
| 1289 |
+
# ENHANCED CPU-ONLY INITIALIZATION WITH SPEED OPTIMIZATIONS
|
| 1290 |
init_params = {
|
| 1291 |
"model_path": path,
|
| 1292 |
"n_ctx": optimal_ctx,
|
|
|
|
| 1298 |
"n_gpu_layers": 0,
|
| 1299 |
"verbose": False,
|
| 1300 |
"seed": -1,
|
| 1301 |
+
# SPEED OPTIMIZATIONS
|
| 1302 |
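+
"f16_kv": True, # Request a 16-bit (f16) KV cache for speed; NOTE(review): type_k/type_v below also select a KV cache type — confirm the settings do not conflict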
|
| 1303 |
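+
"type_k": 2 if model_format != "gemma" else None, # Quantize cached keys (type id 2, presumably ggml Q4_0 — confirm); None for gemma so the entry is dropped by the None-removal step below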
|
| 1304 |
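+
"type_v": 2 if model_format != "gemma" else None, # Quantize cached values (type id 2, presumably ggml Q4_0 — confirm); None for gemma so the entry is dropped by the None-removal step below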
|
| 1305 |
+
"use_scratch": True, # Scratch buffer
|
| 1306 |
+
"cache_prompt": True, # Prompt caching
|
| 1307 |
+
"cache_prompt_tokens": 512, # Larger prompt cache
|
| 1308 |
+
"numa": True, # NUMA optimization
|
| 1309 |
}
|
| 1310 |
|
| 1311 |
# Remove None values to avoid llama.cpp errors
|