Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1261,16 +1261,16 @@ class ZeroEngine:
|
|
| 1261 |
# IGNORE user batch size - use conservative settings for reliability
|
| 1262 |
logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
|
| 1263 |
|
| 1264 |
-
#
|
| 1265 |
-
optimal_batch = 512 #
|
| 1266 |
-
optimal_ctx = 1024 #
|
| 1267 |
optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
|
| 1268 |
|
| 1269 |
-
logger.info(f"[RAM]
|
| 1270 |
|
| 1271 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1272 |
if model_format == "gemma":
|
| 1273 |
-
optimal_ctx = 512 #
|
| 1274 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1275 |
|
| 1276 |
# Apply CPU optimizations before model loading
|
|
@@ -1286,62 +1286,18 @@ class ZeroEngine:
|
|
| 1286 |
if is_cached:
|
| 1287 |
model_cache.preload_cache(path)
|
| 1288 |
|
| 1289 |
-
#
|
| 1290 |
init_params = {
|
| 1291 |
"model_path": path,
|
| 1292 |
"n_ctx": optimal_ctx,
|
| 1293 |
-
"n_threads": optimal_threads,
|
| 1294 |
-
"n_threads_batch": optimal_threads,
|
| 1295 |
-
"use_mmap":
|
| 1296 |
-
"use_mlock":
|
| 1297 |
-
"n_batch": optimal_batch,
|
| 1298 |
-
"n_gpu_layers": 0,
|
| 1299 |
-
"rope_scaling_type": 0,
|
| 1300 |
-
"rope_freq_scale": ROPE_SCALING,
|
| 1301 |
"verbose": False,
|
| 1302 |
-
"
|
| 1303 |
-
"embedding": False, # No embeddings
|
| 1304 |
-
"f16_kv": False, # Quantized KV cache
|
| 1305 |
-
# ULTRA AGGRESSIVE SPEED OPTIMIZATIONS
|
| 1306 |
-
"type_k": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
|
| 1307 |
-
"type_v": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
|
| 1308 |
-
"offload_kqv": OFFLOAD_KQV,
|
| 1309 |
-
"flash_attn": FLASH_ATTENTION,
|
| 1310 |
-
"use_scratch": True, # Use scratch buffer
|
| 1311 |
-
"no_kv_offload": True, # Keep KV in RAM
|
| 1312 |
-
"num_experts_used": 0, # No MoE for CPU
|
| 1313 |
-
"seed": -1, # Random seed
|
| 1314 |
-
"tensor_split": None, # No tensor splitting
|
| 1315 |
-
"main_gpu": 0, # CPU-only
|
| 1316 |
-
"device": "cpu", # Explicit CPU
|
| 1317 |
-
"lora_base": None, # No LoRA base
|
| 1318 |
-
"lora_scale": 1.0, # LoRA scale
|
| 1319 |
-
"clpp_k": 0, # No CLPP
|
| 1320 |
-
"numa": NUMA_OPTIMIZE, # NUMA if available
|
| 1321 |
-
"cfg_scale": 1.0, # No CFG
|
| 1322 |
-
"grammar": None, # No grammar constraints
|
| 1323 |
-
"chat_format": None, # Auto-detect
|
| 1324 |
-
"chat_handler": None, # Default handler
|
| 1325 |
-
"cache_prompt": True, # Cache prompts
|
| 1326 |
-
"cache_prompt_tokens": 256, # Prompt cache size
|
| 1327 |
-
"cache_all": False, # Don't cache all
|
| 1328 |
-
"draft_model": None, # No draft model
|
| 1329 |
-
"draft_model_n_ctx": 512, # Draft context
|
| 1330 |
-
"draft_model_n_gpu_layers": -1, # Auto-detect
|
| 1331 |
-
"speculative_max_draft_len": 5, # Speculative decoding
|
| 1332 |
-
"speculative_max_top_k": 4, # Speculative top-k
|
| 1333 |
-
"speculative_decoding": SPECULATIVE_DECODE, # Enable if available
|
| 1334 |
-
"speculative_min_draft_len": 1, # Min draft length
|
| 1335 |
-
"speculative_max_top_k": 4, # Max top-k for draft
|
| 1336 |
-
"speculative_min_top_k": 1, # Min top-k for draft
|
| 1337 |
-
"speculative_max_top_p": 0.95, # Max top-p for draft
|
| 1338 |
-
"speculative_min_top_p": 0.1, # Min top-p for draft
|
| 1339 |
-
"speculative_max_temp": 1.0, # Max temp for draft
|
| 1340 |
-
"speculative_min_temp": 0.1, # Min temp for draft
|
| 1341 |
-
"speculative_eta": 0.1, # Eta for draft
|
| 1342 |
-
"speculative_tau": 5.0, # Tau for draft
|
| 1343 |
-
"speculative_gamma": 1.0, # Gamma for draft
|
| 1344 |
-
"speculative_delta": 0.1, # Delta for draft
|
| 1345 |
}
|
| 1346 |
|
| 1347 |
# Remove None values to avoid llama.cpp errors
|
|
|
|
| 1261 |
# IGNORE user batch size - use conservative settings for reliability
|
| 1262 |
logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
|
| 1263 |
|
| 1264 |
+
# ORIGINAL WORKING SETTINGS WITH HARD-CODED RAM
|
| 1265 |
+
optimal_batch = 512 # ORIGINAL: Working batch size
|
| 1266 |
+
optimal_ctx = 1024 # ORIGINAL: Working context size
|
| 1267 |
optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
|
| 1268 |
|
| 1269 |
+
logger.info(f"[RAM] ORIGINAL: batch={optimal_batch}, ctx={optimal_ctx}")
|
| 1270 |
|
| 1271 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1272 |
if model_format == "gemma":
|
| 1273 |
+
optimal_ctx = 512 # ORIGINAL for Gemma
|
| 1274 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1275 |
|
| 1276 |
# Apply CPU optimizations before model loading
|
|
|
|
| 1286 |
if is_cached:
|
| 1287 |
model_cache.preload_cache(path)
|
| 1288 |
|
| 1289 |
+
# SIMPLIFIED CPU-ONLY INITIALIZATION FOR RELIABILITY
|
| 1290 |
init_params = {
|
| 1291 |
"model_path": path,
|
| 1292 |
"n_ctx": optimal_ctx,
|
| 1293 |
+
"n_threads": optimal_threads,
|
| 1294 |
+
"n_threads_batch": optimal_threads,
|
| 1295 |
+
"use_mmap": True,
|
| 1296 |
+
"use_mlock": False,
|
| 1297 |
+
"n_batch": optimal_batch,
|
| 1298 |
+
"n_gpu_layers": 0,
|
|
|
|
|
|
|
| 1299 |
"verbose": False,
|
| 1300 |
+
"seed": -1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1301 |
}
|
| 1302 |
|
| 1303 |
# Remove None values to avoid llama.cpp errors
|