turtle170 committed on
Commit
b91cca5
·
verified ·
1 Parent(s): 9f35548

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -57
app.py CHANGED
@@ -1261,16 +1261,16 @@ class ZeroEngine:
1261
  # IGNORE user batch size - use conservative settings for reliability
1262
  logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
1263
 
1264
- # OPTIMIZED SETTINGS FOR 16GB USABLE RAM
1265
- optimal_batch = 512 # OPTIMIZED: Good batch size
1266
- optimal_ctx = 1024 # OPTIMIZED: Good context size
1267
  optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
1268
 
1269
- logger.info(f"[RAM] OPTIMIZED: batch={optimal_batch}, ctx={optimal_ctx}")
1270
 
1271
  # Reduce context for Gemma models (they have 131K n_ctx_train)
1272
  if model_format == "gemma":
1273
- optimal_ctx = 512 # OPTIMIZED for Gemma
1274
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
1275
 
1276
  # Apply CPU optimizations before model loading
@@ -1286,62 +1286,18 @@ class ZeroEngine:
1286
  if is_cached:
1287
  model_cache.preload_cache(path)
1288
 
1289
- # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
1290
  init_params = {
1291
  "model_path": path,
1292
  "n_ctx": optimal_ctx,
1293
- "n_threads": optimal_threads, # Exactly 2 threads
1294
- "n_threads_batch": optimal_threads, # Batch threads = total threads
1295
- "use_mmap": USE_MMAP, # Memory-mapped I/O
1296
- "use_mlock": MLOCK_MODEL, # Let OS manage memory
1297
- "n_batch": optimal_batch, # Optimized batch size
1298
- "n_gpu_layers": 0, # CPU-only
1299
- "rope_scaling_type": 0,
1300
- "rope_freq_scale": ROPE_SCALING,
1301
  "verbose": False,
1302
- "logits_all": False, # Only final logits
1303
- "embedding": False, # No embeddings
1304
- "f16_kv": False, # Quantized KV cache
1305
- # ULTRA AGGRESSIVE SPEED OPTIMIZATIONS
1306
- "type_k": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
1307
- "type_v": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
1308
- "offload_kqv": OFFLOAD_KQV,
1309
- "flash_attn": FLASH_ATTENTION,
1310
- "use_scratch": True, # Use scratch buffer
1311
- "no_kv_offload": True, # Keep KV in RAM
1312
- "num_experts_used": 0, # No MoE for CPU
1313
- "seed": -1, # Random seed
1314
- "tensor_split": None, # No tensor splitting
1315
- "main_gpu": 0, # CPU-only
1316
- "device": "cpu", # Explicit CPU
1317
- "lora_base": None, # No LoRA base
1318
- "lora_scale": 1.0, # LoRA scale
1319
- "clpp_k": 0, # No CLPP
1320
- "numa": NUMA_OPTIMIZE, # NUMA if available
1321
- "cfg_scale": 1.0, # No CFG
1322
- "grammar": None, # No grammar constraints
1323
- "chat_format": None, # Auto-detect
1324
- "chat_handler": None, # Default handler
1325
- "cache_prompt": True, # Cache prompts
1326
- "cache_prompt_tokens": 256, # Prompt cache size
1327
- "cache_all": False, # Don't cache all
1328
- "draft_model": None, # No draft model
1329
- "draft_model_n_ctx": 512, # Draft context
1330
- "draft_model_n_gpu_layers": -1, # Auto-detect
1331
- "speculative_max_draft_len": 5, # Speculative decoding
1332
- "speculative_max_top_k": 4, # Speculative top-k
1333
- "speculative_decoding": SPECULATIVE_DECODE, # Enable if available
1334
- "speculative_min_draft_len": 1, # Min draft length
1335
- "speculative_max_top_k": 4, # Max top-k for draft
1336
- "speculative_min_top_k": 1, # Min top-k for draft
1337
- "speculative_max_top_p": 0.95, # Max top-p for draft
1338
- "speculative_min_top_p": 0.1, # Min top-p for draft
1339
- "speculative_max_temp": 1.0, # Max temp for draft
1340
- "speculative_min_temp": 0.1, # Min temp for draft
1341
- "speculative_eta": 0.1, # Eta for draft
1342
- "speculative_tau": 5.0, # Tau for draft
1343
- "speculative_gamma": 1.0, # Gamma for draft
1344
- "speculative_delta": 0.1, # Delta for draft
1345
  }
1346
 
1347
  # Remove None values to avoid llama.cpp errors
 
1261
  # IGNORE user batch size - use conservative settings for reliability
1262
  logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
1263
 
1264
+ # ORIGINAL WORKING SETTINGS WITH HARD-CODED RAM
1265
+ optimal_batch = 512 # ORIGINAL: Working batch size
1266
+ optimal_ctx = 1024 # ORIGINAL: Working context size
1267
  optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
1268
 
1269
+ logger.info(f"[RAM] ORIGINAL: batch={optimal_batch}, ctx={optimal_ctx}")
1270
 
1271
  # Reduce context for Gemma models (they have 131K n_ctx_train)
1272
  if model_format == "gemma":
1273
+ optimal_ctx = 512 # ORIGINAL for Gemma
1274
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
1275
 
1276
  # Apply CPU optimizations before model loading
 
1286
  if is_cached:
1287
  model_cache.preload_cache(path)
1288
 
1289
+ # SIMPLIFIED CPU-ONLY INITIALIZATION FOR RELIABILITY
1290
  init_params = {
1291
  "model_path": path,
1292
  "n_ctx": optimal_ctx,
1293
+ "n_threads": optimal_threads,
1294
+ "n_threads_batch": optimal_threads,
1295
+ "use_mmap": True,
1296
+ "use_mlock": False,
1297
+ "n_batch": optimal_batch,
1298
+ "n_gpu_layers": 0,
 
 
1299
  "verbose": False,
1300
+ "seed": -1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1301
  }
1302
 
1303
  # Remove None values to avoid llama.cpp errors