Spaces:

turtle170
/

ZeroEngine

Running

App Files Community

turtle170 committed on Feb 1

Commit

b91cca5

verified ·

1 Parent(s): 9f35548

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -57

app.py CHANGED Viewed

@@ -1261,16 +1261,16 @@ class ZeroEngine:
                     # IGNORE user batch size - use conservative settings for reliability
                     logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
-                # OPTIMIZED SETTINGS FOR 16GB USABLE RAM
-                optimal_batch = 512  # OPTIMIZED: Good batch size
-                optimal_ctx = 1024   # OPTIMIZED: Good context size
                 optimal_threads = 2   # FIXED: 2 threads for 2 vCPU
-                logger.info(f"[RAM] OPTIMIZED: batch={optimal_batch}, ctx={optimal_ctx}")
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
-                    optimal_ctx = 512  # OPTIMIZED for Gemma
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
                 # Apply CPU optimizations before model loading
@@ -1286,62 +1286,18 @@ class ZeroEngine:
                     if is_cached:
                         model_cache.preload_cache(path)
-                    # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
                     init_params = {
                         "model_path": path,
                         "n_ctx": optimal_ctx,
-                        "n_threads": optimal_threads,                    # Exactly 2 threads
-                        "n_threads_batch": optimal_threads,              # Batch threads = total threads
-                        "use_mmap": USE_MMAP,                           # Memory-mapped I/O
-                        "use_mlock": MLOCK_MODEL,                       # Let OS manage memory
-                        "n_batch": optimal_batch,                       # Optimized batch size
-                        "n_gpu_layers": 0,                            # CPU-only
-                        "rope_scaling_type": 0,
-                        "rope_freq_scale": ROPE_SCALING,
                         "verbose": False,
-                        "logits_all": False,                           # Only final logits
-                        "embedding": False,                            # No embeddings
-                        "f16_kv": False,                              # Quantized KV cache
-                        # ULTRA AGGRESSIVE SPEED OPTIMIZATIONS
-                        "type_k": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
-                        "type_v": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
-                        "offload_kqv": OFFLOAD_KQV,
-                        "flash_attn": FLASH_ATTENTION,
-                        "use_scratch": True,                           # Use scratch buffer
-                        "no_kv_offload": True,                         # Keep KV in RAM
-                        "num_experts_used": 0,                         # No MoE for CPU
-                        "seed": -1,                                   # Random seed
-                        "tensor_split": None,                          # No tensor splitting
-                        "main_gpu": 0,                                # CPU-only
-                        "device": "cpu",                              # Explicit CPU
-                        "lora_base": None,                            # No LoRA base
-                        "lora_scale": 1.0,                            # LoRA scale
-                        "clpp_k": 0,                                 # No CLPP
-                        "numa": NUMA_OPTIMIZE,                        # NUMA if available
-                        "cfg_scale": 1.0,                            # No CFG
-                        "grammar": None,                              # No grammar constraints
-                        "chat_format": None,                          # Auto-detect
-                        "chat_handler": None,                         # Default handler
-                        "cache_prompt": True,                         # Cache prompts
-                        "cache_prompt_tokens": 256,                   # Prompt cache size
-                        "cache_all": False,                           # Don't cache all
-                        "draft_model": None,                          # No draft model
-                        "draft_model_n_ctx": 512,                     # Draft context
-                        "draft_model_n_gpu_layers": -1,                # Auto-detect
-                        "speculative_max_draft_len": 5,                # Speculative decoding
-                        "speculative_max_top_k": 4,                   # Speculative top-k
-                        "speculative_decoding": SPECULATIVE_DECODE,   # Enable if available
-                        "speculative_min_draft_len": 1,                # Min draft length
-                        "speculative_max_top_k": 4,                   # Max top-k for draft
-                        "speculative_min_top_k": 1,                   # Min top-k for draft
-                        "speculative_max_top_p": 0.95,                 # Max top-p for draft
-                        "speculative_min_top_p": 0.1,                  # Min top-p for draft
-                        "speculative_max_temp": 1.0,                  # Max temp for draft
-                        "speculative_min_temp": 0.1,                  # Min temp for draft
-                        "speculative_eta": 0.1,                       # Eta for draft
-                        "speculative_tau": 5.0,                       # Tau for draft
-                        "speculative_gamma": 1.0,                     # Gamma for draft
-                        "speculative_delta": 0.1,                     # Delta for draft
                     }
                     # Remove None values to avoid llama.cpp errors

                     # IGNORE user batch size - use conservative settings for reliability
                     logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
+                # ORIGINAL WORKING SETTINGS WITH HARD-CODED RAM
+                optimal_batch = 512  # ORIGINAL: Working batch size
+                optimal_ctx = 1024   # ORIGINAL: Working context size
                 optimal_threads = 2   # FIXED: 2 threads for 2 vCPU
+                logger.info(f"[RAM] ORIGINAL: batch={optimal_batch}, ctx={optimal_ctx}")
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
+                    optimal_ctx = 512  # ORIGINAL for Gemma
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
                 # Apply CPU optimizations before model loading
                     if is_cached:
                         model_cache.preload_cache(path)
+                    # SIMPLIFIED CPU-ONLY INITIALIZATION FOR RELIABILITY
                     init_params = {
                         "model_path": path,
                         "n_ctx": optimal_ctx,
+                        "n_threads": optimal_threads,
+                        "n_threads_batch": optimal_threads,
+                        "use_mmap": True,
+                        "use_mlock": False,
+                        "n_batch": optimal_batch,
+                        "n_gpu_layers": 0,
                         "verbose": False,
+                        "seed": -1,
                     }
                     # Remove None values to avoid llama.cpp errors