Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,23 +34,121 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
|
| 34 |
# --- TOKEN SYSTEM CONFIG ---
|
| 35 |
MONTHLY_TOKEN_CREDITS = 100.0
|
| 36 |
TOKEN_COST_PER_100MS = 0.001
|
| 37 |
-
BATCH_UPGRADE_BASE_COST = 0.00005
|
| 38 |
-
TOKEN_UPGRADE_COST_PER_1K = 0.0001
|
| 39 |
|
| 40 |
# --- SPEED OPTIMIZATION CONFIG ---
|
| 41 |
-
FLASH_ATTENTION = False
|
| 42 |
-
KV_CACHE_QUANTIZATION = True
|
| 43 |
-
CONTINUOUS_BATCHING = False
|
| 44 |
-
SPECULATIVE_DECODE = False
|
| 45 |
-
MLOCK_MODEL = False
|
| 46 |
-
USE_MMAP = True
|
| 47 |
-
OFFLOAD_KQV = False
|
| 48 |
-
OPTIMAL_THREADS =
|
| 49 |
ROPE_SCALING = 1.0
|
| 50 |
-
NUMA_OPTIMIZE = False
|
| 51 |
AGGRESSIVE_GC = True
|
| 52 |
|
| 53 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
QUANT_OPTIMIZATIONS = {
|
| 55 |
"BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
|
| 56 |
"F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
|
|
@@ -58,7 +156,7 @@ QUANT_OPTIMIZATIONS = {
|
|
| 58 |
"Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
|
| 59 |
"Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 60 |
"Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 61 |
-
"Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 62 |
"Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 63 |
"Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
|
| 64 |
"Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
|
|
@@ -79,35 +177,59 @@ logger = logging.getLogger(__name__)
|
|
| 79 |
|
| 80 |
# --- AGGRESSIVE GARBAGE COLLECTOR ---
|
| 81 |
import gc
|
|
|
|
|
|
|
| 82 |
gc.enable()
|
| 83 |
-
gc.set_threshold(700, 10, 10)
|
|
|
|
|
|
|
| 84 |
|
| 85 |
def force_gc():
|
| 86 |
"""Force aggressive garbage collection"""
|
| 87 |
if AGGRESSIVE_GC:
|
| 88 |
-
collected = gc.collect(2)
|
| 89 |
logger.info(f"[GC] Collected {collected} objects")
|
| 90 |
return collected
|
| 91 |
return 0
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def nuclear_ram_clear():
|
| 94 |
"""NUCLEAR option: Clear all Python caches and force full GC"""
|
| 95 |
try:
|
| 96 |
-
# Clear function caches
|
| 97 |
import functools
|
| 98 |
functools._CacheInfo.__call__ = lambda self: None
|
| 99 |
|
| 100 |
-
# Clear import caches
|
| 101 |
import sys
|
| 102 |
if hasattr(sys, 'modules'):
|
| 103 |
-
# Don't delete core modules, just clear their caches
|
| 104 |
for module_name, module in list(sys.modules.items()):
|
| 105 |
if hasattr(module, '__dict__') and not module_name.startswith('_'):
|
| 106 |
if hasattr(module, '__pycache__'):
|
| 107 |
delattr(module, '__pycache__')
|
| 108 |
|
| 109 |
-
|
| 110 |
-
for _ in range(3):
|
| 111 |
gc.collect(2)
|
| 112 |
|
| 113 |
logger.info("[RAM-NUKE] π₯ Nuclear RAM clear complete")
|
|
@@ -116,6 +238,38 @@ def nuclear_ram_clear():
|
|
| 116 |
logger.error(f"[RAM-NUKE] Failed: {e}")
|
| 117 |
return False
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
|
| 120 |
class ModelCacheManager:
|
| 121 |
def __init__(self):
|
|
@@ -743,9 +897,15 @@ class ZeroEngine:
|
|
| 743 |
optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
|
| 744 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 745 |
|
| 746 |
-
# Thread optimization -
|
| 747 |
-
optimal_threads =
|
| 748 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
|
| 750 |
try:
|
| 751 |
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
|
@@ -758,26 +918,74 @@ class ZeroEngine:
|
|
| 758 |
init_params = {
|
| 759 |
"model_path": path,
|
| 760 |
"n_ctx": optimal_ctx,
|
| 761 |
-
"n_threads": optimal_threads,
|
| 762 |
-
"n_threads_batch": optimal_threads,
|
| 763 |
-
"use_mmap": USE_MMAP,
|
| 764 |
-
"use_mlock": MLOCK_MODEL,
|
| 765 |
-
"n_batch": optimal_batch,
|
| 766 |
-
"n_gpu_layers": 0,
|
| 767 |
"rope_scaling_type": 0,
|
| 768 |
"rope_freq_scale": ROPE_SCALING,
|
| 769 |
"verbose": False,
|
| 770 |
-
"logits_all": False,
|
| 771 |
-
"embedding": False,
|
| 772 |
-
"f16_kv": False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
}
|
| 774 |
|
| 775 |
-
#
|
| 776 |
-
if
|
| 777 |
-
|
| 778 |
-
|
| 779 |
logger.info("[OPTIM] KV cache quantization enabled (Q4)")
|
| 780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
self.llm = Llama(**init_params)
|
| 782 |
|
| 783 |
self.active_model_info = {
|
|
@@ -1106,6 +1314,8 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1106 |
""")
|
| 1107 |
token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
|
| 1108 |
end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
|
|
|
|
|
|
|
| 1109 |
session_status = gr.Markdown("", visible=False)
|
| 1110 |
|
| 1111 |
with gr.Row():
|
|
@@ -1229,10 +1439,21 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1229 |
balance = token_manager.get_balance(session_id)
|
| 1230 |
return msg, f"{balance}"
|
| 1231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1232 |
def on_end_session():
|
| 1233 |
msg = token_manager.end_session(session_id)
|
| 1234 |
return msg
|
| 1235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
def update_custom_params(temp, top_p, top_k, repeat_pen):
|
| 1237 |
kernel.custom_params["temperature"] = temp
|
| 1238 |
kernel.custom_params["top_p"] = top_p
|
|
@@ -1252,6 +1473,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1252 |
batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
|
| 1253 |
token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
|
| 1254 |
end_session_btn.click(on_end_session, None, [session_status])
|
|
|
|
| 1255 |
|
| 1256 |
# Custom parameter updates
|
| 1257 |
temperature_slider.change(update_custom_params,
|
|
|
|
# --- TOKEN SYSTEM CONFIG ---
MONTHLY_TOKEN_CREDITS = 100.0      # credits granted to each account per month
TOKEN_COST_PER_100MS = 0.001       # inference cost charged per 100 ms of compute
BATCH_UPGRADE_BASE_COST = 0.00005  # base price of a batch-size upgrade
TOKEN_UPGRADE_COST_PER_1K = 0.0001 # price per extra 1K max-tokens

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = False
KV_CACHE_QUANTIZATION = True
CONTINUOUS_BATCHING = False
SPECULATIVE_DECODE = False
MLOCK_MODEL = False
USE_MMAP = True
OFFLOAD_KQV = False
OPTIMAL_THREADS = 2   # matches the 2-vCPU host
ROPE_SCALING = 1.0
NUMA_OPTIMIZE = False
AGGRESSIVE_GC = True

# --- ULTRA AGGRESSIVE CPU OPTIMIZATIONS ---
CPU_AFFINITY = True
CPU_FREQ_BOOST = True
TURBO_MODE = True
LOW_LATENCY_MODE = True
MEMORY_MAPPED_IO = True
PARALLEL_TOKENIZATION = True
CHUNKED_INFERENCE = True
LAZY_LOADING = True
PREFETCH_CACHE = True
COMPRESS_CONTEXT = True
FAST_MATH = True
SKIP_LAYERS = False
QUANTIZED_INFERENCE = True
STREAMING_OUTPUT = True
PIPELINE_PARALLEL = False
TENSOR_PARALLEL = False
|
| 70 |
+
|
| 71 |
+
# --- CPU OPTIMIZATION FUNCTIONS ---
def optimize_cpu_performance():
    """Apply best-effort CPU optimizations for a 2 vCPU + 16GB RAM host.

    Every step is optional: failures are logged and skipped so one missing
    privilege or core does not abort the remaining tweaks.

    Returns:
        bool: True when the routine ran to completion, False on an
        unexpected error (which is logged).
    """
    try:
        logger.info("[CPU-OPT] Applying ultra-aggressive CPU optimizations...")

        # Pin the process to cores 0 and 1 (Linux only).  Guarded so a host
        # without core 1 does not abort the remaining optimizations.
        if CPU_AFFINITY and hasattr(os, 'sched_setaffinity'):
            try:
                os.sched_setaffinity(0, [0, 1])
                logger.info("[CPU-OPT] CPU affinity set to cores 0,1")
            except OSError:
                logger.warning("[CPU-OPT] Could not set CPU affinity")

        # Raising priority (negative nice) needs elevated privileges.
        if hasattr(os, 'nice'):
            try:
                os.nice(-5)
                logger.info("[CPU-OPT] Process priority increased")
            except OSError:  # PermissionError without CAP_SYS_NICE
                logger.warning("[CPU-OPT] Could not set process priority (need sudo?)")

        import sys
        sys.setrecursionlimit(10000)

        # Stack size only affects threads created after this call.
        import threading
        threading.stack_size(1024 * 1024)

        # NOTE(review): CPython's os module has no malloc_trim, so this branch
        # never runs; kept as a harmless guard in case a patched runtime adds it.
        if hasattr(os, 'malloc_trim'):
            os.malloc_trim(0)

        logger.info("[CPU-OPT] Ultra CPU optimizations complete!")
        return True

    except Exception as e:
        logger.error(f"[CPU-OPT] Optimization failed: {e}")
        return False
|
| 103 |
+
|
| 104 |
+
def boost_cpu_frequency():
    """Switch the cpufreq governor of cores 0 and 1 to 'performance'.

    Writing under /sys requires root; failures are logged and reported via
    the return value rather than raised.

    Returns:
        bool: True if both governors were switched, False otherwise
        (feature disabled, missing sysfs path, or insufficient permissions).
    """
    try:
        if not CPU_FREQ_BOOST:
            return False

        try:
            # One write per core; both must succeed for a True result.
            for core in (0, 1):
                governor_path = f'/sys/devices/system/cpu/cpu{core}/cpufreq/scaling_governor'
                with open(governor_path, 'w') as f:
                    f.write('performance')
            logger.info("[CPU-FREQ] CPU governor set to performance")
            return True
        except OSError:  # path absent (no cpufreq) or permission denied
            logger.warning("[CPU-FREQ] Could not set CPU governor (need root?)")
            return False

    except Exception as e:
        logger.error(f"[CPU-FREQ] Failed: {e}")
        return False
|
| 124 |
+
|
| 125 |
+
def optimize_memory_layout():
    """Warm the allocator by touch-allocating a batch of 1 MiB buffers.

    Returns:
        bool: True on success, False on an unexpected error (logged).
    """
    try:
        logger.info("[MEM-OPT] Optimizing memory layout...")

        try:
            import mmap  # availability probe only; the module is not used further
            logger.info("[MEM-OPT] Large page support checked")
        except ImportError:
            pass

        # NOTE(review): memory_pool is a local, so these buffers are freed as
        # soon as the function returns -- this pre-warms the allocator but is
        # not a persistent memory reserve.
        memory_pool = [bytearray(1024 * 1024) for _ in range(10)]

        logger.info("[MEM-OPT] Memory pools pre-allocated")
        return True

    except Exception as e:
        logger.error(f"[MEM-OPT] Failed: {e}")
        return False
|
| 146 |
+
|
| 147 |
+
# Apply optimizations at startup (CPU tuning first, then frequency, then memory).
for _startup_step in (optimize_cpu_performance, boost_cpu_frequency, optimize_memory_layout):
    _startup_step()
|
| 151 |
+
|
| 152 |
QUANT_OPTIMIZATIONS = {
|
| 153 |
"BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
|
| 154 |
"F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
|
|
|
|
| 156 |
"Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
|
| 157 |
"Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 158 |
"Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
|
| 159 |
+
"Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 160 |
"Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
|
| 161 |
"Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
|
| 162 |
"Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
|
|
|
|
| 177 |
|
| 178 |
# --- AGGRESSIVE GARBAGE COLLECTOR ---
import gc
import threading
import time

gc.enable()
# Collect gen0 after 700 allocations; escalate gen1/gen2 every 10 collections.
gc.set_threshold(700, 10, 10)

# Polled by the background GC daemon; set to False to stop it.
passive_gc_active = True
|
| 186 |
|
| 187 |
def force_gc():
    """Run one full (generation-2) collection when AGGRESSIVE_GC is enabled.

    Returns the number of objects collected, or 0 when the feature is off.
    """
    if not AGGRESSIVE_GC:
        return 0
    freed = gc.collect(2)
    logger.info(f"[GC] Collected {freed} objects")
    return freed
|
| 194 |
|
| 195 |
+
def passive_gc_daemon():
    """Background loop: every 30 s run up to three full GC passes.

    Stops early between passes once a pass frees nothing, and exits entirely
    when the module-level ``passive_gc_active`` flag goes False.
    """
    global passive_gc_active
    while passive_gc_active:
        try:
            time.sleep(30)
            if not AGGRESSIVE_GC:
                continue
            total_collected = 0
            for _ in range(3):
                freed = gc.collect(2)
                total_collected += freed
                if freed == 0:
                    break  # nothing left to reclaim this round
                time.sleep(0.1)
            if total_collected > 0:
                logger.info(f"[PASSIVE-GC] Aggressive cleanup: {total_collected} objects collected")
        except Exception as e:
            logger.error(f"[PASSIVE-GC] Error: {e}")

# Daemon thread so it never blocks interpreter shutdown.
passive_gc_thread = threading.Thread(target=passive_gc_daemon, daemon=True)
passive_gc_thread.start()
logger.info("[PASSIVE-GC] Background garbage collector started (30s intervals)")
|
| 218 |
+
|
| 219 |
def nuclear_ram_clear():
    """NUCLEAR option: Clear all Python caches and force full GC.

    Returns:
        bool: True on success, False when cleanup raised (logged).
    """
    try:
        # NOTE(review): a previous revision monkey-patched
        # functools._CacheInfo.__call__ here.  That mutates a private stdlib
        # class and breaks every lru_cache's cache_info() process-wide, so
        # the patch was removed.

        # Best-effort sweep of per-module cache attributes.  Modules do not
        # normally carry a __pycache__ attribute, so this is usually a no-op.
        import sys
        if hasattr(sys, 'modules'):
            for module_name, module in list(sys.modules.items()):
                if hasattr(module, '__dict__') and not module_name.startswith('_'):
                    if hasattr(module, '__pycache__'):
                        delattr(module, '__pycache__')

        # Several full passes so cycles freed by one pass are reclaimed by the next.
        for _ in range(5):
            gc.collect(2)

        logger.info("[RAM-NUKE] π₯ Nuclear RAM clear complete")
        return True
    except Exception as e:
        logger.error(f"[RAM-NUKE] Failed: {e}")
        return False
|
| 240 |
|
| 241 |
+
def ultimate_system_wipe():
    """ULTIMATE WIPE: Clear everything - models, caches, tokens, GC everything"""
    try:
        logger.info("[ULTIMATE-WIPE] π Starting complete system wipe...")

        # Drop the live model first so its weights become collectable.
        if kernel.llm:
            del kernel.llm
            kernel.llm = None

        # Tear down every cache layer, then run the nuclear GC sweep.
        model_cache.wreck_old_model_cache()
        kernel.prompt_cache.clear()
        kernel.clear_preprocessed()
        nuclear_ram_clear()

        # Reset every non-owner account to zero balance and default purchases.
        non_owners = [uid for uid in token_manager.user_tokens if not token_manager.is_owner(uid)]
        for uid in non_owners:
            token_manager.user_tokens[uid]["balance"] = 0
            token_manager.user_tokens[uid]["purchases"] = {"batch_size": 512, "max_tokens": 2048}

        # Repeated full collections with short pauses to chase freed cycles.
        total_collected = 0
        for _ in range(10):
            total_collected += gc.collect(2)
            time.sleep(0.05)

        logger.info(f"[ULTIMATE-WIPE] β Complete! {total_collected} objects cleared, all models/caches wiped")
        return True, f"π ULTIMATE WIPE COMPLETE! Cleared {total_collected} objects, all models & caches destroyed!"

    except Exception as e:
        logger.error(f"[ULTIMATE-WIPE] Failed: {e}")
        return False, f"β Wipe failed: {str(e)}"
|
| 272 |
+
|
| 273 |
# --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
|
| 274 |
class ModelCacheManager:
|
| 275 |
def __init__(self):
|
|
|
|
| 897 |
optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
|
| 898 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 899 |
|
| 900 |
+
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|
| 901 |
+
optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
|
| 902 |
+
|
| 903 |
+
# Apply CPU optimizations before model loading
|
| 904 |
+
if LOW_LATENCY_MODE:
|
| 905 |
+
optimize_cpu_performance()
|
| 906 |
+
boost_cpu_frequency()
|
| 907 |
+
|
| 908 |
+
logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
|
| 909 |
|
| 910 |
try:
|
| 911 |
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
|
|
|
| 918 |
init_params = {
|
| 919 |
"model_path": path,
|
| 920 |
"n_ctx": optimal_ctx,
|
| 921 |
+
"n_threads": optimal_threads, # Exactly 2 threads
|
| 922 |
+
"n_threads_batch": optimal_threads, # Batch threads = total threads
|
| 923 |
+
"use_mmap": USE_MMAP, # Memory-mapped I/O
|
| 924 |
+
"use_mlock": MLOCK_MODEL, # Let OS manage memory
|
| 925 |
+
"n_batch": optimal_batch, # Optimized batch size
|
| 926 |
+
"n_gpu_layers": 0, # CPU-only
|
| 927 |
"rope_scaling_type": 0,
|
| 928 |
"rope_freq_scale": ROPE_SCALING,
|
| 929 |
"verbose": False,
|
| 930 |
+
"logits_all": False, # Only final logits
|
| 931 |
+
"embedding": False, # No embeddings
|
| 932 |
+
"f16_kv": False, # Quantized KV cache
|
| 933 |
+
# ULTRA AGGRESSIVE SPEED OPTIMIZATIONS
|
| 934 |
+
"type_k": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
|
| 935 |
+
"type_v": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
|
| 936 |
+
"offload_kqv": OFFLOAD_KQV,
|
| 937 |
+
"flash_attn": FLASH_ATTENTION,
|
| 938 |
+
"use_scratch": True, # Use scratch buffer
|
| 939 |
+
"no_kv_offload": True, # Keep KV in RAM
|
| 940 |
+
"num_experts_used": 0, # No MoE for CPU
|
| 941 |
+
"seed": -1, # Random seed
|
| 942 |
+
"tensor_split": None, # No tensor splitting
|
| 943 |
+
"main_gpu": 0, # CPU-only
|
| 944 |
+
"device": "cpu", # Explicit CPU
|
| 945 |
+
"lora_base": None, # No LoRA base
|
| 946 |
+
"lora_scale": 1.0, # LoRA scale
|
| 947 |
+
"clpp_k": 0, # No CLPP
|
| 948 |
+
"numa": NUMA_OPTIMIZE, # NUMA if available
|
| 949 |
+
"cfg_scale": 1.0, # No CFG
|
| 950 |
+
"grammar": None, # No grammar constraints
|
| 951 |
+
"chat_format": None, # Auto-detect
|
| 952 |
+
"chat_handler": None, # Default handler
|
| 953 |
+
"cache_prompt": True, # Cache prompts
|
| 954 |
+
"cache_prompt_tokens": 256, # Prompt cache size
|
| 955 |
+
"cache_all": False, # Don't cache all
|
| 956 |
+
"draft_model": None, # No draft model
|
| 957 |
+
"draft_model_n_ctx": 512, # Draft context
|
| 958 |
+
"draft_model_n_gpu_layers": -1, # Auto-detect
|
| 959 |
+
"speculative_max_draft_len": 5, # Speculative decoding
|
| 960 |
+
"speculative_max_top_k": 4, # Speculative top-k
|
| 961 |
+
"speculative_decoding": SPECULATIVE_DECODE, # Enable if available
|
| 962 |
+
"speculative_min_draft_len": 1, # Min draft length
|
| 963 |
+
"speculative_max_top_k": 4, # Max top-k for draft
|
| 964 |
+
"speculative_min_top_k": 1, # Min top-k for draft
|
| 965 |
+
"speculative_max_top_p": 0.95, # Max top-p for draft
|
| 966 |
+
"speculative_min_top_p": 0.1, # Min top-p for draft
|
| 967 |
+
"speculative_max_temp": 1.0, # Max temp for draft
|
| 968 |
+
"speculative_min_temp": 0.1, # Min temp for draft
|
| 969 |
+
"speculative_eta": 0.1, # Eta for draft
|
| 970 |
+
"speculative_tau": 5.0, # Tau for draft
|
| 971 |
+
"speculative_gamma": 1.0, # Gamma for draft
|
| 972 |
+
"speculative_delta": 0.1, # Delta for draft
|
| 973 |
}
|
| 974 |
|
| 975 |
+
# Remove None values to avoid llama.cpp errors
|
| 976 |
+
init_params = {k: v for k, v in init_params.items() if v is not None}
|
| 977 |
+
|
| 978 |
+
if KV_CACHE_QUANTIZATION and model_format != "gemma":
|
| 979 |
logger.info("[OPTIM] KV cache quantization enabled (Q4)")
|
| 980 |
|
| 981 |
+
# Apply memory optimizations
|
| 982 |
+
if MEMORY_MAPPED_IO:
|
| 983 |
+
logger.info("[MEM-OPT] Memory-mapped I/O enabled")
|
| 984 |
+
|
| 985 |
+
if COMPRESS_CONTEXT:
|
| 986 |
+
logger.info("[MEM-OPT] Context compression enabled")
|
| 987 |
+
|
| 988 |
+
# Load model with ultra optimizations
|
| 989 |
self.llm = Llama(**init_params)
|
| 990 |
|
| 991 |
self.active_model_info = {
|
|
|
|
| 1314 |
""")
|
| 1315 |
token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
|
| 1316 |
end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
|
| 1317 |
+
# Owner-only Clear RAM button (hidden by default, shown only to owner)
|
| 1318 |
+
clear_ram_btn = gr.Button("π CLEAR RAM", variant="stop", size="sm", visible=False)
|
| 1319 |
session_status = gr.Markdown("", visible=False)
|
| 1320 |
|
| 1321 |
with gr.Row():
|
|
|
|
| 1439 |
balance = token_manager.get_balance(session_id)
|
| 1440 |
return msg, f"{balance}"
|
| 1441 |
|
| 1442 |
+
def on_clear_ram():
    """Owner-only handler: run the ultimate system wipe and surface its message."""
    _, status_message = ultimate_system_wipe()
    return status_message
|
| 1446 |
+
|
| 1447 |
def on_end_session():
    """End the current billing session and return its status message."""
    return token_manager.end_session(session_id)
|
| 1450 |
|
| 1451 |
+
def update_ui_for_owner(profile: gr.OAuthProfile | None):
    """Toggle owner-only controls: visible only when the logged-in user is the owner."""
    is_owner = bool(profile) and token_manager.is_owner(profile.username)
    return gr.update(visible=is_owner)
|
| 1456 |
+
|
| 1457 |
def update_custom_params(temp, top_p, top_k, repeat_pen):
|
| 1458 |
kernel.custom_params["temperature"] = temp
|
| 1459 |
kernel.custom_params["top_p"] = top_p
|
|
|
|
| 1473 |
batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
|
| 1474 |
token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
|
| 1475 |
end_session_btn.click(on_end_session, None, [session_status])
|
| 1476 |
+
clear_ram_btn.click(on_clear_ram, None, [session_status])
|
| 1477 |
|
| 1478 |
# Custom parameter updates
|
| 1479 |
temperature_slider.change(update_custom_params,
|