Spaces:

turtle170
/

ZeroEngine

Running

App Files Community

turtle170 committed on Jan 31

Commit

75bc536

verified ·

1 Parent(s): dccc10e

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -565

app.py CHANGED Viewed

@@ -31,47 +31,32 @@ SYSTEM_RESERVE_MB = 500
 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
-# --- TOKEN SYSTEM CONFIG ---
-MONTHLY_TOKEN_CREDITS = 100.0
-TOKEN_COST_PER_100MS = 0.001
-BATCH_UPGRADE_BASE_COST = 0.00005  # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
-TOKEN_UPGRADE_COST_PER_1K = 0.0001  # Cost per 1000 extra tokens
 # --- SPEED OPTIMIZATION CONFIG ---
-FLASH_ATTENTION = False         # Disabled for CPU (GPU-only feature)
-KV_CACHE_QUANTIZATION = True    # Keep for RAM savings
-CONTINUOUS_BATCHING = False     # CPU doesn't benefit much
-SPECULATIVE_DECODE = False      # CPU-only, no draft model
-MLOCK_MODEL = False             # Don't lock - allow OS to manage memory
-USE_MMAP = True                 # Critical for CPU - fast loading
-OFFLOAD_KQV = False             # CPU-only
-OPTIMAL_THREADS = psutil.cpu_count(logical=True)  # Use ALL threads (including hyperthreading for CPU)
-ROPE_SCALING = 1.0
-NUMA_OPTIMIZE = False           # Disabled - can cause issues on some systems
-AGGRESSIVE_GC = True
-# Quantization detection - CPU-optimized batch multipliers (more aggressive)
 QUANT_OPTIMIZATIONS = {
-    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
-    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
-    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
-    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
-    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
-    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
-    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},  # MASSIVE for CPU
-    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
-    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
-    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
-    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
-}
-# Model format/architecture detection patterns
-MODEL_FORMATS = {
-    "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
-    "gemma": {"pattern": ["gemma"], "template": "gemma"},
-    "phi": {"pattern": ["phi"], "template": "phi"},
-    "qwen": {"pattern": ["qwen"], "template": "chatml"},
-    "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
 }
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -222,212 +207,6 @@ class ModelCacheManager:
             logger.error(f"[WRECKER] Failed: {e}")
             return False
-# --- TOKEN MANAGER ---
-class TokenManager:
-    def __init__(self):
-        self.user_tokens = {}  # {username: {"balance": float, "start_time": float, "purchases": {}}}
-        self.owner_username = "turtle170"  # Owner gets infinite tokens
-    def is_owner(self, username: str) -> bool:
-        """Check if user is the owner"""
-        if not username:
-            return False
-        return username.lower() == self.owner_username.lower()
-    def initialize_user(self, username: str):
-        """Initialize new user with monthly credits (or infinite for owner)"""
-        if not username:
-            username = "anonymous"
-        if username not in self.user_tokens:
-            # Owner gets infinite tokens
-            if self.is_owner(username):
-                self.user_tokens[username] = {
-                    "balance": float('inf'),
-                    "start_time": time.time(),
-                    "purchases": {"batch_multiplier": 1, "token_limit": 2048},
-                    "total_spent": 0.0,
-                    "is_owner": True,
-                    "username": username
-                }
-                logger.info(f"[TOKEN] 👑 OWNER {username} initialized with INFINITE tokens!")
-            else:
-                self.user_tokens[username] = {
-                    "balance": MONTHLY_TOKEN_CREDITS,
-                    "start_time": time.time(),
-                    "purchases": {"batch_multiplier": 1, "token_limit": 2048},
-                    "total_spent": 0.0,
-                    "is_owner": False,
-                    "username": username,
-                    "last_reset": time.time()
-                }
-                logger.info(f"[TOKEN] New user {username}: {MONTHLY_TOKEN_CREDITS} tokens")
-    def check_monthly_reset(self, username: str):
-        """Reset tokens if a month has passed"""
-        if not username or username not in self.user_tokens:
-            return
-        if self.user_tokens[username].get("is_owner", False):
-            return  # Owner never needs reset
-        last_reset = self.user_tokens[username].get("last_reset", time.time())
-        month_in_seconds = 30 * 24 * 60 * 60  # 30 days
-        if time.time() - last_reset > month_in_seconds:
-            self.user_tokens[username]["balance"] = MONTHLY_TOKEN_CREDITS
-            self.user_tokens[username]["last_reset"] = time.time()
-            self.user_tokens[username]["total_spent"] = 0.0
-            logger.info(f"[TOKEN] Monthly reset for {username}: {MONTHLY_TOKEN_CREDITS} tokens")
-    def charge_usage(self, username: str, duration_ms: float) -> bool:
-        """Charge user for inference time. Returns True if successful. Owner never charged."""
-        if not username:
-            username = "anonymous"
-        self.initialize_user(username)
-        self.check_monthly_reset(username)
-        # Owner never gets charged
-        if self.user_tokens[username].get("is_owner", False):
-            return True
-        cost = (duration_ms / 100.0) * TOKEN_COST_PER_100MS
-        # Check if user has enough balance
-        if self.user_tokens[username]["balance"] <= 0:
-            logger.warning(f"[TOKEN] ❌ {username} has 0 tokens! Access denied.")
-            return False
-        if self.user_tokens[username]["balance"] >= cost:
-            self.user_tokens[username]["balance"] -= cost
-            self.user_tokens[username]["balance"] = max(0, self.user_tokens[username]["balance"])  # Never go below 0
-            self.user_tokens[username]["total_spent"] += cost
-            logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {self.user_tokens[username]['balance']:.2f}")
-            return True
-        else:
-            # Insufficient balance - set to 0 and deny
-            self.user_tokens[username]["balance"] = 0
-            logger.warning(f"[TOKEN] ❌ Insufficient balance! {username} now at 0 tokens.")
-            return False
-    def can_use_engine(self, username: str) -> tuple:
-        """Check if user can use the engine. Returns (bool, message)"""
-        if not username:
-            username = "anonymous"
-        self.initialize_user(username)
-        self.check_monthly_reset(username)
-        if self.user_tokens[username].get("is_owner", False):
-            return True, "👑 Owner access granted"
-        balance = self.user_tokens[username]["balance"]
-        if balance <= 0:
-            last_reset = self.user_tokens[username].get("last_reset", time.time())
-            time_until_reset = 30 * 24 * 60 * 60 - (time.time() - last_reset)
-            days_left = int(time_until_reset / (24 * 60 * 60))
-            return False, f"❌ Out of tokens! Resets in {days_left} days. Current balance: 0.00"
-        return True, f"✅ Access granted. Balance: {balance:.2f} tokens"
-    def purchase_batch_upgrade(self, username: str) -> tuple:
-        """Purchase batch size upgrade (exponential cost). Free for owner."""
-        if not username:
-            return False, "❌ Please login first"
-        self.initialize_user(username)
-        # Owner gets free upgrades
-        if self.user_tokens[username].get("is_owner", False):
-            current_mult = self.user_tokens[username]["purchases"]["batch_multiplier"]
-            self.user_tokens[username]["purchases"]["batch_multiplier"] = current_mult * 2
-            new_mult = current_mult * 2
-            logger.info(f"[TOKEN] 👑 OWNER free batch upgrade: {current_mult}x → {new_mult}x")
-            return True, f"👑 FREE UPGRADE! Batch now {new_mult}x!"
-        current_mult = self.user_tokens[username]["purchases"]["batch_multiplier"]
-        upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
-        cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)
-        if self.user_tokens[username]["balance"] >= cost:
-            self.user_tokens[username]["balance"] -= cost
-            self.user_tokens[username]["purchases"]["batch_multiplier"] = current_mult * 2
-            new_mult = current_mult * 2
-            logger.info(f"[TOKEN] Batch upgrade: {current_mult}x → {new_mult}x | Cost: {cost:.5f}")
-            return True, f"✅ Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"
-        else:
-            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[username]['balance']:.2f}"
-    def purchase_token_upgrade(self, username: str, extra_tokens: int = 1000) -> tuple:
-        """Purchase extra response token length. Free for owner."""
-        if not username:
-            return False, "❌ Please login first"
-        self.initialize_user(username)
-        # Owner gets free upgrades
-        if self.user_tokens[username].get("is_owner", False):
-            self.user_tokens[username]["purchases"]["token_limit"] += extra_tokens
-            new_limit = self.user_tokens[username]["purchases"]["token_limit"]
-            logger.info(f"[TOKEN] 👑 OWNER free token upgrade: +{extra_tokens} tokens")
-            return True, f"👑 FREE UPGRADE! Token limit now {new_limit}!"
-        cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K
-        if self.user_tokens[username]["balance"] >= cost:
-            self.user_tokens[username]["balance"] -= cost
-            self.user_tokens[username]["purchases"]["token_limit"] += extra_tokens
-            new_limit = self.user_tokens[username]["purchases"]["token_limit"]
-            logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
-            return True, f"✅ Token limit now {new_limit}! (-{cost:.5f} tokens)"
-        else:
-            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[username]['balance']:.2f}"
-    def get_balance(self, username: str) -> float:
-        """Get user's current token balance"""
-        if not username:
-            username = "anonymous"
-        self.initialize_user(username)
-        self.check_monthly_reset(username)
-        balance = self.user_tokens[username]["balance"]
-        # Show ∞ for owner
-        if balance == float('inf'):
-            return balance
-        return round(max(0, balance), 2)  # Never show negative
-    def get_purchases(self, username: str) -> dict:
-        """Get user's current purchases"""
-        if not username:
-            username = "anonymous"
-        self.initialize_user(username)
-        return self.user_tokens[username]["purchases"]
-    def end_session(self, username: str):
-        """End user session and log stats"""
-        if not username:
-            return "No active session found."
-        if username in self.user_tokens:
-            stats = self.user_tokens[username]
-            if stats.get("is_owner", False):
-                return f"👑 Owner session ended. Welcome back anytime, {stats['username']}!"
-            logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
-            return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
-        return "No active session found."
-# Global token manager
-import math
-token_manager = TokenManager()
 # Global cache manager
 model_cache = ModelCacheManager()
@@ -493,7 +272,7 @@ class ZeroEngine:
         self.api = HfApi(token=HF_TOKEN)
         self.telemetry = TelemetryManager(self.api)
         self.llm: Optional[Llama] = None
-        self.active_model_info = {"repo": "", "file": "", "format": ""}
         self.kernel_lock = threading.Lock()
         self.is_prefilling = False
         self.perf_stats = {
@@ -503,9 +282,9 @@ class ZeroEngine:
             "peak_tps": 0.0,
             "cache_hits": 0
         }
-        self.prompt_cache = {}
         self.last_activity = time.time()
-        self.idle_timeout = 20
         self.auto_cleanup_thread = None
         self.start_idle_monitor()
@@ -514,29 +293,6 @@ class ZeroEngine:
         self.typing_timer = None
         self.preprocessed_tokens = None
-        # Custom parameters (user-configurable)
-        self.custom_params = {
-            "temperature": 0.7,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-            "batch_size_override": None,  # None = auto
-            "max_tokens_override": None   # None = auto
-        }
-    def detect_model_format(self, filename: str, repo: str) -> str:
-        """Auto-detect model format/architecture from filename and repo"""
-        combined = f"{repo.lower()} {filename.lower()}"
-        for format_name, format_info in MODEL_FORMATS.items():
-            for pattern in format_info["pattern"]:
-                if pattern in combined:
-                    logger.info(f"[FORMAT-DETECT] Detected {format_name.upper()} architecture")
-                    return format_name
-        logger.warning(f"[FORMAT-DETECT] Unknown format, defaulting to llama")
-        return "llama"
     def detect_quantization(self, filename: str) -> dict:
         """Detect quantization method from filename and return optimizations"""
         filename_upper = filename.upper()
@@ -633,158 +389,7 @@ class ZeroEngine:
             logger.error(f"Scan error: {e}")
             return []
-    def boot_kernel(self, repo: str, filename: str, session_id: str = None) -> str:
-        """HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
-        try:
-            if not repo or not filename:
-                return "🔴 ERROR: Repository or filename missing"
-            logger.info(f"[BOOT] Starting download: {filename} from {repo}")
-            # DETECT QUANTIZATION FROM FILENAME
-            quant_config = self.detect_quantization(filename)
-            # DETECT MODEL FORMAT/ARCHITECTURE
-            model_format = self.detect_model_format(filename, repo)
-            # Download with timeout protection
-            try:
-                path = hf_hub_download(
-                    repo_id=repo,
-                    filename=filename,
-                    token=HF_TOKEN,
-                    local_files_only=False
-                )
-                logger.info(f"[BOOT] Download complete: {path}")
-            except Exception as e:
-                logger.error(f"[BOOT] Download failed: {e}")
-                return f"🔴 DOWNLOAD FAILED: {str(e)}"
-            # Check if model is cached
-            is_cached = model_cache.is_cached(path)
-            cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
-            # Validate before loading
-            valid, msg = ResourceMonitor.validate_deployment(path)
-            if not valid:
-                logger.warning(f"[BOOT] Validation failed: {msg}")
-                return f"🔴 VALIDATION FAILED: {msg}"
-            logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
-            # Load model with MAXIMUM PERFORMANCE SETTINGS
-            with self.kernel_lock:
-                # WRECK OLD MODEL
-                if self.llm:
-                    logger.info("[BOOT] 💣 WRECKING old model...")
-                    try:
-                        model_cache.wreck_old_model_cache()
-                        del self.llm
-                        self.llm = None
-                        nuclear_ram_clear()
-                        logger.info("[BOOT] ✅ Old model DESTROYED")
-                    except Exception as e:
-                        logger.warning(f"[BOOT] Cleanup warning: {e}")
-                # Calculate optimal parameters with token purchases
-                vm = psutil.virtual_memory()
-                available_ram_gb = vm.available / (1024**3)
-                # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
-                # Base calculation: use more RAM for batching on CPU
-                base_batch = int(512 * available_ram_gb / 8)  # More aggressive base
-                optimal_batch = int(base_batch * quant_config["batch_multiplier"])
-                # Apply user's batch multiplier from token purchases
-                if session_id:
-                    user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
-                    optimal_batch = int(optimal_batch * user_batch_mult)
-                    logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")
-                # CPU can handle larger batches with quantized models
-                optimal_batch = max(1024, min(8192, optimal_batch))  # 1024-8192 range for CPU
-                # Context size
-                optimal_ctx = quant_config["ctx_size"]
-                # Reduce context for Gemma models (they have 131K n_ctx_train)
-                if model_format == "gemma":
-                    optimal_ctx = min(8192, optimal_ctx)  # Gemma works better with lower ctx
-                    logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
-                # Thread optimization - use ALL threads on CPU (including hyperthreading)
-                optimal_threads = psutil.cpu_count(logical=True)  # ALL logical cores
-                logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")
-                try:
-                    logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
-                    # Preload cache if available
-                    if is_cached:
-                        model_cache.preload_cache(path)
-                    # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
-                    init_params = {
-                        "model_path": path,
-                        "n_ctx": optimal_ctx,
-                        "n_threads": optimal_threads,
-                        "n_threads_batch": optimal_threads,
-                        "use_mmap": USE_MMAP,              # Critical for CPU
-                        "use_mlock": MLOCK_MODEL,          # Let OS manage memory
-                        "n_batch": optimal_batch,          # MASSIVE batches for CPU
-                        "n_gpu_layers": 0,                 # CPU-only
-                        "rope_scaling_type": 0,
-                        "rope_freq_scale": ROPE_SCALING,
-                        "verbose": False,
-                        "logits_all": False,
-                        "embedding": False,
-                        "f16_kv": False                    # Use quantized KV cache
-                    }
-                    # Add KV quantization only if not Gemma (Gemma can be finicky)
-                    if model_format != "gemma" and KV_CACHE_QUANTIZATION:
-                        init_params["type_k"] = 2
-                        init_params["type_v"] = 2
-                        logger.info("[OPTIM] KV cache quantization enabled (Q4)")
-                    self.llm = Llama(**init_params)
-                    self.active_model_info = {
-                        "repo": repo,
-                        "file": filename,
-                        "quant": quant_config['type'],
-                        "format": model_format
-                    }
-                    self.telemetry.track_load(repo, filename)
-                    # Extract and cache signature
-                    if not is_cached:
-                        logger.info("[BOOT] Extracting cache signature...")
-                        signature = model_cache.extract_cache_signature(path)
-                        if signature:
-                            model_cache.save_to_cache(path, signature)
-                    # Warm-up
-                    logger.info("[BOOT] Warming up model caches...")
-                    try:
-                        self.llm("Warmup", max_tokens=1, stream=False)
-                        force_gc()
-                    except:
-                        pass
-                    logger.info("[BOOT] 🚀 CPU-OPTIMIZED MODEL READY!")
-                    return f"🟢 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"
-                except Exception as e:
-                    logger.error(f"[BOOT] Model loading failed: {e}")
-                    self.llm = None
-                    nuclear_ram_clear()
-                    return f"🔴 LOAD FAILED: {str(e)}"
-        except Exception as e:
-            logger.error(f"[BOOT] Unexpected error: {e}")
-            nuclear_ram_clear()
-            return f"🔴 BOOT FAILURE: {str(e)}"
         """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
         try:
             if not repo or not filename:
@@ -942,7 +547,7 @@ class ZeroEngine:
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Primed"
-    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, username: str) -> Generator:
         # Update activity timestamp
         self.update_activity()
@@ -995,28 +600,23 @@ class ZeroEngine:
         first_token_time = None
         try:
-            # Get max tokens from user purchases
-            max_tokens = 2048
-            if username:
-                max_tokens = token_manager.get_purchases(username)["token_limit"]
-            # HYPER-OPTIMIZED CPU INFERENCE SETTINGS
             stream = self.llm(
                 formatted_prompt,
-                max_tokens=max_tokens,
                 stop=["User:", "<|eot_id|>", "\n\n"],
                 stream=True,
-                temperature=self.custom_params["temperature"],
-                top_p=self.custom_params["top_p"],
-                top_k=self.custom_params["top_k"],
-                repeat_penalty=self.custom_params["repeat_penalty"],
-                frequency_penalty=0.0,
-                presence_penalty=0.0,
-                tfs_z=1.0,
-                typical_p=1.0,
-                mirostat_mode=2,                    # CPU benefits from mirostat
-                mirostat_tau=5.0,
-                mirostat_eta=0.1,
             )
             for chunk in stream:
@@ -1036,19 +636,10 @@ class ZeroEngine:
                 if tps > self.perf_stats["peak_tps"]:
                     self.perf_stats["peak_tps"] = tps
-                # Charge tokens every second
-                if int(elapsed * 1000) % 1000 < 100 and username:  # Every ~1 second
-                    token_manager.charge_usage(username, elapsed * 1000)
                 # Update history with streaming content + performance metrics
-                balance = token_manager.get_balance(username) if username else 0
-                history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💰 {balance:.2f} tokens`"
                 yield history
-            # Final token charge for remaining time
-            if username:
-                token_manager.charge_usage(username, elapsed * 1000)
             # Update global performance stats
             self.perf_stats["total_tokens"] += tokens_count
             self.perf_stats["total_time"] += elapsed
@@ -1172,49 +763,27 @@ h1, h2, h3, h4, h5, h6 {
 # --- UI INTERFACE ---
 kernel = ZeroEngine()
-# Session ID for token tracking
-username = token_manager.get_username()
-with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
-    # Header with Token Display
-    with gr.Row():
-        with gr.Column(scale=8):
-            gr.HTML("""
-                <div style='text-align: center; padding: 30px; border-radius: 24px;
-                            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
-                            margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
-                    <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
-                               -webkit-background-clip: text; -webkit-text-fill-color: transparent;
-                               font-family: Consolas, monospace;'>
-                        🛰️ ZEROENGINE V0.2
-                    </h1>
-                    <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
-                        CPU-Optimized | Token System | Custom Parameters | Auto-Format
-                    </p>
-                </div>
-            """)
-        with gr.Column(scale=2):
-            # Token Display
-            gr.HTML("""
-                <div style='text-align: center; padding: 20px; border-radius: 20px;
-                            background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
-                            margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
-                    <div style='font-size: 2em; margin-bottom: 5px;'>💰</div>
-                    <div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
-                        100.00
-                    </div>
-                    <div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
-                </div>
-            """)
-            token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
-            end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
-            session_status = gr.Markdown("", visible=False)
     with gr.Row():
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
                 label="Main Engine Feedback",
-                height=600,
                 show_label=False,
                 autoscroll=True,
                 container=True
@@ -1229,15 +798,12 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
                 )
                 send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
-        with gr.Column(scale=4):
-            # Hardware Status
             gr.Markdown("### 🛠️ Hardware Status")
             ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
             cpu_metric = gr.Label(label="CPU Load", value="0%")
             gr.Markdown("---")
-            # Model Control
             gr.Markdown("### 📡 Model Control")
             repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
             quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
@@ -1249,26 +815,6 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
             boot_status = gr.Markdown("Status: `STANDBY`")
             gr.Markdown("---")
-            # Custom Parameters
-            gr.Markdown("### ⚙️ Custom Parameters")
-            temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
-            top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
-            top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
-            repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
-            gr.Markdown("---")
-            # Token Purchases
-            gr.Markdown("### 💎 Token Upgrades")
-            with gr.Row():
-                batch_upgrade_btn = gr.Button("🚀 Batch x2", size="sm", variant="secondary")
-                token_upgrade_btn = gr.Button("📈 +1K Tokens", size="sm", variant="secondary")
-            purchase_status = gr.Markdown("Ready to upgrade!")
-            gr.Markdown("---")
-            # Ghost Cache
             gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
             ghost_buffer = gr.Textbox(
                 label="Background Context",
@@ -1282,7 +828,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
             log_output = gr.Code(
                 label="Kernel Logs",
                 language="shell",
-                value="[INIT] V0.2 System Ready.",
                 lines=5
             )
@@ -1290,11 +836,9 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
     def update_stats():
         try:
             m = ResourceMonitor.get_metrics()
-            balance = token_manager.get_balance(session_id)
-            return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
         except Exception as e:
             logger.error(f"Stats update error: {e}")
-            return "Error", "Error", "0.00"
             return "Error", "Error"
     def on_scan(repo):
@@ -1320,78 +864,37 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
                 return
             yield "⚙️ System: Initiating boot sequence...", gr.update()
-            time.sleep(0.5)
-            result = kernel.boot_kernel(repo, file, session_id)
             yield result, gr.update()
         except Exception as e:
             logger.error(f"Boot UI error: {e}")
             yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
-    def on_batch_upgrade():
-        success, msg = token_manager.purchase_batch_upgrade(session_id)
-        balance = token_manager.get_balance(session_id)
-        return msg, f"{balance}"
-    def on_token_upgrade():
-        success, msg = token_manager.purchase_token_upgrade(session_id, 1000)
-        balance = token_manager.get_balance(session_id)
-        return msg, f"{balance}"
-    def on_end_session():
-        msg = token_manager.end_session(session_id)
-        return msg
-    def update_custom_params(temp, top_p, top_k, repeat_pen):
-        kernel.custom_params["temperature"] = temp
-        kernel.custom_params["top_p"] = top_p
-        kernel.custom_params["top_k"] = int(top_k)
-        kernel.custom_params["repeat_penalty"] = repeat_pen
-        return "✅ Parameters updated!"
-    # Timer for periodic stats updates (includes token balance)
     timer = gr.Timer(value=2)
-    timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])
     # Event handlers
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
     boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
-    # Token purchases
-    batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
-    token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
-    end_session_btn.click(on_end_session, None, [session_status])
-    # Custom parameter updates
-    temperature_slider.change(update_custom_params,
-                             [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-                             [purchase_status])
-    top_p_slider.change(update_custom_params,
-                        [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-                        [purchase_status])
-    top_k_slider.change(update_custom_params,
-                        [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-                        [purchase_status])
-    repeat_penalty_slider.change(update_custom_params,
-                                 [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-                                 [purchase_status])
-    # Ghost cache
     stitch_btn.click(
         lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
         [ghost_buffer],
         [stitch_status]
     )
-    # Keyboard input preprocessing
     user_input.change(
         lambda x: kernel.preprocess_input(x),
         [user_input],
         None
     )
-    # Auto-boot enabled inference
     inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])
     send_btn.click(kernel.inference_generator, inference_args, [chat_box])

 DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
 DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
 # --- SPEED OPTIMIZATION CONFIG ---
+FLASH_ATTENTION = True          # Enable Flash Attention 2
+KV_CACHE_QUANTIZATION = True    # Quantize KV cache (4-bit)
+CONTINUOUS_BATCHING = True      # Enable continuous batching
+SPECULATIVE_DECODE = False      # Disabled for CPU (requires draft model)
+MLOCK_MODEL = False             # Disabled: mlock would prevent swapping but uses more RAM
+USE_MMAP = True                 # Memory-mapped file loading
+OFFLOAD_KQV = False             # CPU-only, no offload needed
+# psutil.cpu_count(logical=False) returns None when the physical core count
+# cannot be determined, so fall back to the logical count (then 1) before
+# subtracting; the bare subtraction would raise TypeError at import time.
+OPTIMAL_THREADS = max(1, (psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) or 1) - 1)  # Physical cores - 1
+ROPE_SCALING = 1.0              # RoPE frequency scaling
+NUMA_OPTIMIZE = True            # NUMA-aware memory allocation
+AGGRESSIVE_GC = True            # Aggressive garbage collection
+# Quantization detection and optimization mapping
+# Keys are quantization tags as they appear in GGUF file names (DEFAULT_QUANT
+# above contains "Q4_K_M"). Every entry carries the same three tuning values:
+# batch_multiplier, ctx_size and threads_boost.
+# NOTE(review): presumably ctx_size is a context length in tokens and the other
+# two scale a base batch size / thread count -- confirm in detect_quantization
+# and boot_kernel, which are not fully visible here.
 QUANT_OPTIMIZATIONS = {
+    "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
+    "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
+    "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
+    "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
+    "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
+    "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
+    "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
+    "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
+    "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
+    "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
+    "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
 }
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
             logger.error(f"[WRECKER] Failed: {e}")
             return False
 # Global cache manager
 model_cache = ModelCacheManager()
         self.api = HfApi(token=HF_TOKEN)
         self.telemetry = TelemetryManager(self.api)
         self.llm: Optional[Llama] = None
+        self.active_model_info = {"repo": "", "file": ""}
         self.kernel_lock = threading.Lock()
         self.is_prefilling = False
         self.perf_stats = {
             "peak_tps": 0.0,
             "cache_hits": 0
         }
+        self.prompt_cache = {}  # Cache for repeated prompts
         self.last_activity = time.time()
+        self.idle_timeout = 20  # 20 seconds idle timeout
         self.auto_cleanup_thread = None
         self.start_idle_monitor()
         self.typing_timer = None
         self.preprocessed_tokens = None
     def detect_quantization(self, filename: str) -> dict:
         """Detect quantization method from filename and return optimizations"""
         filename_upper = filename.upper()
             logger.error(f"Scan error: {e}")
             return []
+    def boot_kernel(self, repo: str, filename: str) -> str:
         """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
         try:
             if not repo or not filename:
         threading.Thread(target=_bg_eval, daemon=True).start()
         return "⚡ Primed"
+    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
         # Update activity timestamp
         self.update_activity()
         first_token_time = None
         try:
+            # HYPER-OPTIMIZED INFERENCE SETTINGS
+            # Starts a streaming completion on the loaded model; the chunks are
+            # consumed by the loop that follows.
+            # NOTE(review): the stop list contains "\n\n", so output presumably
+            # ends at the first blank line no matter how large max_tokens is --
+            # confirm that multi-paragraph answers are not being cut short.
+            # NOTE(review): mirostat_mode=2 is set together with top_k / top_p /
+            # tfs_z / typical_p; llama.cpp's Mirostat sampling typically replaces
+            # those samplers rather than combining with them -- confirm whether
+            # the values below have any effect while Mirostat is on.
             stream = self.llm(
                 formatted_prompt,
+                max_tokens=2048,                    # Increased output length
                 stop=["User:", "<|eot_id|>", "\n\n"],
                 stream=True,
+                temperature=0.7,                    # Balanced creativity
+                top_p=0.95,                         # Nucleus sampling
+                top_k=40,                           # Top-K sampling
+                repeat_penalty=1.1,                 # Prevent repetition
+                frequency_penalty=0.0,              # No frequency penalty
+                presence_penalty=0.0,               # No presence penalty
+                tfs_z=1.0,                          # Tail-free sampling
+                typical_p=1.0,                      # Typical sampling
+                mirostat_mode=2,                    # Mirostat v2 (perplexity control)
+                mirostat_tau=5.0,                   # Target perplexity
+                mirostat_eta=0.1,                   # Learning rate
             )
             for chunk in stream:
                 if tps > self.perf_stats["peak_tps"]:
                     self.perf_stats["peak_tps"] = tps
                 # Update history with streaming content + performance metrics
+                history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💾 Cache: {self.perf_stats['cache_hits']}`"
                 yield history
             # Update global performance stats
             self.perf_stats["total_tokens"] += tokens_count
             self.perf_stats["total_time"] += elapsed
 # --- UI INTERFACE ---
 kernel = ZeroEngine()
+with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
+    gr.HTML("""
+        <div style='text-align: center; padding: 30px; border-radius: 24px;
+                    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+                    margin-bottom: 30px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
+            <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
+                       -webkit-background-clip: text; -webkit-text-fill-color: transparent;
+                       font-family: Consolas, monospace;'>
+                🛰️ ZEROENGINE V0.1
+            </h1>
+            <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
+                Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
+            </p>
+        </div>
+    """)
     with gr.Row():
         with gr.Column(scale=8):
             chat_box = gr.Chatbot(
                 label="Main Engine Feedback",
+                height=650,
                 show_label=False,
                 autoscroll=True,
                 container=True
                 )
                 send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
+        with gr.Column(scale=3):
             gr.Markdown("### 🛠️ Hardware Status")
             ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
             cpu_metric = gr.Label(label="CPU Load", value="0%")
             gr.Markdown("---")
             gr.Markdown("### 📡 Model Control")
             repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
             quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
             boot_status = gr.Markdown("Status: `STANDBY`")
             gr.Markdown("---")
             gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
             ghost_buffer = gr.Textbox(
                 label="Background Context",
             log_output = gr.Code(
                 label="Kernel Logs",
                 language="shell",
+                value="[INIT] System Ready.",
                 lines=5
             )
     def update_stats():
         # Build the two display strings for the hardware panel, in the order
         # the timer binds them: RAM as "used/total GB", then CPU as "N%".
         try:
             m = ResourceMonitor.get_metrics()
+            return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
         except Exception as e:
             # A metrics failure must not break the periodic refresh: log it
             # and show "Error" in both labels instead.
             logger.error(f"Stats update error: {e}")
             return "Error", "Error"
     def on_scan(repo):
                 return
             yield "⚙️ System: Initiating boot sequence...", gr.update()
+            time.sleep(0.5)  # Small delay for UI feedback
+            result = kernel.boot_kernel(repo, file)
             yield result, gr.update()
         except Exception as e:
             logger.error(f"Boot UI error: {e}")
             yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
+    # Timer for periodic stats updates
     # update_stats returns two strings, matching the two output labels below.
     # NOTE(review): value=2 is presumably the tick interval in seconds -- confirm
     # against the installed Gradio version.
     timer = gr.Timer(value=2)
+    timer.tick(update_stats, None, [ram_metric, cpu_metric])
     # Event handlers
     scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
     boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
     # Feed the ghost-cache text to the kernel and show its result in stitch_status.
     stitch_btn.click(
         lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
         [ghost_buffer],
         [stitch_status]
     )
+    # Keyboard input preprocessing (tokenize while typing)
     # Outputs is None: the handler's return value is not bound to any component.
     user_input.change(
         lambda x: kernel.preprocess_input(x),
         [user_input],
         None
     )
+    # Auto-boot enabled inference - passes repo and quant for auto-boot
     # Enter-to-submit and the SUBMIT button run the same generator with the
     # same inputs and stream into the same chat box.
     inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
     user_input.submit(kernel.inference_generator, inference_args, [chat_box])
     send_btn.click(kernel.inference_generator, inference_args, [chat_box])