turtle170 committed on
Commit
75bc536
·
verified ·
1 Parent(s): dccc10e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -565
app.py CHANGED
@@ -31,47 +31,32 @@ SYSTEM_RESERVE_MB = 500
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
34
- # --- TOKEN SYSTEM CONFIG ---
35
- MONTHLY_TOKEN_CREDITS = 100.0
36
- TOKEN_COST_PER_100MS = 0.001
37
- BATCH_UPGRADE_BASE_COST = 0.00005 # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
38
- TOKEN_UPGRADE_COST_PER_1K = 0.0001 # Cost per 1000 extra tokens
39
-
40
  # --- SPEED OPTIMIZATION CONFIG ---
41
- FLASH_ATTENTION = False # Disabled for CPU (GPU-only feature)
42
- KV_CACHE_QUANTIZATION = True # Keep for RAM savings
43
- CONTINUOUS_BATCHING = False # CPU doesn't benefit much
44
- SPECULATIVE_DECODE = False # CPU-only, no draft model
45
- MLOCK_MODEL = False # Don't lock - allow OS to manage memory
46
- USE_MMAP = True # Critical for CPU - fast loading
47
- OFFLOAD_KQV = False # CPU-only
48
- OPTIMAL_THREADS = psutil.cpu_count(logical=True) # Use ALL threads (including hyperthreading for CPU)
49
- ROPE_SCALING = 1.0
50
- NUMA_OPTIMIZE = False # Disabled - can cause issues on some systems
51
- AGGRESSIVE_GC = True
52
-
53
- # Quantization detection - CPU-optimized batch multipliers (more aggressive)
54
  QUANT_OPTIMIZATIONS = {
55
- "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
56
- "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
57
- "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
58
- "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
59
- "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
60
- "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
61
- "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0}, # MASSIVE for CPU
62
- "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
63
- "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
64
- "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
65
- "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
66
- }
67
-
68
- # Model format/architecture detection patterns
69
- MODEL_FORMATS = {
70
- "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
71
- "gemma": {"pattern": ["gemma"], "template": "gemma"},
72
- "phi": {"pattern": ["phi"], "template": "phi"},
73
- "qwen": {"pattern": ["qwen"], "template": "chatml"},
74
- "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
75
  }
76
 
77
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -222,212 +207,6 @@ class ModelCacheManager:
222
  logger.error(f"[WRECKER] Failed: {e}")
223
  return False
224
 
225
- # --- TOKEN MANAGER ---
226
- class TokenManager:
227
- def __init__(self):
228
- self.user_tokens = {} # {username: {"balance": float, "start_time": float, "purchases": {}}}
229
- self.owner_username = "turtle170" # Owner gets infinite tokens
230
-
231
- def is_owner(self, username: str) -> bool:
232
- """Check if user is the owner"""
233
- if not username:
234
- return False
235
- return username.lower() == self.owner_username.lower()
236
-
237
- def initialize_user(self, username: str):
238
- """Initialize new user with monthly credits (or infinite for owner)"""
239
- if not username:
240
- username = "anonymous"
241
-
242
- if username not in self.user_tokens:
243
- # Owner gets infinite tokens
244
- if self.is_owner(username):
245
- self.user_tokens[username] = {
246
- "balance": float('inf'),
247
- "start_time": time.time(),
248
- "purchases": {"batch_multiplier": 1, "token_limit": 2048},
249
- "total_spent": 0.0,
250
- "is_owner": True,
251
- "username": username
252
- }
253
- logger.info(f"[TOKEN] 👑 OWNER {username} initialized with INFINITE tokens!")
254
- else:
255
- self.user_tokens[username] = {
256
- "balance": MONTHLY_TOKEN_CREDITS,
257
- "start_time": time.time(),
258
- "purchases": {"batch_multiplier": 1, "token_limit": 2048},
259
- "total_spent": 0.0,
260
- "is_owner": False,
261
- "username": username,
262
- "last_reset": time.time()
263
- }
264
- logger.info(f"[TOKEN] New user {username}: {MONTHLY_TOKEN_CREDITS} tokens")
265
-
266
- def check_monthly_reset(self, username: str):
267
- """Reset tokens if a month has passed"""
268
- if not username or username not in self.user_tokens:
269
- return
270
-
271
- if self.user_tokens[username].get("is_owner", False):
272
- return # Owner never needs reset
273
-
274
- last_reset = self.user_tokens[username].get("last_reset", time.time())
275
- month_in_seconds = 30 * 24 * 60 * 60 # 30 days
276
-
277
- if time.time() - last_reset > month_in_seconds:
278
- self.user_tokens[username]["balance"] = MONTHLY_TOKEN_CREDITS
279
- self.user_tokens[username]["last_reset"] = time.time()
280
- self.user_tokens[username]["total_spent"] = 0.0
281
- logger.info(f"[TOKEN] Monthly reset for {username}: {MONTHLY_TOKEN_CREDITS} tokens")
282
-
283
- def charge_usage(self, username: str, duration_ms: float) -> bool:
284
- """Charge user for inference time. Returns True if successful. Owner never charged."""
285
- if not username:
286
- username = "anonymous"
287
-
288
- self.initialize_user(username)
289
- self.check_monthly_reset(username)
290
-
291
- # Owner never gets charged
292
- if self.user_tokens[username].get("is_owner", False):
293
- return True
294
-
295
- cost = (duration_ms / 100.0) * TOKEN_COST_PER_100MS
296
-
297
- # Check if user has enough balance
298
- if self.user_tokens[username]["balance"] <= 0:
299
- logger.warning(f"[TOKEN] ❌ {username} has 0 tokens! Access denied.")
300
- return False
301
-
302
- if self.user_tokens[username]["balance"] >= cost:
303
- self.user_tokens[username]["balance"] -= cost
304
- self.user_tokens[username]["balance"] = max(0, self.user_tokens[username]["balance"]) # Never go below 0
305
- self.user_tokens[username]["total_spent"] += cost
306
- logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {self.user_tokens[username]['balance']:.2f}")
307
- return True
308
- else:
309
- # Insufficient balance - set to 0 and deny
310
- self.user_tokens[username]["balance"] = 0
311
- logger.warning(f"[TOKEN] ❌ Insufficient balance! {username} now at 0 tokens.")
312
- return False
313
-
314
- def can_use_engine(self, username: str) -> tuple:
315
- """Check if user can use the engine. Returns (bool, message)"""
316
- if not username:
317
- username = "anonymous"
318
-
319
- self.initialize_user(username)
320
- self.check_monthly_reset(username)
321
-
322
- if self.user_tokens[username].get("is_owner", False):
323
- return True, "👑 Owner access granted"
324
-
325
- balance = self.user_tokens[username]["balance"]
326
-
327
- if balance <= 0:
328
- last_reset = self.user_tokens[username].get("last_reset", time.time())
329
- time_until_reset = 30 * 24 * 60 * 60 - (time.time() - last_reset)
330
- days_left = int(time_until_reset / (24 * 60 * 60))
331
- return False, f"❌ Out of tokens! Resets in {days_left} days. Current balance: 0.00"
332
-
333
- return True, f"✅ Access granted. Balance: {balance:.2f} tokens"
334
-
335
- def purchase_batch_upgrade(self, username: str) -> tuple:
336
- """Purchase batch size upgrade (exponential cost). Free for owner."""
337
- if not username:
338
- return False, "❌ Please login first"
339
-
340
- self.initialize_user(username)
341
-
342
- # Owner gets free upgrades
343
- if self.user_tokens[username].get("is_owner", False):
344
- current_mult = self.user_tokens[username]["purchases"]["batch_multiplier"]
345
- self.user_tokens[username]["purchases"]["batch_multiplier"] = current_mult * 2
346
- new_mult = current_mult * 2
347
- logger.info(f"[TOKEN] 👑 OWNER free batch upgrade: {current_mult}x → {new_mult}x")
348
- return True, f"👑 FREE UPGRADE! Batch now {new_mult}x!"
349
-
350
- current_mult = self.user_tokens[username]["purchases"]["batch_multiplier"]
351
- upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
352
- cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)
353
-
354
- if self.user_tokens[username]["balance"] >= cost:
355
- self.user_tokens[username]["balance"] -= cost
356
- self.user_tokens[username]["purchases"]["batch_multiplier"] = current_mult * 2
357
- new_mult = current_mult * 2
358
- logger.info(f"[TOKEN] Batch upgrade: {current_mult}x → {new_mult}x | Cost: {cost:.5f}")
359
- return True, f"✅ Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"
360
- else:
361
- return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[username]['balance']:.2f}"
362
-
363
- def purchase_token_upgrade(self, username: str, extra_tokens: int = 1000) -> tuple:
364
- """Purchase extra response token length. Free for owner."""
365
- if not username:
366
- return False, "❌ Please login first"
367
-
368
- self.initialize_user(username)
369
-
370
- # Owner gets free upgrades
371
- if self.user_tokens[username].get("is_owner", False):
372
- self.user_tokens[username]["purchases"]["token_limit"] += extra_tokens
373
- new_limit = self.user_tokens[username]["purchases"]["token_limit"]
374
- logger.info(f"[TOKEN] 👑 OWNER free token upgrade: +{extra_tokens} tokens")
375
- return True, f"👑 FREE UPGRADE! Token limit now {new_limit}!"
376
-
377
- cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K
378
-
379
- if self.user_tokens[username]["balance"] >= cost:
380
- self.user_tokens[username]["balance"] -= cost
381
- self.user_tokens[username]["purchases"]["token_limit"] += extra_tokens
382
- new_limit = self.user_tokens[username]["purchases"]["token_limit"]
383
- logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
384
- return True, f"✅ Token limit now {new_limit}! (-{cost:.5f} tokens)"
385
- else:
386
- return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[username]['balance']:.2f}"
387
-
388
- def get_balance(self, username: str) -> float:
389
- """Get user's current token balance"""
390
- if not username:
391
- username = "anonymous"
392
-
393
- self.initialize_user(username)
394
- self.check_monthly_reset(username)
395
-
396
- balance = self.user_tokens[username]["balance"]
397
-
398
- # Show ∞ for owner
399
- if balance == float('inf'):
400
- return balance
401
-
402
- return round(max(0, balance), 2) # Never show negative
403
-
404
- def get_purchases(self, username: str) -> dict:
405
- """Get user's current purchases"""
406
- if not username:
407
- username = "anonymous"
408
-
409
- self.initialize_user(username)
410
- return self.user_tokens[username]["purchases"]
411
-
412
- def end_session(self, username: str):
413
- """End user session and log stats"""
414
- if not username:
415
- return "No active session found."
416
-
417
- if username in self.user_tokens:
418
- stats = self.user_tokens[username]
419
-
420
- if stats.get("is_owner", False):
421
- return f"👑 Owner session ended. Welcome back anytime, {stats['username']}!"
422
-
423
- logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
424
- return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
425
- return "No active session found."
426
-
427
- # Global token manager
428
- import math
429
- token_manager = TokenManager()
430
-
431
  # Global cache manager
432
  model_cache = ModelCacheManager()
433
 
@@ -493,7 +272,7 @@ class ZeroEngine:
493
  self.api = HfApi(token=HF_TOKEN)
494
  self.telemetry = TelemetryManager(self.api)
495
  self.llm: Optional[Llama] = None
496
- self.active_model_info = {"repo": "", "file": "", "format": ""}
497
  self.kernel_lock = threading.Lock()
498
  self.is_prefilling = False
499
  self.perf_stats = {
@@ -503,9 +282,9 @@ class ZeroEngine:
503
  "peak_tps": 0.0,
504
  "cache_hits": 0
505
  }
506
- self.prompt_cache = {}
507
  self.last_activity = time.time()
508
- self.idle_timeout = 20
509
  self.auto_cleanup_thread = None
510
  self.start_idle_monitor()
511
 
@@ -514,29 +293,6 @@ class ZeroEngine:
514
  self.typing_timer = None
515
  self.preprocessed_tokens = None
516
 
517
- # Custom parameters (user-configurable)
518
- self.custom_params = {
519
- "temperature": 0.7,
520
- "top_p": 0.95,
521
- "top_k": 40,
522
- "repeat_penalty": 1.1,
523
- "batch_size_override": None, # None = auto
524
- "max_tokens_override": None # None = auto
525
- }
526
-
527
- def detect_model_format(self, filename: str, repo: str) -> str:
528
- """Auto-detect model format/architecture from filename and repo"""
529
- combined = f"{repo.lower()} {filename.lower()}"
530
-
531
- for format_name, format_info in MODEL_FORMATS.items():
532
- for pattern in format_info["pattern"]:
533
- if pattern in combined:
534
- logger.info(f"[FORMAT-DETECT] Detected {format_name.upper()} architecture")
535
- return format_name
536
-
537
- logger.warning(f"[FORMAT-DETECT] Unknown format, defaulting to llama")
538
- return "llama"
539
-
540
  def detect_quantization(self, filename: str) -> dict:
541
  """Detect quantization method from filename and return optimizations"""
542
  filename_upper = filename.upper()
@@ -633,158 +389,7 @@ class ZeroEngine:
633
  logger.error(f"Scan error: {e}")
634
  return []
635
 
636
- def boot_kernel(self, repo: str, filename: str, session_id: str = None) -> str:
637
- """HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
638
- try:
639
- if not repo or not filename:
640
- return "🔴 ERROR: Repository or filename missing"
641
-
642
- logger.info(f"[BOOT] Starting download: {filename} from {repo}")
643
-
644
- # DETECT QUANTIZATION FROM FILENAME
645
- quant_config = self.detect_quantization(filename)
646
-
647
- # DETECT MODEL FORMAT/ARCHITECTURE
648
- model_format = self.detect_model_format(filename, repo)
649
-
650
- # Download with timeout protection
651
- try:
652
- path = hf_hub_download(
653
- repo_id=repo,
654
- filename=filename,
655
- token=HF_TOKEN,
656
- local_files_only=False
657
- )
658
- logger.info(f"[BOOT] Download complete: {path}")
659
- except Exception as e:
660
- logger.error(f"[BOOT] Download failed: {e}")
661
- return f"🔴 DOWNLOAD FAILED: {str(e)}"
662
-
663
- # Check if model is cached
664
- is_cached = model_cache.is_cached(path)
665
- cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
666
-
667
- # Validate before loading
668
- valid, msg = ResourceMonitor.validate_deployment(path)
669
- if not valid:
670
- logger.warning(f"[BOOT] Validation failed: {msg}")
671
- return f"🔴 VALIDATION FAILED: {msg}"
672
-
673
- logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
674
-
675
- # Load model with MAXIMUM PERFORMANCE SETTINGS
676
- with self.kernel_lock:
677
- # WRECK OLD MODEL
678
- if self.llm:
679
- logger.info("[BOOT] 💣 WRECKING old model...")
680
- try:
681
- model_cache.wreck_old_model_cache()
682
- del self.llm
683
- self.llm = None
684
- nuclear_ram_clear()
685
- logger.info("[BOOT] ✅ Old model DESTROYED")
686
- except Exception as e:
687
- logger.warning(f"[BOOT] Cleanup warning: {e}")
688
-
689
- # Calculate optimal parameters with token purchases
690
- vm = psutil.virtual_memory()
691
- available_ram_gb = vm.available / (1024**3)
692
-
693
- # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
694
- # Base calculation: use more RAM for batching on CPU
695
- base_batch = int(512 * available_ram_gb / 8) # More aggressive base
696
- optimal_batch = int(base_batch * quant_config["batch_multiplier"])
697
-
698
- # Apply user's batch multiplier from token purchases
699
- if session_id:
700
- user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
701
- optimal_batch = int(optimal_batch * user_batch_mult)
702
- logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")
703
-
704
- # CPU can handle larger batches with quantized models
705
- optimal_batch = max(1024, min(8192, optimal_batch)) # 1024-8192 range for CPU
706
-
707
- # Context size
708
- optimal_ctx = quant_config["ctx_size"]
709
-
710
- # Reduce context for Gemma models (they have 131K n_ctx_train)
711
- if model_format == "gemma":
712
- optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
713
- logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
714
-
715
- # Thread optimization - use ALL threads on CPU (including hyperthreading)
716
- optimal_threads = psutil.cpu_count(logical=True) # ALL logical cores
717
- logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")
718
-
719
- try:
720
- logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
721
-
722
- # Preload cache if available
723
- if is_cached:
724
- model_cache.preload_cache(path)
725
-
726
- # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
727
- init_params = {
728
- "model_path": path,
729
- "n_ctx": optimal_ctx,
730
- "n_threads": optimal_threads,
731
- "n_threads_batch": optimal_threads,
732
- "use_mmap": USE_MMAP, # Critical for CPU
733
- "use_mlock": MLOCK_MODEL, # Let OS manage memory
734
- "n_batch": optimal_batch, # MASSIVE batches for CPU
735
- "n_gpu_layers": 0, # CPU-only
736
- "rope_scaling_type": 0,
737
- "rope_freq_scale": ROPE_SCALING,
738
- "verbose": False,
739
- "logits_all": False,
740
- "embedding": False,
741
- "f16_kv": False # Use quantized KV cache
742
- }
743
-
744
- # Add KV quantization only if not Gemma (Gemma can be finicky)
745
- if model_format != "gemma" and KV_CACHE_QUANTIZATION:
746
- init_params["type_k"] = 2
747
- init_params["type_v"] = 2
748
- logger.info("[OPTIM] KV cache quantization enabled (Q4)")
749
-
750
- self.llm = Llama(**init_params)
751
-
752
- self.active_model_info = {
753
- "repo": repo,
754
- "file": filename,
755
- "quant": quant_config['type'],
756
- "format": model_format
757
- }
758
- self.telemetry.track_load(repo, filename)
759
-
760
- # Extract and cache signature
761
- if not is_cached:
762
- logger.info("[BOOT] Extracting cache signature...")
763
- signature = model_cache.extract_cache_signature(path)
764
- if signature:
765
- model_cache.save_to_cache(path, signature)
766
-
767
- # Warm-up
768
- logger.info("[BOOT] Warming up model caches...")
769
- try:
770
- self.llm("Warmup", max_tokens=1, stream=False)
771
- force_gc()
772
- except:
773
- pass
774
-
775
- logger.info("[BOOT] 🚀 CPU-OPTIMIZED MODEL READY!")
776
- return f"🟢 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"
777
-
778
- except Exception as e:
779
- logger.error(f"[BOOT] Model loading failed: {e}")
780
- self.llm = None
781
- nuclear_ram_clear()
782
- return f"🔴 LOAD FAILED: {str(e)}"
783
-
784
- except Exception as e:
785
- logger.error(f"[BOOT] Unexpected error: {e}")
786
- nuclear_ram_clear()
787
- return f"🔴 BOOT FAILURE: {str(e)}"
788
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
789
  try:
790
  if not repo or not filename:
@@ -942,7 +547,7 @@ class ZeroEngine:
942
  threading.Thread(target=_bg_eval, daemon=True).start()
943
  return "⚡ Primed"
944
 
945
- def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, username: str) -> Generator:
946
  # Update activity timestamp
947
  self.update_activity()
948
 
@@ -995,28 +600,23 @@ class ZeroEngine:
995
  first_token_time = None
996
 
997
  try:
998
- # Get max tokens from user purchases
999
- max_tokens = 2048
1000
- if username:
1001
- max_tokens = token_manager.get_purchases(username)["token_limit"]
1002
-
1003
- # HYPER-OPTIMIZED CPU INFERENCE SETTINGS
1004
  stream = self.llm(
1005
  formatted_prompt,
1006
- max_tokens=max_tokens,
1007
  stop=["User:", "<|eot_id|>", "\n\n"],
1008
  stream=True,
1009
- temperature=self.custom_params["temperature"],
1010
- top_p=self.custom_params["top_p"],
1011
- top_k=self.custom_params["top_k"],
1012
- repeat_penalty=self.custom_params["repeat_penalty"],
1013
- frequency_penalty=0.0,
1014
- presence_penalty=0.0,
1015
- tfs_z=1.0,
1016
- typical_p=1.0,
1017
- mirostat_mode=2, # CPU benefits from mirostat
1018
- mirostat_tau=5.0,
1019
- mirostat_eta=0.1,
1020
  )
1021
 
1022
  for chunk in stream:
@@ -1036,19 +636,10 @@ class ZeroEngine:
1036
  if tps > self.perf_stats["peak_tps"]:
1037
  self.perf_stats["peak_tps"] = tps
1038
 
1039
- # Charge tokens every second
1040
- if int(elapsed * 1000) % 1000 < 100 and username: # Every ~1 second
1041
- token_manager.charge_usage(username, elapsed * 1000)
1042
-
1043
  # Update history with streaming content + performance metrics
1044
- balance = token_manager.get_balance(username) if username else 0
1045
- history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💰 {balance:.2f} tokens`"
1046
  yield history
1047
 
1048
- # Final token charge for remaining time
1049
- if username:
1050
- token_manager.charge_usage(username, elapsed * 1000)
1051
-
1052
  # Update global performance stats
1053
  self.perf_stats["total_tokens"] += tokens_count
1054
  self.perf_stats["total_time"] += elapsed
@@ -1172,49 +763,27 @@ h1, h2, h3, h4, h5, h6 {
1172
  # --- UI INTERFACE ---
1173
  kernel = ZeroEngine()
1174
 
1175
- # Session ID for token tracking
1176
- username = token_manager.get_username()
1177
-
1178
- with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1179
- # Header with Token Display
1180
- with gr.Row():
1181
- with gr.Column(scale=8):
1182
- gr.HTML("""
1183
- <div style='text-align: center; padding: 30px; border-radius: 24px;
1184
- background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
1185
- margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
1186
- <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
1187
- -webkit-background-clip: text; -webkit-text-fill-color: transparent;
1188
- font-family: Consolas, monospace;'>
1189
- 🛰️ ZEROENGINE V0.2
1190
- </h1>
1191
- <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
1192
- CPU-Optimized | Token System | Custom Parameters | Auto-Format
1193
- </p>
1194
- </div>
1195
- """)
1196
- with gr.Column(scale=2):
1197
- # Token Display
1198
- gr.HTML("""
1199
- <div style='text-align: center; padding: 20px; border-radius: 20px;
1200
- background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
1201
- margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
1202
- <div style='font-size: 2em; margin-bottom: 5px;'>💰</div>
1203
- <div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
1204
- 100.00
1205
- </div>
1206
- <div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
1207
- </div>
1208
- """)
1209
- token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
1210
- end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
1211
- session_status = gr.Markdown("", visible=False)
1212
 
1213
  with gr.Row():
1214
  with gr.Column(scale=8):
1215
  chat_box = gr.Chatbot(
1216
  label="Main Engine Feedback",
1217
- height=600,
1218
  show_label=False,
1219
  autoscroll=True,
1220
  container=True
@@ -1229,15 +798,12 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1229
  )
1230
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
1231
 
1232
- with gr.Column(scale=4):
1233
- # Hardware Status
1234
  gr.Markdown("### 🛠️ Hardware Status")
1235
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
1236
  cpu_metric = gr.Label(label="CPU Load", value="0%")
1237
 
1238
  gr.Markdown("---")
1239
-
1240
- # Model Control
1241
  gr.Markdown("### 📡 Model Control")
1242
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
1243
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
@@ -1249,26 +815,6 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1249
  boot_status = gr.Markdown("Status: `STANDBY`")
1250
 
1251
  gr.Markdown("---")
1252
-
1253
- # Custom Parameters
1254
- gr.Markdown("### ⚙️ Custom Parameters")
1255
- temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
1256
- top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
1257
- top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
1258
- repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
1259
-
1260
- gr.Markdown("---")
1261
-
1262
- # Token Purchases
1263
- gr.Markdown("### 💎 Token Upgrades")
1264
- with gr.Row():
1265
- batch_upgrade_btn = gr.Button("🚀 Batch x2", size="sm", variant="secondary")
1266
- token_upgrade_btn = gr.Button("📈 +1K Tokens", size="sm", variant="secondary")
1267
- purchase_status = gr.Markdown("Ready to upgrade!")
1268
-
1269
- gr.Markdown("---")
1270
-
1271
- # Ghost Cache
1272
  gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
1273
  ghost_buffer = gr.Textbox(
1274
  label="Background Context",
@@ -1282,7 +828,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1282
  log_output = gr.Code(
1283
  label="Kernel Logs",
1284
  language="shell",
1285
- value="[INIT] V0.2 System Ready.",
1286
  lines=5
1287
  )
1288
 
@@ -1290,11 +836,9 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1290
  def update_stats():
1291
  try:
1292
  m = ResourceMonitor.get_metrics()
1293
- balance = token_manager.get_balance(session_id)
1294
- return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
1295
  except Exception as e:
1296
  logger.error(f"Stats update error: {e}")
1297
- return "Error", "Error", "0.00"
1298
  return "Error", "Error"
1299
 
1300
  def on_scan(repo):
@@ -1320,78 +864,37 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1320
  return
1321
 
1322
  yield "⚙️ System: Initiating boot sequence...", gr.update()
1323
- time.sleep(0.5)
1324
 
1325
- result = kernel.boot_kernel(repo, file, session_id)
1326
  yield result, gr.update()
1327
 
1328
  except Exception as e:
1329
  logger.error(f"Boot UI error: {e}")
1330
  yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
1331
-
1332
- def on_batch_upgrade():
1333
- success, msg = token_manager.purchase_batch_upgrade(session_id)
1334
- balance = token_manager.get_balance(session_id)
1335
- return msg, f"{balance}"
1336
-
1337
- def on_token_upgrade():
1338
- success, msg = token_manager.purchase_token_upgrade(session_id, 1000)
1339
- balance = token_manager.get_balance(session_id)
1340
- return msg, f"{balance}"
1341
-
1342
- def on_end_session():
1343
- msg = token_manager.end_session(session_id)
1344
- return msg
1345
-
1346
- def update_custom_params(temp, top_p, top_k, repeat_pen):
1347
- kernel.custom_params["temperature"] = temp
1348
- kernel.custom_params["top_p"] = top_p
1349
- kernel.custom_params["top_k"] = int(top_k)
1350
- kernel.custom_params["repeat_penalty"] = repeat_pen
1351
- return "✅ Parameters updated!"
1352
 
1353
- # Timer for periodic stats updates (includes token balance)
1354
  timer = gr.Timer(value=2)
1355
- timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])
1356
 
1357
  # Event handlers
1358
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
1359
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
1360
 
1361
- # Token purchases
1362
- batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
1363
- token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
1364
- end_session_btn.click(on_end_session, None, [session_status])
1365
-
1366
- # Custom parameter updates
1367
- temperature_slider.change(update_custom_params,
1368
- [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1369
- [purchase_status])
1370
- top_p_slider.change(update_custom_params,
1371
- [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1372
- [purchase_status])
1373
- top_k_slider.change(update_custom_params,
1374
- [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1375
- [purchase_status])
1376
- repeat_penalty_slider.change(update_custom_params,
1377
- [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1378
- [purchase_status])
1379
-
1380
- # Ghost cache
1381
  stitch_btn.click(
1382
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
1383
  [ghost_buffer],
1384
  [stitch_status]
1385
  )
1386
 
1387
- # Keyboard input preprocessing
1388
  user_input.change(
1389
  lambda x: kernel.preprocess_input(x),
1390
  [user_input],
1391
  None
1392
  )
1393
 
1394
- # Auto-boot enabled inference
1395
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
1396
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
1397
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])
 
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
 
 
 
 
 
 
34
  # --- SPEED OPTIMIZATION CONFIG ---
35
+ FLASH_ATTENTION = True # Enable Flash Attention 2
36
+ KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
+ CONTINUOUS_BATCHING = True # Enable continuous batching
38
+ SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
+ MLOCK_MODEL = False # Disabled: prevents swapping but uses more RAM
40
+ USE_MMAP = True # Memory-mapped file loading
41
+ OFFLOAD_KQV = False # CPU-only, no offload needed
42
+ OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
+ ROPE_SCALING = 1.0 # RoPE frequency scaling
44
+ NUMA_OPTIMIZE = True # NUMA-aware memory allocation
45
+ AGGRESSIVE_GC = True # Aggressive garbage collection
46
+
47
+ # Quantization detection and optimization mapping
48
  QUANT_OPTIMIZATIONS = {
49
+ "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
50
+ "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
51
+ "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
52
+ "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
53
+ "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
54
+ "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
55
+ "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
56
+ "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
57
+ "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
58
+ "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
59
+ "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
 
 
 
 
 
 
 
 
 
60
  }
61
 
62
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 
207
  logger.error(f"[WRECKER] Failed: {e}")
208
  return False
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Global cache manager
211
  model_cache = ModelCacheManager()
212
 
 
272
  self.api = HfApi(token=HF_TOKEN)
273
  self.telemetry = TelemetryManager(self.api)
274
  self.llm: Optional[Llama] = None
275
+ self.active_model_info = {"repo": "", "file": ""}
276
  self.kernel_lock = threading.Lock()
277
  self.is_prefilling = False
278
  self.perf_stats = {
 
282
  "peak_tps": 0.0,
283
  "cache_hits": 0
284
  }
285
+ self.prompt_cache = {} # Cache for repeated prompts
286
  self.last_activity = time.time()
287
+ self.idle_timeout = 20 # 20 seconds idle timeout
288
  self.auto_cleanup_thread = None
289
  self.start_idle_monitor()
290
 
 
293
  self.typing_timer = None
294
  self.preprocessed_tokens = None
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def detect_quantization(self, filename: str) -> dict:
297
  """Detect quantization method from filename and return optimizations"""
298
  filename_upper = filename.upper()
 
389
  logger.error(f"Scan error: {e}")
390
  return []
391
 
392
+ def boot_kernel(self, repo: str, filename: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
394
  try:
395
  if not repo or not filename:
 
547
  threading.Thread(target=_bg_eval, daemon=True).start()
548
  return "⚡ Primed"
549
 
550
+ def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
551
  # Update activity timestamp
552
  self.update_activity()
553
 
 
600
  first_token_time = None
601
 
602
  try:
603
+ # HYPER-OPTIMIZED INFERENCE SETTINGS
 
 
 
 
 
604
  stream = self.llm(
605
  formatted_prompt,
606
+ max_tokens=2048, # Increased output length
607
  stop=["User:", "<|eot_id|>", "\n\n"],
608
  stream=True,
609
+ temperature=0.7, # Balanced creativity
610
+ top_p=0.95, # Nucleus sampling
611
+ top_k=40, # Top-K sampling
612
+ repeat_penalty=1.1, # Prevent repetition
613
+ frequency_penalty=0.0, # No frequency penalty
614
+ presence_penalty=0.0, # No presence penalty
615
+ tfs_z=1.0, # Tail-free sampling
616
+ typical_p=1.0, # Typical sampling
617
+ mirostat_mode=2, # Mirostat v2 (perplexity control)
618
+ mirostat_tau=5.0, # Target perplexity
619
+ mirostat_eta=0.1, # Learning rate
620
  )
621
 
622
  for chunk in stream:
 
636
  if tps > self.perf_stats["peak_tps"]:
637
  self.perf_stats["peak_tps"] = tps
638
 
 
 
 
 
639
  # Update history with streaming content + performance metrics
640
+ history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💾 Cache: {self.perf_stats['cache_hits']}`"
 
641
  yield history
642
 
 
 
 
 
643
  # Update global performance stats
644
  self.perf_stats["total_tokens"] += tokens_count
645
  self.perf_stats["total_time"] += elapsed
 
763
  # --- UI INTERFACE ---
764
  kernel = ZeroEngine()
765
 
766
+ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
767
+ gr.HTML("""
768
+ <div style='text-align: center; padding: 30px; border-radius: 24px;
769
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
770
+ margin-bottom: 30px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
771
+ <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
772
+ -webkit-background-clip: text; -webkit-text-fill-color: transparent;
773
+ font-family: Consolas, monospace;'>
774
+ 🛰️ ZEROENGINE V0.1
775
+ </h1>
776
+ <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
777
+ Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
778
+ </p>
779
+ </div>
780
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
 
782
  with gr.Row():
783
  with gr.Column(scale=8):
784
  chat_box = gr.Chatbot(
785
  label="Main Engine Feedback",
786
+ height=650,
787
  show_label=False,
788
  autoscroll=True,
789
  container=True
 
798
  )
799
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
800
 
801
+ with gr.Column(scale=3):
 
802
  gr.Markdown("### 🛠️ Hardware Status")
803
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
804
  cpu_metric = gr.Label(label="CPU Load", value="0%")
805
 
806
  gr.Markdown("---")
 
 
807
  gr.Markdown("### 📡 Model Control")
808
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
809
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
 
815
  boot_status = gr.Markdown("Status: `STANDBY`")
816
 
817
  gr.Markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
819
  ghost_buffer = gr.Textbox(
820
  label="Background Context",
 
828
  log_output = gr.Code(
829
  label="Kernel Logs",
830
  language="shell",
831
+ value="[INIT] System Ready.",
832
  lines=5
833
  )
834
 
 
836
def update_stats():
    """Build the text shown in the RAM and CPU status labels.

    Returns:
        A ``(ram_text, cpu_text)`` tuple. ``ram_text`` is
        ``"<ram_used_gb>/<ram_total_gb> GB"`` and ``cpu_text`` is
        ``"<cpu_usage_pct>%"``, both read from
        ``ResourceMonitor.get_metrics()``. If reading or formatting the
        metrics raises, the error is logged and ``("Error", "Error")`` is
        returned so the caller always receives two display strings.
    """
    try:
        metrics = ResourceMonitor.get_metrics()
        # Formatting stays inside the try so a missing metric key is also
        # reported as "Error" rather than propagating to the caller.
        ram_text = f"{metrics['ram_used_gb']}/{metrics['ram_total_gb']} GB"
        cpu_text = f"{metrics['cpu_usage_pct']}%"
    except Exception as e:
        logger.error(f"Stats update error: {e}")
        return "Error", "Error"
    return ram_text, cpu_text
843
 
844
  def on_scan(repo):
 
864
  return
865
 
866
  yield "⚙️ System: Initiating boot sequence...", gr.update()
867
+ time.sleep(0.5) # Small delay for UI feedback
868
 
869
+ result = kernel.boot_kernel(repo, file)
870
  yield result, gr.update()
871
 
872
  except Exception as e:
873
  logger.error(f"Boot UI error: {e}")
874
  yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
 
876
+ # Timer for periodic stats updates
877
  timer = gr.Timer(value=2)
878
+ timer.tick(update_stats, None, [ram_metric, cpu_metric])
879
 
880
  # Event handlers
881
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
882
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  stitch_btn.click(
885
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
886
  [ghost_buffer],
887
  [stitch_status]
888
  )
889
 
890
+ # Keyboard input preprocessing (tokenize while typing)
891
  user_input.change(
892
  lambda x: kernel.preprocess_input(x),
893
  [user_input],
894
  None
895
  )
896
 
897
+ # Auto-boot enabled inference - passes repo and quant for auto-boot
898
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
899
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
900
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])