turtle170 committed on
Commit
0195768
Β·
verified Β·
1 Parent(s): 5ed5fed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +452 -67
app.py CHANGED
@@ -31,32 +31,47 @@ SYSTEM_RESERVE_MB = 500
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
 
 
 
 
 
 
34
  # --- SPEED OPTIMIZATION CONFIG ---
35
- FLASH_ATTENTION = True # Enable Flash Attention 2
36
- KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
- CONTINUOUS_BATCHING = True # Enable continuous batching
38
- SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
- MLOCK_MODEL = False # Disabled: prevents swapping but uses more RAM
40
- USE_MMAP = True # Memory-mapped file loading
41
- OFFLOAD_KQV = False # CPU-only, no offload needed
42
- OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
- ROPE_SCALING = 1.0 # RoPE frequency scaling
44
- NUMA_OPTIMIZE = True # NUMA-aware memory allocation
45
- AGGRESSIVE_GC = True # Aggressive garbage collection
46
-
47
- # Quantization detection and optimization mapping
48
  QUANT_OPTIMIZATIONS = {
49
- "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
50
- "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
51
- "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
52
- "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
53
- "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
54
- "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
55
- "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
56
- "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
57
- "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
58
- "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
59
- "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
 
 
 
 
 
 
 
 
 
60
  }
61
 
62
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -207,6 +222,100 @@ class ModelCacheManager:
207
  logger.error(f"[WRECKER] Failed: {e}")
208
  return False
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Global cache manager
211
  model_cache = ModelCacheManager()
212
 
@@ -272,7 +381,7 @@ class ZeroEngine:
272
  self.api = HfApi(token=HF_TOKEN)
273
  self.telemetry = TelemetryManager(self.api)
274
  self.llm: Optional[Llama] = None
275
- self.active_model_info = {"repo": "", "file": ""}
276
  self.kernel_lock = threading.Lock()
277
  self.is_prefilling = False
278
  self.perf_stats = {
@@ -282,9 +391,9 @@ class ZeroEngine:
282
  "peak_tps": 0.0,
283
  "cache_hits": 0
284
  }
285
- self.prompt_cache = {} # Cache for repeated prompts
286
  self.last_activity = time.time()
287
- self.idle_timeout = 20 # 20 seconds idle timeout
288
  self.auto_cleanup_thread = None
289
  self.start_idle_monitor()
290
 
@@ -293,6 +402,29 @@ class ZeroEngine:
293
  self.typing_timer = None
294
  self.preprocessed_tokens = None
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def detect_quantization(self, filename: str) -> dict:
297
  """Detect quantization method from filename and return optimizations"""
298
  filename_upper = filename.upper()
@@ -389,7 +521,158 @@ class ZeroEngine:
389
  logger.error(f"Scan error: {e}")
390
  return []
391
 
392
- def boot_kernel(self, repo: str, filename: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
394
  try:
395
  if not repo or not filename:
@@ -600,23 +883,28 @@ class ZeroEngine:
600
  first_token_time = None
601
 
602
  try:
603
- # HYPER-OPTIMIZED INFERENCE SETTINGS
 
 
 
 
 
604
  stream = self.llm(
605
  formatted_prompt,
606
- max_tokens=2048, # Increased output length
607
  stop=["User:", "<|eot_id|>", "\n\n"],
608
  stream=True,
609
- temperature=0.7, # Balanced creativity
610
- top_p=0.95, # Nucleus sampling
611
- top_k=40, # Top-K sampling
612
- repeat_penalty=1.1, # Prevent repetition
613
- frequency_penalty=0.0, # No frequency penalty
614
- presence_penalty=0.0, # No presence penalty
615
- tfs_z=1.0, # Tail-free sampling
616
- typical_p=1.0, # Typical sampling
617
- mirostat_mode=2, # Mirostat v2 (perplexity control)
618
- mirostat_tau=5.0, # Target perplexity
619
- mirostat_eta=0.1, # Learning rate
620
  )
621
 
622
  for chunk in stream:
@@ -636,10 +924,19 @@ class ZeroEngine:
636
  if tps > self.perf_stats["peak_tps"]:
637
  self.perf_stats["peak_tps"] = tps
638
 
 
 
 
 
639
  # Update history with streaming content + performance metrics
640
- history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’Ύ Cache: {self.perf_stats['cache_hits']}`"
 
641
  yield history
642
 
 
 
 
 
643
  # Update global performance stats
644
  self.perf_stats["total_tokens"] += tokens_count
645
  self.perf_stats["total_time"] += elapsed
@@ -763,27 +1060,49 @@ h1, h2, h3, h4, h5, h6 {
763
  # --- UI INTERFACE ---
764
  kernel = ZeroEngine()
765
 
766
- with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
767
- gr.HTML("""
768
- <div style='text-align: center; padding: 30px; border-radius: 24px;
769
- background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
770
- margin-bottom: 30px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
771
- <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
772
- -webkit-background-clip: text; -webkit-text-fill-color: transparent;
773
- font-family: Consolas, monospace;'>
774
- πŸ›°οΈ ZEROENGINE V0.1
775
- </h1>
776
- <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
777
- Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
778
- </p>
779
- </div>
780
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
 
782
  with gr.Row():
783
  with gr.Column(scale=8):
784
  chat_box = gr.Chatbot(
785
  label="Main Engine Feedback",
786
- height=650,
787
  show_label=False,
788
  autoscroll=True,
789
  container=True
@@ -798,12 +1117,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
798
  )
799
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
800
 
801
- with gr.Column(scale=3):
 
802
  gr.Markdown("### πŸ› οΈ Hardware Status")
803
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
804
  cpu_metric = gr.Label(label="CPU Load", value="0%")
805
 
806
  gr.Markdown("---")
 
 
807
  gr.Markdown("### πŸ“‘ Model Control")
808
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
809
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
@@ -815,6 +1137,26 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
815
  boot_status = gr.Markdown("Status: `STANDBY`")
816
 
817
  gr.Markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
819
  ghost_buffer = gr.Textbox(
820
  label="Background Context",
@@ -828,7 +1170,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
828
  log_output = gr.Code(
829
  label="Kernel Logs",
830
  language="shell",
831
- value="[INIT] System Ready.",
832
  lines=5
833
  )
834
 
@@ -836,9 +1178,11 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
836
  def update_stats():
837
  try:
838
  m = ResourceMonitor.get_metrics()
839
- return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
 
840
  except Exception as e:
841
  logger.error(f"Stats update error: {e}")
 
842
  return "Error", "Error"
843
 
844
  def on_scan(repo):
@@ -864,37 +1208,78 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
864
  return
865
 
866
  yield "βš™οΈ System: Initiating boot sequence...", gr.update()
867
- time.sleep(0.5) # Small delay for UI feedback
868
 
869
- result = kernel.boot_kernel(repo, file)
870
  yield result, gr.update()
871
 
872
  except Exception as e:
873
  logger.error(f"Boot UI error: {e}")
874
  yield f"πŸ”΄ BOOT ERROR: {str(e)}", gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
 
876
- # Timer for periodic stats updates
877
  timer = gr.Timer(value=2)
878
- timer.tick(update_stats, None, [ram_metric, cpu_metric])
879
 
880
  # Event handlers
881
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
882
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  stitch_btn.click(
885
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
886
  [ghost_buffer],
887
  [stitch_status]
888
  )
889
 
890
- # Keyboard input preprocessing (tokenize while typing)
891
  user_input.change(
892
  lambda x: kernel.preprocess_input(x),
893
  [user_input],
894
  None
895
  )
896
 
897
- # Auto-boot enabled inference - passes repo and quant for auto-boot
898
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
899
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
900
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])
 
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
34
# --- TOKEN SYSTEM CONFIG ---
MONTHLY_TOKEN_CREDITS = 100.0       # free credits granted to each new session
TOKEN_COST_PER_100MS = 0.001        # inference usage price per 100 ms of generation
BATCH_UPGRADE_BASE_COST = 0.00005   # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
TOKEN_UPGRADE_COST_PER_1K = 0.0001  # Cost per 1000 extra tokens

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = False  # Disabled for CPU (GPU-only feature)
KV_CACHE_QUANTIZATION = True  # Keep for RAM savings
CONTINUOUS_BATCHING = False  # CPU doesn't benefit much
SPECULATIVE_DECODE = False  # CPU-only, no draft model
MLOCK_MODEL = False  # Don't lock - allow OS to manage memory
USE_MMAP = True  # Critical for CPU - fast loading
OFFLOAD_KQV = False  # CPU-only
OPTIMAL_THREADS = psutil.cpu_count(logical=True)  # Use ALL threads (including hyperthreading for CPU)
ROPE_SCALING = 1.0  # RoPE frequency scale passed straight to llama.cpp
NUMA_OPTIMIZE = False  # Disabled - can cause issues on some systems
AGGRESSIVE_GC = True

# Quantization detection - CPU-optimized batch multipliers (more aggressive)
# batch_multiplier scales the RAM-derived base batch; ctx_size caps context;
# threads_boost is kept at 1.0 here (all logical cores are already used).
QUANT_OPTIMIZATIONS = {
    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},  # MASSIVE for CPU
    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
}

# Model format/architecture detection patterns
# "pattern" substrings are matched against the lowercased repo + filename;
# "template" names the chat template family to use for that architecture.
MODEL_FORMATS = {
    "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
    "gemma": {"pattern": ["gemma"], "template": "gemma"},
    "phi": {"pattern": ["phi"], "template": "phi"},
    "qwen": {"pattern": ["qwen"], "template": "chatml"},
    "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
}
76
 
77
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 
222
  logger.error(f"[WRECKER] Failed: {e}")
223
  return False
224
 
225
# --- TOKEN MANAGER ---
import math  # moved above the class: purchase_batch_upgrade needs it at call time


class TokenManager:
    """In-memory per-session token accounting: credits, usage charges, upgrades.

    State lives only in this process; balances reset on restart.
    NOTE(review): not thread-safe — concurrent Gradio callbacks mutate the
    same dicts without a lock; confirm single-threaded use or add locking.
    """

    def __init__(self):
        # {session_id: {"balance": float, "start_time": float,
        #               "purchases": {"batch_multiplier": int, "token_limit": int},
        #               "total_spent": float}}
        self.user_tokens = {}
        self.active_sessions = {}

    def get_session_id(self) -> str:
        """Generate a fresh 8-char hex session ID.

        Fix: uses a cryptographically random token instead of
        md5(str(time.time()))[:8] — two sessions started in the same
        instant can no longer collide, and md5 is avoided entirely.
        The returned format (8 lowercase hex chars) is unchanged.
        """
        import secrets
        return secrets.token_hex(4)  # 4 random bytes -> 8 hex chars

    def initialize_user(self, session_id: str):
        """Create the account for session_id with monthly credits (idempotent).

        NOTE(review): start_time is stored but never used for a monthly
        reset anywhere visible — confirm whether credits should expire.
        """
        if session_id not in self.user_tokens:
            self.user_tokens[session_id] = {
                "balance": MONTHLY_TOKEN_CREDITS,
                "start_time": time.time(),
                "purchases": {"batch_multiplier": 1, "token_limit": 2048},
                "total_spent": 0.0
            }
            logger.info(f"[TOKEN] New user {session_id}: {MONTHLY_TOKEN_CREDITS} tokens")

    def charge_usage(self, session_id: str, duration_ms: float) -> bool:
        """Charge the session for duration_ms of inference time.

        Returns True when the balance covered the cost; False (and no
        charge) when it did not.
        """
        self.initialize_user(session_id)

        cost = (duration_ms / 100.0) * TOKEN_COST_PER_100MS

        if self.user_tokens[session_id]["balance"] >= cost:
            self.user_tokens[session_id]["balance"] -= cost
            self.user_tokens[session_id]["total_spent"] += cost
            logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {self.user_tokens[session_id]['balance']:.2f}")
            return True
        else:
            logger.warning(f"[TOKEN] Insufficient balance! Need {cost:.4f}, have {self.user_tokens[session_id]['balance']:.2f}")
            return False

    def purchase_batch_upgrade(self, session_id: str) -> tuple:
        """Double the session's batch multiplier; price doubles per level.

        Returns (success: bool, user-facing message: str).
        """
        self.initialize_user(session_id)
        user = self.user_tokens[session_id]

        current_mult = user["purchases"]["batch_multiplier"]
        # Level 0 costs BATCH_UPGRADE_BASE_COST; each doubling doubles the price.
        upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
        cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)

        if user["balance"] < cost:
            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {user['balance']:.2f}"

        new_mult = current_mult * 2  # computed once (was duplicated)
        user["balance"] -= cost
        user["purchases"]["batch_multiplier"] = new_mult
        logger.info(f"[TOKEN] Batch upgrade: {current_mult}x β†’ {new_mult}x | Cost: {cost:.5f}")
        return True, f"βœ… Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"

    def purchase_token_upgrade(self, session_id: str, extra_tokens: int = 1000) -> tuple:
        """Buy extra_tokens of additional response length.

        Returns (success: bool, user-facing message: str).
        """
        self.initialize_user(session_id)
        user = self.user_tokens[session_id]

        cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K

        if user["balance"] < cost:
            return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {user['balance']:.2f}"

        user["balance"] -= cost
        user["purchases"]["token_limit"] += extra_tokens
        new_limit = user["purchases"]["token_limit"]
        logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
        return True, f"βœ… Token limit now {new_limit}! (-{cost:.5f} tokens)"

    def get_balance(self, session_id: str) -> float:
        """Return the session's balance rounded to 2 decimals."""
        self.initialize_user(session_id)
        return round(self.user_tokens[session_id]["balance"], 2)

    def get_purchases(self, session_id: str) -> dict:
        """Return the session's purchases dict (live reference, not a copy)."""
        self.initialize_user(session_id)
        return self.user_tokens[session_id]["purchases"]

    def end_session(self, session_id: str):
        """Log session stats and return a summary; state is kept for tracking."""
        if session_id in self.user_tokens:
            stats = self.user_tokens[session_id]
            logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
            # Don't delete - keep for monthly tracking
            return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session."
        return "No active session found."


# Global token manager
token_manager = TokenManager()
318
+
319
  # Global cache manager
320
  model_cache = ModelCacheManager()
321
 
 
381
  self.api = HfApi(token=HF_TOKEN)
382
  self.telemetry = TelemetryManager(self.api)
383
  self.llm: Optional[Llama] = None
384
+ self.active_model_info = {"repo": "", "file": "", "format": ""}
385
  self.kernel_lock = threading.Lock()
386
  self.is_prefilling = False
387
  self.perf_stats = {
 
391
  "peak_tps": 0.0,
392
  "cache_hits": 0
393
  }
394
+ self.prompt_cache = {}
395
  self.last_activity = time.time()
396
+ self.idle_timeout = 20
397
  self.auto_cleanup_thread = None
398
  self.start_idle_monitor()
399
 
 
402
  self.typing_timer = None
403
  self.preprocessed_tokens = None
404
 
405
+ # Custom parameters (user-configurable)
406
+ self.custom_params = {
407
+ "temperature": 0.7,
408
+ "top_p": 0.95,
409
+ "top_k": 40,
410
+ "repeat_penalty": 1.1,
411
+ "batch_size_override": None, # None = auto
412
+ "max_tokens_override": None # None = auto
413
+ }
414
+
415
def detect_model_format(self, filename: str, repo: str) -> str:
    """Identify the model architecture family from the repo id and filename.

    Scans the lowercased "repo filename" text for each MODEL_FORMATS
    pattern and returns the first family that matches; unknown models
    fall back to "llama".
    """
    haystack = f"{repo.lower()} {filename.lower()}"
    for fmt_name, fmt_info in MODEL_FORMATS.items():
        if any(needle in haystack for needle in fmt_info["pattern"]):
            logger.info(f"[FORMAT-DETECT] Detected {fmt_name.upper()} architecture")
            return fmt_name
    # Nothing matched - assume the most common template family.
    logger.warning(f"[FORMAT-DETECT] Unknown format, defaulting to llama")
    return "llama"
427
+
428
  def detect_quantization(self, filename: str) -> dict:
429
  """Detect quantization method from filename and return optimizations"""
430
  filename_upper = filename.upper()
 
521
  logger.error(f"Scan error: {e}")
522
  return []
523
 
524
def boot_kernel(self, repo: str, filename: str, session_id: str = None) -> str:
    """Download, validate, and load a GGUF model with CPU-tuned settings.

    Pipeline: detect quantization and architecture from the filename/repo,
    download via hf_hub_download, validate RAM headroom, destroy any
    previously loaded model, derive batch/context/thread settings, then
    construct the Llama instance and warm it up.

    Args:
        repo: HuggingFace repository id.
        filename: GGUF file within the repo; quantization is inferred from it.
        session_id: optional token-system session; when given, the user's
            purchased batch multiplier scales the batch size.

    Returns:
        A status string: 🟒-prefixed on success, πŸ”΄-prefixed on any failure.
        Errors are reported through the return value, never raised.
    """
    try:
        if not repo or not filename:
            return "πŸ”΄ ERROR: Repository or filename missing"

        # NOTE(review): "(unknown)" looks like a mangled placeholder —
        # probably meant {filename}; confirm against the original source.
        logger.info(f"[BOOT] Starting download: (unknown) from {repo}")

        # DETECT QUANTIZATION FROM FILENAME
        quant_config = self.detect_quantization(filename)

        # DETECT MODEL FORMAT/ARCHITECTURE
        model_format = self.detect_model_format(filename, repo)

        # Download with timeout protection
        try:
            path = hf_hub_download(
                repo_id=repo,
                filename=filename,
                token=HF_TOKEN,
                local_files_only=False
            )
            logger.info(f"[BOOT] Download complete: {path}")
        except Exception as e:
            logger.error(f"[BOOT] Download failed: {e}")
            return f"πŸ”΄ DOWNLOAD FAILED: {str(e)}"

        # Check if model is cached
        is_cached = model_cache.is_cached(path)
        cache_status = "🎯 CACHED" if is_cached else "πŸ†• NEW"

        # Validate before loading (RAM headroom check)
        valid, msg = ResourceMonitor.validate_deployment(path)
        if not valid:
            logger.warning(f"[BOOT] Validation failed: {msg}")
            return f"πŸ”΄ VALIDATION FAILED: {msg}"

        logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")

        # Load model with MAXIMUM PERFORMANCE SETTINGS
        with self.kernel_lock:
            # WRECK OLD MODEL: free the previous model's RAM before loading.
            if self.llm:
                logger.info("[BOOT] πŸ’£ WRECKING old model...")
                try:
                    model_cache.wreck_old_model_cache()
                    del self.llm
                    self.llm = None
                    nuclear_ram_clear()
                    logger.info("[BOOT] βœ… Old model DESTROYED")
                except Exception as e:
                    # Best-effort cleanup; loading proceeds regardless.
                    logger.warning(f"[BOOT] Cleanup warning: {e}")

            # Calculate optimal parameters with token purchases
            vm = psutil.virtual_memory()
            available_ram_gb = vm.available / (1024**3)

            # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
            # Base calculation: use more RAM for batching on CPU
            base_batch = int(512 * available_ram_gb / 8)  # More aggressive base
            optimal_batch = int(base_batch * quant_config["batch_multiplier"])

            # Apply user's batch multiplier from token purchases
            if session_id:
                user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
                optimal_batch = int(optimal_batch * user_batch_mult)
                logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")

            # CPU can handle larger batches with quantized models
            optimal_batch = max(1024, min(8192, optimal_batch))  # 1024-8192 range for CPU

            # Context size from the quantization profile
            optimal_ctx = quant_config["ctx_size"]

            # Reduce context for Gemma models (they have 131K n_ctx_train)
            if model_format == "gemma":
                optimal_ctx = min(8192, optimal_ctx)  # Gemma works better with lower ctx
                logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")

            # Thread optimization - use ALL threads on CPU (including hyperthreading)
            optimal_threads = psutil.cpu_count(logical=True)  # ALL logical cores
            logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")

            try:
                logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")

                # Preload cache if available
                if is_cached:
                    model_cache.preload_cache(path)

                # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
                init_params = {
                    "model_path": path,
                    "n_ctx": optimal_ctx,
                    "n_threads": optimal_threads,
                    "n_threads_batch": optimal_threads,
                    "use_mmap": USE_MMAP,  # Critical for CPU
                    "use_mlock": MLOCK_MODEL,  # Let OS manage memory
                    "n_batch": optimal_batch,  # MASSIVE batches for CPU
                    "n_gpu_layers": 0,  # CPU-only
                    "rope_scaling_type": 0,
                    "rope_freq_scale": ROPE_SCALING,
                    "verbose": False,
                    "logits_all": False,
                    "embedding": False,
                    "f16_kv": False  # Use quantized KV cache
                }

                # Add KV quantization only if not Gemma (Gemma can be finicky)
                if model_format != "gemma" and KV_CACHE_QUANTIZATION:
                    # NOTE(review): 2 is presumably the llama.cpp Q4_0 ggml
                    # type id — confirm against the installed llama-cpp-python.
                    init_params["type_k"] = 2
                    init_params["type_v"] = 2
                    logger.info("[OPTIM] KV cache quantization enabled (Q4)")

                self.llm = Llama(**init_params)

                self.active_model_info = {
                    "repo": repo,
                    "file": filename,
                    "quant": quant_config['type'],
                    "format": model_format
                }
                self.telemetry.track_load(repo, filename)

                # Extract and cache signature for faster future boots
                if not is_cached:
                    logger.info("[BOOT] Extracting cache signature...")
                    signature = model_cache.extract_cache_signature(path)
                    if signature:
                        model_cache.save_to_cache(path, signature)

                # Warm-up: best-effort one-token run; failures are non-fatal.
                logger.info("[BOOT] Warming up model caches...")
                try:
                    self.llm("Warmup", max_tokens=1, stream=False)
                    force_gc()
                except:
                    pass

                logger.info("[BOOT] πŸš€ CPU-OPTIMIZED MODEL READY!")
                return f"🟒 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"

            except Exception as e:
                logger.error(f"[BOOT] Model loading failed: {e}")
                self.llm = None
                nuclear_ram_clear()
                return f"πŸ”΄ LOAD FAILED: {str(e)}"

    except Exception as e:
        logger.error(f"[BOOT] Unexpected error: {e}")
        nuclear_ram_clear()
        return f"πŸ”΄ BOOT FAILURE: {str(e)}"
676
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
677
  try:
678
  if not repo or not filename:
 
883
  first_token_time = None
884
 
885
  try:
886
+ # Get max tokens from user purchases
887
+ max_tokens = 2048
888
+ if session_id:
889
+ max_tokens = token_manager.get_purchases(session_id)["token_limit"]
890
+
891
+ # HYPER-OPTIMIZED CPU INFERENCE SETTINGS
892
  stream = self.llm(
893
  formatted_prompt,
894
+ max_tokens=max_tokens,
895
  stop=["User:", "<|eot_id|>", "\n\n"],
896
  stream=True,
897
+ temperature=self.custom_params["temperature"],
898
+ top_p=self.custom_params["top_p"],
899
+ top_k=self.custom_params["top_k"],
900
+ repeat_penalty=self.custom_params["repeat_penalty"],
901
+ frequency_penalty=0.0,
902
+ presence_penalty=0.0,
903
+ tfs_z=1.0,
904
+ typical_p=1.0,
905
+ mirostat_mode=2, # CPU benefits from mirostat
906
+ mirostat_tau=5.0,
907
+ mirostat_eta=0.1,
908
  )
909
 
910
  for chunk in stream:
 
924
  if tps > self.perf_stats["peak_tps"]:
925
  self.perf_stats["peak_tps"] = tps
926
 
927
+ # Charge tokens every second
928
+ if int(elapsed * 1000) % 1000 < 100 and session_id: # Every ~1 second
929
+ token_manager.charge_usage(session_id, elapsed * 1000)
930
+
931
  # Update history with streaming content + performance metrics
932
+ balance = token_manager.get_balance(session_id) if session_id else 0
933
+ history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’° {balance:.2f} tokens`"
934
  yield history
935
 
936
+ # Final token charge for remaining time
937
+ if session_id:
938
+ token_manager.charge_usage(session_id, elapsed * 1000)
939
+
940
  # Update global performance stats
941
  self.perf_stats["total_tokens"] += tokens_count
942
  self.perf_stats["total_time"] += elapsed
 
1060
  # --- UI INTERFACE ---
1061
  kernel = ZeroEngine()
1062
 
1063
+ # Session ID for token tracking
1064
+ session_id = token_manager.get_session_id()
1065
+
1066
+ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1067
+ # Header with Token Display
1068
+ with gr.Row():
1069
+ with gr.Column(scale=8):
1070
+ gr.HTML("""
1071
+ <div style='text-align: center; padding: 30px; border-radius: 24px;
1072
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
1073
+ margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
1074
+ <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
1075
+ -webkit-background-clip: text; -webkit-text-fill-color: transparent;
1076
+ font-family: Consolas, monospace;'>
1077
+ πŸ›°οΈ ZEROENGINE V0.2
1078
+ </h1>
1079
+ <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
1080
+ CPU-Optimized | Token System | Custom Parameters | Auto-Format
1081
+ </p>
1082
+ </div>
1083
+ """)
1084
+ with gr.Column(scale=2):
1085
+ # Token Display
1086
+ gr.HTML("""
1087
+ <div style='text-align: center; padding: 20px; border-radius: 20px;
1088
+ background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
1089
+ margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
1090
+ <div style='font-size: 2em; margin-bottom: 5px;'>πŸ’°</div>
1091
+ <div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
1092
+ 100.00
1093
+ </div>
1094
+ <div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
1095
+ </div>
1096
+ """)
1097
+ token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
1098
+ end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
1099
+ session_status = gr.Markdown("", visible=False)
1100
 
1101
  with gr.Row():
1102
  with gr.Column(scale=8):
1103
  chat_box = gr.Chatbot(
1104
  label="Main Engine Feedback",
1105
+ height=600,
1106
  show_label=False,
1107
  autoscroll=True,
1108
  container=True
 
1117
  )
1118
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
1119
 
1120
+ with gr.Column(scale=4):
1121
+ # Hardware Status
1122
  gr.Markdown("### πŸ› οΈ Hardware Status")
1123
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
1124
  cpu_metric = gr.Label(label="CPU Load", value="0%")
1125
 
1126
  gr.Markdown("---")
1127
+
1128
+ # Model Control
1129
  gr.Markdown("### πŸ“‘ Model Control")
1130
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
1131
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
 
1137
  boot_status = gr.Markdown("Status: `STANDBY`")
1138
 
1139
  gr.Markdown("---")
1140
+
1141
+ # Custom Parameters
1142
+ gr.Markdown("### βš™οΈ Custom Parameters")
1143
+ temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
1144
+ top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
1145
+ top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
1146
+ repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
1147
+
1148
+ gr.Markdown("---")
1149
+
1150
+ # Token Purchases
1151
+ gr.Markdown("### πŸ’Ž Token Upgrades")
1152
+ with gr.Row():
1153
+ batch_upgrade_btn = gr.Button("πŸš€ Batch x2", size="sm", variant="secondary")
1154
+ token_upgrade_btn = gr.Button("πŸ“ˆ +1K Tokens", size="sm", variant="secondary")
1155
+ purchase_status = gr.Markdown("Ready to upgrade!")
1156
+
1157
+ gr.Markdown("---")
1158
+
1159
+ # Ghost Cache
1160
  gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
1161
  ghost_buffer = gr.Textbox(
1162
  label="Background Context",
 
1170
  log_output = gr.Code(
1171
  label="Kernel Logs",
1172
  language="shell",
1173
+ value="[INIT] V0.2 System Ready.",
1174
  lines=5
1175
  )
1176
 
 
1178
def update_stats():
    """Refresh RAM/CPU metrics and the token balance for the UI timer.

    Returns a 3-tuple of display strings (ram, cpu, balance); on any
    failure returns placeholder strings so the timer callback never raises.

    Fix: removed the unreachable trailing `return "Error", "Error"` that
    the diff merge left behind — it was dead code and, if ever reached,
    would have returned the wrong arity for the timer's three outputs.
    """
    try:
        m = ResourceMonitor.get_metrics()
        balance = token_manager.get_balance(session_id)
        return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
    except Exception as e:
        logger.error(f"Stats update error: {e}")
        return "Error", "Error", "0.00"
1187
 
1188
  def on_scan(repo):
 
1208
  return
1209
 
1210
  yield "βš™οΈ System: Initiating boot sequence...", gr.update()
1211
+ time.sleep(0.5)
1212
 
1213
+ result = kernel.boot_kernel(repo, file, session_id)
1214
  yield result, gr.update()
1215
 
1216
  except Exception as e:
1217
  logger.error(f"Boot UI error: {e}")
1218
  yield f"πŸ”΄ BOOT ERROR: {str(e)}", gr.update()
1219
+
1220
def on_batch_upgrade():
    """Buy a batch-size doubling and report the message plus new balance."""
    _, message = token_manager.purchase_batch_upgrade(session_id)
    return message, str(token_manager.get_balance(session_id))

def on_token_upgrade():
    """Buy +1000 response tokens and report the message plus new balance."""
    _, message = token_manager.purchase_token_upgrade(session_id, 1000)
    return message, str(token_manager.get_balance(session_id))

def on_end_session():
    """Close out the current token session and return its summary text."""
    return token_manager.end_session(session_id)

def update_custom_params(temp, top_p, top_k, repeat_pen):
    """Push the slider values into the kernel's sampling parameters."""
    kernel.custom_params.update(
        temperature=temp,
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_pen,
    )
    return "βœ… Parameters updated!"
1240
 
1241
+ # Timer for periodic stats updates (includes token balance)
1242
  timer = gr.Timer(value=2)
1243
+ timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])
1244
 
1245
  # Event handlers
1246
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
1247
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
1248
 
1249
+ # Token purchases
1250
+ batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
1251
+ token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
1252
+ end_session_btn.click(on_end_session, None, [session_status])
1253
+
1254
+ # Custom parameter updates
1255
+ temperature_slider.change(update_custom_params,
1256
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1257
+ [purchase_status])
1258
+ top_p_slider.change(update_custom_params,
1259
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1260
+ [purchase_status])
1261
+ top_k_slider.change(update_custom_params,
1262
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1263
+ [purchase_status])
1264
+ repeat_penalty_slider.change(update_custom_params,
1265
+ [temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
1266
+ [purchase_status])
1267
+
1268
+ # Ghost cache
1269
  stitch_btn.click(
1270
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
1271
  [ghost_buffer],
1272
  [stitch_status]
1273
  )
1274
 
1275
+ # Keyboard input preprocessing
1276
  user_input.change(
1277
  lambda x: kernel.preprocess_input(x),
1278
  [user_input],
1279
  None
1280
  )
1281
 
1282
+ # Auto-boot enabled inference
1283
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
1284
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
1285
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])