turtle170 committed on
Commit
b9fa083
·
verified ·
1 Parent(s): 7046421

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +565 -68
app.py CHANGED
@@ -31,32 +31,47 @@ SYSTEM_RESERVE_MB = 500
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
 
 
 
 
 
 
34
  # --- SPEED OPTIMIZATION CONFIG ---
35
- FLASH_ATTENTION = True # Enable Flash Attention 2
36
- KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
- CONTINUOUS_BATCHING = True # Enable continuous batching
38
- SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
- MLOCK_MODEL = False # Disabled: prevents swapping but uses more RAM
40
- USE_MMAP = True # Memory-mapped file loading
41
- OFFLOAD_KQV = False # CPU-only, no offload needed
42
- OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
- ROPE_SCALING = 1.0 # RoPE frequency scaling
44
- NUMA_OPTIMIZE = True # NUMA-aware memory allocation
45
- AGGRESSIVE_GC = True # Aggressive garbage collection
46
-
47
- # Quantization detection and optimization mapping
48
  QUANT_OPTIMIZATIONS = {
49
- "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
50
- "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
51
- "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
52
- "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
53
- "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
54
- "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
55
- "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
56
- "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
57
- "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
58
- "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
59
- "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
 
 
 
 
 
 
 
 
 
60
  }
61
 
62
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
@@ -207,6 +222,212 @@ class ModelCacheManager:
207
  logger.error(f"[WRECKER] Failed: {e}")
208
  return False
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Global cache manager
211
  model_cache = ModelCacheManager()
212
 
@@ -272,7 +493,7 @@ class ZeroEngine:
272
  self.api = HfApi(token=HF_TOKEN)
273
  self.telemetry = TelemetryManager(self.api)
274
  self.llm: Optional[Llama] = None
275
- self.active_model_info = {"repo": "", "file": ""}
276
  self.kernel_lock = threading.Lock()
277
  self.is_prefilling = False
278
  self.perf_stats = {
@@ -282,9 +503,9 @@ class ZeroEngine:
282
  "peak_tps": 0.0,
283
  "cache_hits": 0
284
  }
285
- self.prompt_cache = {} # Cache for repeated prompts
286
  self.last_activity = time.time()
287
- self.idle_timeout = 20 # 20 seconds idle timeout
288
  self.auto_cleanup_thread = None
289
  self.start_idle_monitor()
290
 
@@ -293,6 +514,29 @@ class ZeroEngine:
293
  self.typing_timer = None
294
  self.preprocessed_tokens = None
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def detect_quantization(self, filename: str) -> dict:
297
  """Detect quantization method from filename and return optimizations"""
298
  filename_upper = filename.upper()
@@ -389,7 +633,158 @@ class ZeroEngine:
389
  logger.error(f"Scan error: {e}")
390
  return []
391
 
392
- def boot_kernel(self, repo: str, filename: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
394
  try:
395
  if not repo or not filename:
@@ -547,7 +942,7 @@ class ZeroEngine:
547
  threading.Thread(target=_bg_eval, daemon=True).start()
548
  return "⚑ Primed"
549
 
550
- def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
551
  # Update activity timestamp
552
  self.update_activity()
553
 
@@ -600,23 +995,28 @@ class ZeroEngine:
600
  first_token_time = None
601
 
602
  try:
603
- # HYPER-OPTIMIZED INFERENCE SETTINGS
 
 
 
 
 
604
  stream = self.llm(
605
  formatted_prompt,
606
- max_tokens=2048, # Increased output length
607
  stop=["User:", "<|eot_id|>", "\n\n"],
608
  stream=True,
609
- temperature=0.7, # Balanced creativity
610
- top_p=0.95, # Nucleus sampling
611
- top_k=40, # Top-K sampling
612
- repeat_penalty=1.1, # Prevent repetition
613
- frequency_penalty=0.0, # No frequency penalty
614
- presence_penalty=0.0, # No presence penalty
615
- tfs_z=1.0, # Tail-free sampling
616
- typical_p=1.0, # Typical sampling
617
- mirostat_mode=2, # Mirostat v2 (perplexity control)
618
- mirostat_tau=5.0, # Target perplexity
619
- mirostat_eta=0.1, # Learning rate
620
  )
621
 
622
  for chunk in stream:
@@ -636,10 +1036,19 @@ class ZeroEngine:
636
  if tps > self.perf_stats["peak_tps"]:
637
  self.perf_stats["peak_tps"] = tps
638
 
 
 
 
 
639
  # Update history with streaming content + performance metrics
640
- history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’Ύ Cache: {self.perf_stats['cache_hits']}`"
 
641
  yield history
642
 
 
 
 
 
643
  # Update global performance stats
644
  self.perf_stats["total_tokens"] += tokens_count
645
  self.perf_stats["total_time"] += elapsed
@@ -763,27 +1172,49 @@ h1, h2, h3, h4, h5, h6 {
763
  # --- UI INTERFACE ---
764
  kernel = ZeroEngine()
765
 
766
- with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
767
- gr.HTML("""
768
- <div style='text-align: center; padding: 30px; border-radius: 24px;
769
- background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
770
- margin-bottom: 30px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
771
- <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
772
- -webkit-background-clip: text; -webkit-text-fill-color: transparent;
773
- font-family: Consolas, monospace;'>
774
- πŸ›°οΈ ZEROENGINE V0.1
775
- </h1>
776
- <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
777
- Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
778
- </p>
779
- </div>
780
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
 
782
  with gr.Row():
783
  with gr.Column(scale=8):
784
  chat_box = gr.Chatbot(
785
  label="Main Engine Feedback",
786
- height=650,
787
  show_label=False,
788
  autoscroll=True,
789
  container=True
@@ -798,12 +1229,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
798
  )
799
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
800
 
801
- with gr.Column(scale=3):
 
802
  gr.Markdown("### πŸ› οΈ Hardware Status")
803
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
804
  cpu_metric = gr.Label(label="CPU Load", value="0%")
805
 
806
  gr.Markdown("---")
 
 
807
  gr.Markdown("### πŸ“‘ Model Control")
808
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
809
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
@@ -815,6 +1249,26 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
815
  boot_status = gr.Markdown("Status: `STANDBY`")
816
 
817
  gr.Markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
819
  ghost_buffer = gr.Textbox(
820
  label="Background Context",
@@ -828,7 +1282,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
828
  log_output = gr.Code(
829
  label="Kernel Logs",
830
  language="shell",
831
- value="[INIT] System Ready.",
832
  lines=5
833
  )
834
 
@@ -836,9 +1290,11 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
836
  def update_stats():
837
  try:
838
  m = ResourceMonitor.get_metrics()
839
- return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
 
840
  except Exception as e:
841
  logger.error(f"Stats update error: {e}")
 
842
  return "Error", "Error"
843
 
844
  def on_scan(repo):
@@ -864,37 +1320,78 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
864
  return
865
 
866
  yield "βš™οΈ System: Initiating boot sequence...", gr.update()
867
- time.sleep(0.5) # Small delay for UI feedback
868
 
869
- result = kernel.boot_kernel(repo, file)
870
  yield result, gr.update()
871
 
872
  except Exception as e:
873
  logger.error(f"Boot UI error: {e}")
874
  yield f"πŸ”΄ BOOT ERROR: {str(e)}", gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
 
876
- # Timer for periodic stats updates
877
  timer = gr.Timer(value=2)
878
- timer.tick(update_stats, None, [ram_metric, cpu_metric])
879
 
880
  # Event handlers
881
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
882
  boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  stitch_btn.click(
885
  lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
886
  [ghost_buffer],
887
  [stitch_status]
888
  )
889
 
890
- # Keyboard input preprocessing (tokenize while typing)
891
  user_input.change(
892
  lambda x: kernel.preprocess_input(x),
893
  [user_input],
894
  None
895
  )
896
 
897
- # Auto-boot enabled inference - passes repo and quant for auto-boot
898
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
899
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
900
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])
 
31
# Model loaded when the user has not picked one.
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# --- TOKEN SYSTEM CONFIG ---
MONTHLY_TOKEN_CREDITS = 100.0
TOKEN_COST_PER_100MS = 0.001
BATCH_UPGRADE_BASE_COST = 0.00005  # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
TOKEN_UPGRADE_COST_PER_1K = 0.0001  # Cost per 1000 extra tokens

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = False  # Disabled for CPU (GPU-only feature)
KV_CACHE_QUANTIZATION = True  # Keep for RAM savings
CONTINUOUS_BATCHING = False  # CPU doesn't benefit much
SPECULATIVE_DECODE = False  # CPU-only, no draft model
MLOCK_MODEL = False  # Don't lock - allow OS to manage memory
USE_MMAP = True  # Critical for CPU - fast loading
OFFLOAD_KQV = False  # CPU-only
# Use ALL logical threads (including hyperthreading).  psutil.cpu_count()
# returns None when the count cannot be determined, so fall back to a single
# thread instead of leaving a None that cannot be used as a thread count.
OPTIMAL_THREADS = psutil.cpu_count(logical=True) or 1
ROPE_SCALING = 1.0
NUMA_OPTIMIZE = False  # Disabled - can cause issues on some systems
AGGRESSIVE_GC = True

# Quantization detection - CPU-optimized batch multipliers (more aggressive).
# Keys are quantization tags; each value carries the batch-size multiplier,
# the context size and the thread multiplier applied for that quantization.
QUANT_OPTIMIZATIONS = {
    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},  # MASSIVE for CPU
    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
}

# Model format/architecture detection patterns.  "pattern" lists lower-case
# substrings looked for in the repo id / file name; "template" names the
# prompt template associated with that family.
MODEL_FORMATS = {
    "llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
    "gemma": {"pattern": ["gemma"], "template": "gemma"},
    "phi": {"pattern": ["phi"], "template": "phi"},
    "qwen": {"pattern": ["qwen"], "template": "chatml"},
    "deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
}
76
 
77
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
 
222
  logger.error(f"[WRECKER] Failed: {e}")
223
  return False
224
 
225
+ # --- TOKEN MANAGER ---
226
class TokenManager:
    """In-memory ledger of per-user token credits and purchased upgrades.

    Every account lives in ``self.user_tokens`` keyed by username; the class
    itself writes nothing to disk.  A falsy username is billed to the shared
    ``"anonymous"`` account, except for purchases and ``end_session``, which
    require a real name.  An account whose name matches ``owner_username``
    (case-insensitively) gets an infinite balance and is never charged.

    NOTE(review): the UI section of this file calls
    ``token_manager.get_username()``, which this class does not define —
    confirm where that method is supposed to come from.
    """

    # Length of one billing period; balances are refilled once it has elapsed.
    RESET_PERIOD_SECONDS = 30 * 24 * 60 * 60  # 30 days
    # Response-length limit every new account starts with.
    DEFAULT_TOKEN_LIMIT = 2048

    def __init__(self):
        # {username: {"balance", "start_time", "purchases", "total_spent",
        #             "is_owner", "username", "last_reset"}}
        self.user_tokens = {}
        self.owner_username = "turtle170"  # Owner gets infinite tokens

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _resolve(username):
        """Map a missing/empty username onto the shared anonymous account."""
        return username if username else "anonymous"

    def _new_account(self, username, balance, is_owner):
        """Build the record stored in ``user_tokens`` for a fresh account."""
        now = time.time()
        return {
            "balance": balance,
            "start_time": now,
            "purchases": {
                "batch_multiplier": 1,
                "token_limit": self.DEFAULT_TOKEN_LIMIT,
            },
            "total_spent": 0.0,
            "is_owner": is_owner,
            "username": username,
            "last_reset": now,
        }

    def _account(self, username):
        """Return the up-to-date record for *username*, creating it if needed."""
        self.initialize_user(username)
        self.check_monthly_reset(username)
        return self.user_tokens[username]

    @staticmethod
    def _try_spend(account, cost):
        """Deduct *cost* and record it as spent; return False if unaffordable."""
        if account["balance"] < cost:
            return False
        account["balance"] -= cost
        account["total_spent"] += cost
        return True

    # --------------------------------------------------------------- public API
    def is_owner(self, username: str) -> bool:
        """Return True when *username* is the owner (case-insensitive)."""
        if not username:
            return False
        return username.lower() == self.owner_username.lower()

    def initialize_user(self, username: str):
        """Create the account on first sight; existing accounts are untouched.

        New users start with ``MONTHLY_TOKEN_CREDITS``; the owner starts with
        an infinite balance.
        """
        username = self._resolve(username)
        if username in self.user_tokens:
            return

        if self.is_owner(username):
            self.user_tokens[username] = self._new_account(username, float("inf"), True)
            logger.info(f"[TOKEN] 👑 OWNER {username} initialized with INFINITE tokens!")
        else:
            self.user_tokens[username] = self._new_account(
                username, MONTHLY_TOKEN_CREDITS, False
            )
            logger.info(f"[TOKEN] New user {username}: {MONTHLY_TOKEN_CREDITS} tokens")

    def check_monthly_reset(self, username: str):
        """Refill a known, non-owner account once a billing period has passed."""
        if not username or username not in self.user_tokens:
            return

        account = self.user_tokens[username]
        if account.get("is_owner", False):
            return  # Owner never needs reset

        last_reset = account.get("last_reset", time.time())
        if time.time() - last_reset > self.RESET_PERIOD_SECONDS:
            account["balance"] = MONTHLY_TOKEN_CREDITS
            account["last_reset"] = time.time()
            account["total_spent"] = 0.0
            logger.info(f"[TOKEN] Monthly reset for {username}: {MONTHLY_TOKEN_CREDITS} tokens")

    def charge_usage(self, username: str, duration_ms: float) -> bool:
        """Charge *duration_ms* of inference time to the user.

        Returns True when the charge went through (always for the owner).
        Returns False when the account is empty, or when the charge exceeds
        the remaining balance — in that case the balance is set to 0.
        """
        username = self._resolve(username)
        account = self._account(username)

        # Owner never gets charged
        if account.get("is_owner", False):
            return True

        # A negative duration must never turn into a credit.
        cost = max(0.0, (duration_ms / 100.0) * TOKEN_COST_PER_100MS)

        if account["balance"] <= 0:
            logger.warning(f"[TOKEN] ❌ {username} has 0 tokens! Access denied.")
            return False

        if self._try_spend(account, cost):
            account["balance"] = max(0.0, account["balance"])  # Never go below 0
            logger.info(
                f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | "
                f"Remaining: {account['balance']:.2f}"
            )
            return True

        # Insufficient balance - set to 0 and deny
        account["balance"] = 0
        logger.warning(f"[TOKEN] ❌ Insufficient balance! {username} now at 0 tokens.")
        return False

    def can_use_engine(self, username: str) -> tuple:
        """Check if user can use the engine. Returns ``(allowed, message)``."""
        username = self._resolve(username)
        account = self._account(username)

        if account.get("is_owner", False):
            return True, "👑 Owner access granted"

        balance = account["balance"]
        if balance <= 0:
            last_reset = account.get("last_reset", time.time())
            time_until_reset = self.RESET_PERIOD_SECONDS - (time.time() - last_reset)
            days_left = max(0, int(time_until_reset / (24 * 60 * 60)))
            return False, f"❌ Out of tokens! Resets in {days_left} days. Current balance: 0.00"

        return True, f"✅ Access granted. Balance: {balance:.2f} tokens"

    def purchase_batch_upgrade(self, username: str) -> tuple:
        """Double the user's batch multiplier. Returns ``(success, message)``.

        The price doubles with every level already bought
        (``BATCH_UPGRADE_BASE_COST * 2 ** level``); the owner upgrades free.
        """
        if not username:
            return False, "❌ Please login first"

        account = self._account(username)
        purchases = account["purchases"]
        current_mult = purchases["batch_multiplier"]
        new_mult = current_mult * 2

        # Owner gets free upgrades
        if account.get("is_owner", False):
            purchases["batch_multiplier"] = new_mult
            logger.info(f"[TOKEN] 👑 OWNER free batch upgrade: {current_mult}x → {new_mult}x")
            return True, f"👑 FREE UPGRADE! Batch now {new_mult}x!"

        # The multiplier only ever doubles from 1, so its bit length gives the
        # number of upgrades bought so far without any float rounding.
        upgrade_level = max(0, int(current_mult).bit_length() - 1)
        cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)

        if not self._try_spend(account, cost):
            return False, (
                f"❌ Insufficient tokens! Need {cost:.5f}, have {account['balance']:.2f}"
            )

        purchases["batch_multiplier"] = new_mult
        logger.info(f"[TOKEN] Batch upgrade: {current_mult}x → {new_mult}x | Cost: {cost:.5f}")
        return True, f"✅ Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"

    def purchase_token_upgrade(self, username: str, extra_tokens: int = 1000) -> tuple:
        """Raise the user's response-length limit by *extra_tokens*.

        Returns ``(success, message)``.  Costs ``TOKEN_UPGRADE_COST_PER_1K``
        per 1000 tokens; free for the owner.  Non-positive amounts are
        rejected so a "purchase" can never refund credits or shrink the limit.
        """
        if not username:
            return False, "❌ Please login first"

        if extra_tokens <= 0:
            return False, "❌ Upgrade size must be a positive number of tokens"

        account = self._account(username)
        purchases = account["purchases"]

        # Owner gets free upgrades
        if account.get("is_owner", False):
            purchases["token_limit"] += extra_tokens
            new_limit = purchases["token_limit"]
            logger.info(f"[TOKEN] 👑 OWNER free token upgrade: +{extra_tokens} tokens")
            return True, f"👑 FREE UPGRADE! Token limit now {new_limit}!"

        cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K

        if not self._try_spend(account, cost):
            return False, (
                f"❌ Insufficient tokens! Need {cost:.5f}, have {account['balance']:.2f}"
            )

        purchases["token_limit"] += extra_tokens
        new_limit = purchases["token_limit"]
        logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
        return True, f"✅ Token limit now {new_limit}! (-{cost:.5f} tokens)"

    def get_balance(self, username: str) -> float:
        """Return the balance rounded to 2 decimals (``inf`` for the owner)."""
        account = self._account(self._resolve(username))
        balance = account["balance"]

        # Show ∞ for owner
        if balance == float("inf"):
            return balance

        return round(max(0, balance), 2)  # Never show negative

    def get_purchases(self, username: str) -> dict:
        """Return the user's live ``purchases`` dict (not a copy)."""
        username = self._resolve(username)
        self.initialize_user(username)
        return self.user_tokens[username]["purchases"]

    def end_session(self, username: str):
        """Return a farewell summary; the account itself is left untouched."""
        if not username or username not in self.user_tokens:
            return "No active session found."

        stats = self.user_tokens[username]
        if stats.get("is_owner", False):
            return f"👑 Owner session ended. Welcome back anytime, {stats['username']}!"

        logger.info(
            f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, "
            f"Remaining {stats['balance']:.2f}"
        )
        return (
            f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. "
            f"Balance: {stats['balance']:.2f}"
        )
426
+
427
+ # Global token manager
428
+ import math
429
+ token_manager = TokenManager()
430
+
431
  # Global cache manager
432
  model_cache = ModelCacheManager()
433
 
 
493
  self.api = HfApi(token=HF_TOKEN)
494
  self.telemetry = TelemetryManager(self.api)
495
  self.llm: Optional[Llama] = None
496
+ self.active_model_info = {"repo": "", "file": "", "format": ""}
497
  self.kernel_lock = threading.Lock()
498
  self.is_prefilling = False
499
  self.perf_stats = {
 
503
  "peak_tps": 0.0,
504
  "cache_hits": 0
505
  }
506
+ self.prompt_cache = {}
507
  self.last_activity = time.time()
508
+ self.idle_timeout = 20
509
  self.auto_cleanup_thread = None
510
  self.start_idle_monitor()
511
 
 
514
  self.typing_timer = None
515
  self.preprocessed_tokens = None
516
 
517
+ # Custom parameters (user-configurable)
518
+ self.custom_params = {
519
+ "temperature": 0.7,
520
+ "top_p": 0.95,
521
+ "top_k": 40,
522
+ "repeat_penalty": 1.1,
523
+ "batch_size_override": None, # None = auto
524
+ "max_tokens_override": None # None = auto
525
+ }
526
+
527
def detect_model_format(self, filename: str, repo: str) -> str:
    """Guess the model architecture family from the repo id and file name.

    The lower-cased repo and file name are joined into one search string.
    Entries of MODEL_FORMATS are tried in dictionary order, and the first
    entry with any pattern occurring as a substring wins.  When nothing
    matches, a warning is logged and "llama" is returned.
    """
    haystack = " ".join((repo.lower(), filename.lower()))

    detected = next(
        (
            family
            for family, spec in MODEL_FORMATS.items()
            if any(needle in haystack for needle in spec["pattern"])
        ),
        None,
    )

    if detected is None:
        logger.warning("[FORMAT-DETECT] Unknown format, defaulting to llama")
        return "llama"

    logger.info(f"[FORMAT-DETECT] Detected {detected.upper()} architecture")
    return detected
539
+
540
  def detect_quantization(self, filename: str) -> dict:
541
  """Detect quantization method from filename and return optimizations"""
542
  filename_upper = filename.upper()
 
633
  logger.error(f"Scan error: {e}")
634
  return []
635
 
636
+ def boot_kernel(self, repo: str, filename: str, session_id: str = None) -> str:
637
+ """HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
638
+ try:
639
+ if not repo or not filename:
640
+ return "πŸ”΄ ERROR: Repository or filename missing"
641
+
642
+ logger.info(f"[BOOT] Starting download: {filename} from {repo}")
643
+
644
+ # DETECT QUANTIZATION FROM FILENAME
645
+ quant_config = self.detect_quantization(filename)
646
+
647
+ # DETECT MODEL FORMAT/ARCHITECTURE
648
+ model_format = self.detect_model_format(filename, repo)
649
+
650
+ # Download with timeout protection
651
+ try:
652
+ path = hf_hub_download(
653
+ repo_id=repo,
654
+ filename=filename,
655
+ token=HF_TOKEN,
656
+ local_files_only=False
657
+ )
658
+ logger.info(f"[BOOT] Download complete: {path}")
659
+ except Exception as e:
660
+ logger.error(f"[BOOT] Download failed: {e}")
661
+ return f"πŸ”΄ DOWNLOAD FAILED: {str(e)}"
662
+
663
+ # Check if model is cached
664
+ is_cached = model_cache.is_cached(path)
665
+ cache_status = "🎯 CACHED" if is_cached else "πŸ†• NEW"
666
+
667
+ # Validate before loading
668
+ valid, msg = ResourceMonitor.validate_deployment(path)
669
+ if not valid:
670
+ logger.warning(f"[BOOT] Validation failed: {msg}")
671
+ return f"πŸ”΄ VALIDATION FAILED: {msg}"
672
+
673
+ logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
674
+
675
+ # Load model with MAXIMUM PERFORMANCE SETTINGS
676
+ with self.kernel_lock:
677
+ # WRECK OLD MODEL
678
+ if self.llm:
679
+ logger.info("[BOOT] πŸ’£ WRECKING old model...")
680
+ try:
681
+ model_cache.wreck_old_model_cache()
682
+ del self.llm
683
+ self.llm = None
684
+ nuclear_ram_clear()
685
+ logger.info("[BOOT] βœ… Old model DESTROYED")
686
+ except Exception as e:
687
+ logger.warning(f"[BOOT] Cleanup warning: {e}")
688
+
689
+ # Calculate optimal parameters with token purchases
690
+ vm = psutil.virtual_memory()
691
+ available_ram_gb = vm.available / (1024**3)
692
+
693
+ # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
694
+ # Base calculation: use more RAM for batching on CPU
695
+ base_batch = int(512 * available_ram_gb / 8) # More aggressive base
696
+ optimal_batch = int(base_batch * quant_config["batch_multiplier"])
697
+
698
+ # Apply user's batch multiplier from token purchases
699
+ if session_id:
700
+ user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
701
+ optimal_batch = int(optimal_batch * user_batch_mult)
702
+ logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")
703
+
704
+ # CPU can handle larger batches with quantized models
705
+ optimal_batch = max(1024, min(8192, optimal_batch)) # 1024-8192 range for CPU
706
+
707
+ # Context size
708
+ optimal_ctx = quant_config["ctx_size"]
709
+
710
+ # Reduce context for Gemma models (they have 131K n_ctx_train)
711
+ if model_format == "gemma":
712
+ optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
713
+ logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
714
+
715
+ # Thread optimization - use ALL threads on CPU (including hyperthreading)
716
+ optimal_threads = psutil.cpu_count(logical=True) # ALL logical cores
717
+ logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")
718
+
719
+ try:
720
+ logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
721
+
722
+ # Preload cache if available
723
+ if is_cached:
724
+ model_cache.preload_cache(path)
725
+
726
+ # ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
727
+ init_params = {
728
+ "model_path": path,
729
+ "n_ctx": optimal_ctx,
730
+ "n_threads": optimal_threads,
731
+ "n_threads_batch": optimal_threads,
732
+ "use_mmap": USE_MMAP, # Critical for CPU
733
+ "use_mlock": MLOCK_MODEL, # Let OS manage memory
734
+ "n_batch": optimal_batch, # MASSIVE batches for CPU
735
+ "n_gpu_layers": 0, # CPU-only
736
+ "rope_scaling_type": 0,
737
+ "rope_freq_scale": ROPE_SCALING,
738
+ "verbose": False,
739
+ "logits_all": False,
740
+ "embedding": False,
741
+ "f16_kv": False # Use quantized KV cache
742
+ }
743
+
744
+ # Add KV quantization only if not Gemma (Gemma can be finicky)
745
+ if model_format != "gemma" and KV_CACHE_QUANTIZATION:
746
+ init_params["type_k"] = 2
747
+ init_params["type_v"] = 2
748
+ logger.info("[OPTIM] KV cache quantization enabled (Q4)")
749
+
750
+ self.llm = Llama(**init_params)
751
+
752
+ self.active_model_info = {
753
+ "repo": repo,
754
+ "file": filename,
755
+ "quant": quant_config['type'],
756
+ "format": model_format
757
+ }
758
+ self.telemetry.track_load(repo, filename)
759
+
760
+ # Extract and cache signature
761
+ if not is_cached:
762
+ logger.info("[BOOT] Extracting cache signature...")
763
+ signature = model_cache.extract_cache_signature(path)
764
+ if signature:
765
+ model_cache.save_to_cache(path, signature)
766
+
767
+ # Warm-up
768
+ logger.info("[BOOT] Warming up model caches...")
769
+ try:
770
+ self.llm("Warmup", max_tokens=1, stream=False)
771
+ force_gc()
772
+ except:
773
+ pass
774
+
775
+ logger.info("[BOOT] πŸš€ CPU-OPTIMIZED MODEL READY!")
776
+ return f"🟒 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"
777
+
778
+ except Exception as e:
779
+ logger.error(f"[BOOT] Model loading failed: {e}")
780
+ self.llm = None
781
+ nuclear_ram_clear()
782
+ return f"πŸ”΄ LOAD FAILED: {str(e)}"
783
+
784
+ except Exception as e:
785
+ logger.error(f"[BOOT] Unexpected error: {e}")
786
+ nuclear_ram_clear()
787
+ return f"πŸ”΄ BOOT FAILURE: {str(e)}"
788
  """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
789
  try:
790
  if not repo or not filename:
 
942
  threading.Thread(target=_bg_eval, daemon=True).start()
943
  return "⚑ Primed"
944
 
945
+ def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, username: str) -> Generator:
946
  # Update activity timestamp
947
  self.update_activity()
948
 
 
995
  first_token_time = None
996
 
997
  try:
998
+ # Get max tokens from user purchases
999
+ max_tokens = 2048
1000
+ if username:
1001
+ max_tokens = token_manager.get_purchases(username)["token_limit"]
1002
+
1003
+ # HYPER-OPTIMIZED CPU INFERENCE SETTINGS
1004
  stream = self.llm(
1005
  formatted_prompt,
1006
+ max_tokens=max_tokens,
1007
  stop=["User:", "<|eot_id|>", "\n\n"],
1008
  stream=True,
1009
+ temperature=self.custom_params["temperature"],
1010
+ top_p=self.custom_params["top_p"],
1011
+ top_k=self.custom_params["top_k"],
1012
+ repeat_penalty=self.custom_params["repeat_penalty"],
1013
+ frequency_penalty=0.0,
1014
+ presence_penalty=0.0,
1015
+ tfs_z=1.0,
1016
+ typical_p=1.0,
1017
+ mirostat_mode=2, # CPU benefits from mirostat
1018
+ mirostat_tau=5.0,
1019
+ mirostat_eta=0.1,
1020
  )
1021
 
1022
  for chunk in stream:
 
1036
  if tps > self.perf_stats["peak_tps"]:
1037
  self.perf_stats["peak_tps"] = tps
1038
 
1039
+ # Charge tokens every second
1040
+ if int(elapsed * 1000) % 1000 < 100 and username: # Every ~1 second
1041
+ token_manager.charge_usage(username, elapsed * 1000)
1042
+
1043
  # Update history with streaming content + performance metrics
1044
+ balance = token_manager.get_balance(username) if username else 0
1045
+ history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’° {balance:.2f} tokens`"
1046
  yield history
1047
 
1048
+ # Final token charge for remaining time
1049
+ if username:
1050
+ token_manager.charge_usage(username, elapsed * 1000)
1051
+
1052
  # Update global performance stats
1053
  self.perf_stats["total_tokens"] += tokens_count
1054
  self.perf_stats["total_time"] += elapsed
 
1172
  # --- UI INTERFACE ---
1173
  kernel = ZeroEngine()
1174
 
1175
+ # Session ID for token tracking
1176
+ username = token_manager.get_username()
1177
+
1178
+ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1179
+ # Header with Token Display
1180
+ with gr.Row():
1181
+ with gr.Column(scale=8):
1182
+ gr.HTML("""
1183
+ <div style='text-align: center; padding: 30px; border-radius: 24px;
1184
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
1185
+ margin-bottom: 20px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
1186
+ <h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
1187
+ -webkit-background-clip: text; -webkit-text-fill-color: transparent;
1188
+ font-family: Consolas, monospace;'>
1189
+ πŸ›°οΈ ZEROENGINE V0.2
1190
+ </h1>
1191
+ <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
1192
+ CPU-Optimized | Token System | Custom Parameters | Auto-Format
1193
+ </p>
1194
+ </div>
1195
+ """)
1196
+ with gr.Column(scale=2):
1197
+ # Token Display
1198
+ gr.HTML("""
1199
+ <div style='text-align: center; padding: 20px; border-radius: 20px;
1200
+ background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
1201
+ margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
1202
+ <div style='font-size: 2em; margin-bottom: 5px;'>πŸ’°</div>
1203
+ <div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
1204
+ 100.00
1205
+ </div>
1206
+ <div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
1207
+ </div>
1208
+ """)
1209
+ token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
1210
+ end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
1211
+ session_status = gr.Markdown("", visible=False)
1212
 
1213
  with gr.Row():
1214
  with gr.Column(scale=8):
1215
  chat_box = gr.Chatbot(
1216
  label="Main Engine Feedback",
1217
+ height=600,
1218
  show_label=False,
1219
  autoscroll=True,
1220
  container=True
 
1229
  )
1230
  send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
1231
 
1232
+ with gr.Column(scale=4):
1233
+ # Hardware Status
1234
  gr.Markdown("### πŸ› οΈ Hardware Status")
1235
  ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
1236
  cpu_metric = gr.Label(label="CPU Load", value="0%")
1237
 
1238
  gr.Markdown("---")
1239
+
1240
+ # Model Control
1241
  gr.Markdown("### πŸ“‘ Model Control")
1242
  repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
1243
  quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
 
1249
  boot_status = gr.Markdown("Status: `STANDBY`")
1250
 
1251
  gr.Markdown("---")
1252
+
1253
+ # Custom Parameters
1254
+ gr.Markdown("### βš™οΈ Custom Parameters")
1255
+ temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
1256
+ top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
1257
+ top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
1258
+ repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
1259
+
1260
+ gr.Markdown("---")
1261
+
1262
+ # Token Purchases
1263
+ gr.Markdown("### πŸ’Ž Token Upgrades")
1264
+ with gr.Row():
1265
+ batch_upgrade_btn = gr.Button("πŸš€ Batch x2", size="sm", variant="secondary")
1266
+ token_upgrade_btn = gr.Button("πŸ“ˆ +1K Tokens", size="sm", variant="secondary")
1267
+ purchase_status = gr.Markdown("Ready to upgrade!")
1268
+
1269
+ gr.Markdown("---")
1270
+
1271
+ # Ghost Cache
1272
  gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
1273
  ghost_buffer = gr.Textbox(
1274
  label="Background Context",
 
1282
  log_output = gr.Code(
1283
  label="Kernel Logs",
1284
  language="shell",
1285
+ value="[INIT] V0.2 System Ready.",
1286
  lines=5
1287
  )
1288
 
 
1290
  def update_stats():
1291
  try:
1292
  m = ResourceMonitor.get_metrics()
1293
+ balance = token_manager.get_balance(session_id)
1294
+ return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
1295
  except Exception as e:
1296
  logger.error(f"Stats update error: {e}")
1297
+ return "Error", "Error", "0.00"
1298
  return "Error", "Error"
1299
 
1300
  def on_scan(repo):
 
1320
  return
1321
 
1322
  yield "βš™οΈ System: Initiating boot sequence...", gr.update()
1323
+ time.sleep(0.5)
1324
 
1325
+ result = kernel.boot_kernel(repo, file, session_id)
1326
  yield result, gr.update()
1327
 
1328
  except Exception as e:
1329
  logger.error(f"Boot UI error: {e}")
1330
  yield f"πŸ”΄ BOOT ERROR: {str(e)}", gr.update()
1331
+
1332
def on_batch_upgrade():
    """Buy the batch-size upgrade for the current session.

    Returns:
        A 2-tuple ``(message, balance_text)``: the status message from the
        token manager and the balance after the purchase attempt,
        formatted with two decimals.
    """
    # The session key created for token tracking is the module-level
    # `username`; `session_id` is not defined in this UI section.
    # The success flag is discarded: the message already carries the outcome.
    _success, msg = token_manager.purchase_batch_upgrade(username)
    balance = token_manager.get_balance(username)
    return msg, f"{balance:.2f}"
1336
+
1337
def on_token_upgrade():
    """Buy the 1000-token upgrade for the current session.

    Returns:
        A 2-tuple ``(message, balance_text)``: the status message from the
        token manager and the balance after the purchase attempt,
        formatted with two decimals.
    """
    # The session key created for token tracking is the module-level
    # `username`; `session_id` is not defined in this UI section.
    # The success flag is discarded: the message already carries the outcome.
    _success, msg = token_manager.purchase_token_upgrade(username, 1000)
    balance = token_manager.get_balance(username)
    return msg, f"{balance:.2f}"
1341
+
1342
def on_end_session():
    """End the current token session and show the resulting message.

    Returns:
        A Gradio update for the session-status component that sets its
        text to the token manager's message and makes it visible.
    """
    # The session key created for token tracking is the module-level
    # `username`; `session_id` is not defined in this UI section.
    msg = token_manager.end_session(username)
    # The status component is created with visible=False, so returning the
    # bare string would update a component the user never sees.
    return gr.update(value=msg, visible=True)
1345
+
1346
def update_custom_params(temp, top_p, top_k, repeat_pen):
    """Store the four sampling slider values on the engine.

    Args:
        temp: Value for the "temperature" entry.
        top_p: Value for the "top_p" entry.
        top_k: Value for the "top_k" entry; converted with ``int()``.
        repeat_pen: Value for the "repeat_penalty" entry.

    Returns:
        A fixed confirmation string.
    """
    params = kernel.custom_params
    params["temperature"] = temp
    params["top_p"] = top_p
    params["top_k"] = int(top_k)
    params["repeat_penalty"] = repeat_pen
    confirmation = "βœ… Parameters updated!"
    return confirmation
1352
 
1353
# Periodic refresh: every 2 seconds push RAM, CPU and token balance.
timer = gr.Timer(value=2)
timer.tick(update_stats, None, [ram_metric, cpu_metric, token_balance])

# Model discovery and boot.
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])

# Token purchases and session teardown.
batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
end_session_btn.click(on_end_session, None, [session_status])

# Sampling parameters: a change on any slider re-sends all four values, and
# the confirmation text lands in the purchase status line. Registration
# order is temperature, top-p, top-k, repeat penalty.
param_sliders = (
    temperature_slider,
    top_p_slider,
    top_k_slider,
    repeat_penalty_slider,
)
for slider in param_sliders:
    # A fresh input list per registration, as in the hand-written version.
    slider.change(update_custom_params, list(param_sliders), [purchase_status])

# Ghost cache: stitch the background context and report the result.
stitch_btn.click(
    lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
    [ghost_buffer],
    [stitch_status]
)

# Run input preprocessing on every change of the prompt box (no outputs).
user_input.change(
    lambda x: kernel.preprocess_input(x),
    [user_input],
    None
)

# Inference: Enter in the prompt box and the SUBMIT button share one
# handler and one argument list.
inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
for trigger in (user_input.submit, send_btn.click):
    trigger(kernel.inference_generator, inference_args, [chat_box])