turtle170 committed on
Commit
7ca413a
·
verified ·
1 Parent(s): 022b660

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +303 -32
app.py CHANGED
@@ -36,16 +36,180 @@ FLASH_ATTENTION = True # Enable Flash Attention 2
36
  KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
  CONTINUOUS_BATCHING = True # Enable continuous batching
38
  SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
- MLOCK_MODEL = True # Lock model in RAM (prevent swap)
40
  USE_MMAP = True # Memory-mapped file loading
41
  OFFLOAD_KQV = False # CPU-only, no offload needed
42
  OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
  ROPE_SCALING = 1.0 # RoPE frequency scaling
44
  NUMA_OPTIMIZE = True # NUMA-aware memory allocation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
47
  logger = logging.getLogger(__name__)
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # --- TELEMETRY MODULE ---
50
  class TelemetryManager:
51
  def __init__(self, api: HfApi):
@@ -124,6 +288,55 @@ class ZeroEngine:
124
  self.auto_cleanup_thread = None
125
  self.start_idle_monitor()
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def start_idle_monitor(self):
128
  """Start background thread to monitor idle timeout"""
129
  def monitor():
@@ -137,6 +350,7 @@ class ZeroEngine:
137
  del self.llm
138
  self.llm = None
139
  self.active_model_info = {"repo": "", "file": ""}
 
140
  logger.info("[IDLE] Model unloaded successfully")
141
  except Exception as e:
142
  logger.error(f"[IDLE] Cleanup error: {e}")
@@ -176,13 +390,16 @@ class ZeroEngine:
176
  return []
177
 
178
  def boot_kernel(self, repo: str, filename: str) -> str:
179
- """HYPER-OPTIMIZED Boot kernel with all speed optimizations enabled"""
180
  try:
181
  if not repo or not filename:
182
  return "πŸ”΄ ERROR: Repository or filename missing"
183
 
184
  logger.info(f"[BOOT] Starting download: {filename} from {repo}")
185
 
 
 
 
186
  # Download with timeout protection
187
  try:
188
  path = hf_hub_download(
@@ -196,13 +413,17 @@ class ZeroEngine:
196
  logger.error(f"[BOOT] Download failed: {e}")
197
  return f"πŸ”΄ DOWNLOAD FAILED: {str(e)}"
198
 
 
 
 
 
199
  # Validate before loading
200
  valid, msg = ResourceMonitor.validate_deployment(path)
201
  if not valid:
202
  logger.warning(f"[BOOT] Validation failed: {msg}")
203
  return f"πŸ”΄ VALIDATION FAILED: {msg}"
204
 
205
- logger.info("[BOOT] Validation passed, applying optimizations...")
206
 
207
  # Apply NUMA optimization
208
  if NUMA_OPTIMIZE:
@@ -210,70 +431,104 @@ class ZeroEngine:
210
 
211
  # Load model with MAXIMUM PERFORMANCE SETTINGS
212
  with self.kernel_lock:
213
- # Clear previous model
214
  if self.llm:
215
- logger.info("[BOOT] Clearing previous model...")
216
  try:
 
 
 
 
217
  del self.llm
218
  self.llm = None
 
 
 
 
 
219
  except Exception as e:
220
  logger.warning(f"[BOOT] Cleanup warning: {e}")
221
 
222
- # Calculate optimal batch size based on available RAM
223
  vm = psutil.virtual_memory()
224
  available_ram_gb = vm.available / (1024**3)
225
- # Dynamic batch sizing: more RAM = larger batches
226
- optimal_batch = min(512, int(128 * available_ram_gb / 4))
 
 
 
 
 
 
 
 
 
 
227
 
228
  try:
229
- logger.info(f"[BOOT] Initializing with {OPTIMAL_THREADS} threads, batch={optimal_batch}")
 
 
 
 
230
 
231
  # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
232
  self.llm = Llama(
233
  model_path=path,
234
- n_ctx=4096, # Increased context window
235
- n_threads=OPTIMAL_THREADS, # Optimized thread count
236
- n_threads_batch=OPTIMAL_THREADS, # Batch processing threads
237
- use_mmap=USE_MMAP, # Memory-mapped weights (fast loading)
238
- use_mlock=MLOCK_MODEL, # Lock in RAM (prevent swap thrashing)
239
- n_batch=optimal_batch, # Dynamic batch size
240
- n_gpu_layers=0, # CPU-only mode
241
- flash_attn=FLASH_ATTENTION, # Flash Attention (2x faster)
242
  type_k=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
243
  type_v=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
244
- rope_scaling_type=0, # Linear RoPE scaling
245
- rope_freq_scale=ROPE_SCALING, # RoPE frequency scale
246
- numa=NUMA_OPTIMIZE, # NUMA optimization
247
  verbose=False,
248
- logits_all=False, # Only compute final logits (faster)
249
- embedding=False, # Disable embeddings (not needed)
250
- offload_kqv=OFFLOAD_KQV, # No offload on CPU
251
- f16_kv=False # Use quantized KV cache instead
252
  )
253
 
254
- self.active_model_info = {"repo": repo, "file": filename}
255
  self.telemetry.track_load(repo, filename)
256
 
 
 
 
 
 
 
 
257
  # Warm-up inference to populate caches
258
  logger.info("[BOOT] Warming up model caches...")
259
  try:
260
- self.llm("Test", max_tokens=1, stream=False)
 
261
  except:
262
  pass
263
 
264
  logger.info("[BOOT] πŸš€ HYPER-OPTIMIZED MODEL READY!")
265
- return f"🟒 KERNEL ONLINE: {filename} | Threads: {OPTIMAL_THREADS} | Batch: {optimal_batch} | Flash Attn: {FLASH_ATTENTION}"
266
 
267
  except Exception as e:
268
  logger.error(f"[BOOT] Model loading failed: {e}")
269
  self.llm = None
 
270
  return f"πŸ”΄ LOAD FAILED: {str(e)}"
271
 
272
  except Exception as e:
273
  logger.error(f"[BOOT] Unexpected error: {e}")
 
274
  return f"πŸ”΄ BOOT FAILURE: {str(e)}"
275
 
276
  def stitch_cache(self, ghost_text: str) -> str:
 
277
  if not self.llm or not ghost_text or self.is_prefilling:
278
  return "Kernel Idle/Busy"
279
 
@@ -283,18 +538,22 @@ class ZeroEngine:
283
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
284
  self.llm.eval(tokens)
285
  logger.info(f"Ghost cache primed: {len(tokens)} tokens")
 
286
  except Exception as e:
287
  logger.error(f"KV Cache priming failed: {e}")
288
  finally:
289
  self.is_prefilling = False
290
 
291
  threading.Thread(target=_bg_eval, daemon=True).start()
292
- return "⚑ Ghost Cache Primed"
293
 
294
  def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
295
  # Update activity timestamp
296
  self.update_activity()
297
 
 
 
 
298
  # AUTO-BOOT: If model not loaded, auto-boot default model
299
  if not self.llm:
300
  logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
@@ -378,7 +637,7 @@ class ZeroEngine:
378
  self.perf_stats["peak_tps"] = tps
379
 
380
  # Update history with streaming content + performance metrics
381
- history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s`"
382
  yield history
383
 
384
  # Update global performance stats
@@ -396,12 +655,16 @@ class ZeroEngine:
396
 
397
  self.telemetry.track_generation(tokens_count)
398
 
 
 
 
399
  logger.info(f"βœ… Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
400
 
401
  except Exception as e:
402
  logger.error(f"Inference error: {e}")
403
  history[-1]["content"] = f"πŸ”΄ Runtime Error: {str(e)}"
404
  yield history
 
405
 
406
  # --- CUSTOM CSS ---
407
  CUSTOM_CSS = """
@@ -552,14 +815,15 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
552
  boot_status = gr.Markdown("Status: `STANDBY`")
553
 
554
  gr.Markdown("---")
555
- gr.Markdown("### πŸ‘» Ghost Cache")
556
  ghost_buffer = gr.Textbox(
557
  label="Background Context",
558
- placeholder="Queue priming tokens here...",
559
  lines=3
560
  )
 
 
561
  stitch_status = gr.Markdown("Cache: `EMPTY`")
562
- stitch_btn = gr.Button("STITCH", size="sm")
563
 
564
  log_output = gr.Code(
565
  label="Kernel Logs",
@@ -623,6 +887,13 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
623
  [stitch_status]
624
  )
625
 
 
 
 
 
 
 
 
626
  # Auto-boot enabled inference - passes repo and quant for auto-boot
627
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
628
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
 
36
  KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
  CONTINUOUS_BATCHING = True # Enable continuous batching
38
  SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
+ MLOCK_MODEL = False # Disabled: prevents swapping but uses more RAM
40
  USE_MMAP = True # Memory-mapped file loading
41
  OFFLOAD_KQV = False # CPU-only, no offload needed
42
  OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
  ROPE_SCALING = 1.0 # RoPE frequency scaling
44
  NUMA_OPTIMIZE = True # NUMA-aware memory allocation
45
+ AGGRESSIVE_GC = True # Aggressive garbage collection
46
+
47
+ # Quantization detection and optimization mapping
48
+ QUANT_OPTIMIZATIONS = {
49
+ "BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
50
+ "F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
51
+ "Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
52
+ "Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
53
+ "Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
54
+ "Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
55
+ "Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
56
+ "Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
57
+ "Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
58
+ "Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
59
+ "Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
60
+ }
61
 
62
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
63
  logger = logging.getLogger(__name__)
64
 
65
+ # --- AGGRESSIVE GARBAGE COLLECTOR ---
66
+ import gc
67
+ gc.enable()
68
+ gc.set_threshold(700, 10, 10) # Aggressive thresholds
69
+
70
def force_gc():
    """Run a full garbage-collection pass when AGGRESSIVE_GC is enabled.

    Returns:
        The value returned by ``gc.collect(2)`` when AGGRESSIVE_GC is truthy,
        otherwise 0 (no collection is attempted).
    """
    if not AGGRESSIVE_GC:
        return 0

    # Generation 2 means every generation is collected.
    freed = gc.collect(2)
    logger.info(f"[GC] Collected {freed} objects")
    return freed
77
+
78
def nuclear_ram_clear():
    """Run three full garbage-collection passes and report whether they completed.

    This function deliberately does not touch ``functools`` or ``sys.modules``.
    Assigning a ``__call__`` onto the private ``functools._CacheInfo`` class
    clears no cache and mutates interpreter-wide state, and module objects do
    not carry a ``__pycache__`` attribute to delete. The garbage-collection
    passes are the only step that can release memory.

    Returns:
        bool: True if every pass finished, False if an exception was raised
        (the exception is logged, not propagated).
    """
    try:
        # Several passes rather than one: objects released by finalizers
        # during a pass can only be reclaimed by a later pass.
        for _ in range(3):
            gc.collect(2)

        logger.info("[RAM-NUKE] πŸ’₯ Nuclear RAM clear complete")
        return True
    except Exception as e:
        logger.error(f"[RAM-NUKE] Failed: {e}")
        return False
103
+
104
+ # --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
105
class ModelCacheManager:
    """Keep small "signature" files (the leading bytes of model files) on disk.

    A signature is the first 1 MiB of a model file, stored as
    ``<cache_dir>/<model basename>.cache``. ``is_cached`` only checks that this
    file exists, and ``preload_cache`` reads it back and logs its size.
    Entries are keyed by the file's basename, so two models that share a
    basename share one cache entry.
    """

    def __init__(self):
        self.cache_dir = "/tmp/zeroengine_cache"
        # {model_path: {"signature": bytes or None, "cached_at": float, "hits": int}}
        self.cache = {}
        # Budget, in MB, for the combined size of all *.cache files on disk.
        self.max_cache_size_mb = 50

        os.makedirs(self.cache_dir, exist_ok=True)
        logger.info(f"[CACHE] Initialized at {self.cache_dir}")

    def _cache_path(self, model_path: str) -> str:
        """Return the signature file path that corresponds to ``model_path``."""
        return os.path.join(self.cache_dir, f"{os.path.basename(model_path)}.cache")

    def extract_cache_signature(self, model_path: str) -> Optional[bytes]:
        """Read up to the first 1 MiB of ``model_path``.

        Returns:
            The bytes read, or None if the file could not be read (the error
            is logged).
        """
        try:
            cache_size = 1024 * 1024  # 1 MiB
            with open(model_path, 'rb') as f:
                signature = f.read(cache_size)
            logger.info(f"[CACHE] Extracted {len(signature)} bytes signature from {os.path.basename(model_path)}")
            return signature
        except Exception as e:
            logger.error(f"[CACHE] Extraction failed: {e}")
            return None

    def save_to_cache(self, model_path: str, signature: bytes):
        """Write ``signature`` to disk, evicting old entries so it fits the budget.

        The oldest signature files (by modification time) are removed until
        the existing files plus the new signature fit within
        ``max_cache_size_mb``. Failures are logged and swallowed; nothing is
        returned.
        """
        try:
            model_name = os.path.basename(model_path)
            cache_path = self._cache_path(model_path)
            budget = self.max_cache_size_mb * 1024 * 1024

            # Oldest first. The target file itself is skipped: it is about to
            # be overwritten, so its current size does not count.
            others = sorted(
                (
                    os.path.join(self.cache_dir, name)
                    for name in os.listdir(self.cache_dir)
                    if name.endswith('.cache')
                ),
                key=os.path.getmtime,
            )
            others = [p for p in others if p != cache_path]
            sizes = {p: os.path.getsize(p) for p in others}
            total_size = sum(sizes.values())

            if others and total_size + len(signature) > budget:
                logger.info("[CACHE] Cache full, removing oldest entry")

            # Evict until the new signature fits, not just a single file, so
            # the directory cannot grow past the budget.
            while others and total_size + len(signature) > budget:
                oldest = others.pop(0)
                os.remove(oldest)
                total_size -= sizes[oldest]
                logger.info(f"[CACHE] Deleted {os.path.basename(oldest)}")

            with open(cache_path, 'wb') as f:
                f.write(signature)

            self.cache[model_path] = {
                "signature": signature,
                "cached_at": time.time(),
                "hits": 0
            }
            logger.info(f"[CACHE] βœ… Cached {model_name} ({len(signature) / 1024:.1f}KB)")

        except Exception as e:
            logger.error(f"[CACHE] Save failed: {e}")

    def is_cached(self, model_path: str) -> bool:
        """Return True if a signature file exists for ``model_path``."""
        exists = os.path.exists(self._cache_path(model_path))
        if exists:
            logger.info(f"[CACHE] 🎯 HIT for {os.path.basename(model_path)}")
        return exists

    def preload_cache(self, model_path: str):
        """Read the stored signature for ``model_path`` and count the hit.

        The bytes read are only used for the log message.

        Returns:
            bool: True if the signature file existed and was read, False if
            it is missing or reading failed.
        """
        try:
            cache_path = self._cache_path(model_path)

            if os.path.exists(cache_path):
                with open(cache_path, 'rb') as f:
                    signature = f.read()

                if model_path in self.cache:
                    self.cache[model_path]["hits"] += 1

                logger.info(f"[CACHE] Preloaded {len(signature) / 1024:.1f}KB signature")
                return True
        except Exception as e:
            logger.error(f"[CACHE] Preload failed: {e}")
        return False

    def wreck_old_model_cache(self):
        """Drop in-memory signature bytes and trigger garbage collection.

        Signature files on disk are left in place; only the ``"signature"``
        values held in ``self.cache`` are replaced with None.

        Returns:
            bool: True on success, False if an exception was raised (logged).
        """
        try:
            logger.info("[WRECKER] πŸ’£ Destroying old model caches...")

            gc.collect()

            # Release the signature bytes but keep the bookkeeping entries.
            for entry in self.cache.values():
                if entry.get("signature"):
                    entry["signature"] = None

            nuclear_ram_clear()
            logger.info("[WRECKER] βœ… Old model WRECKED")
            return True
        except Exception as e:
            logger.error(f"[WRECKER] Failed: {e}")
            return False
209
+
210
+ # Global cache manager
211
+ model_cache = ModelCacheManager()
212
+
213
  # --- TELEMETRY MODULE ---
214
  class TelemetryManager:
215
  def __init__(self, api: HfApi):
 
288
  self.auto_cleanup_thread = None
289
  self.start_idle_monitor()
290
 
291
+ # Keyboard input pre-processing
292
+ self.typing_buffer = ""
293
+ self.typing_timer = None
294
+ self.preprocessed_tokens = None
295
+
296
def detect_quantization(self, filename: str) -> dict:
    """Pick the QUANT_OPTIMIZATIONS entry whose key occurs in ``filename``.

    The match is a case-insensitive substring test, tried in the table's
    iteration order; the first key found wins.

    Returns:
        dict: the matched entry's settings plus a ``"type"`` key naming the
        quantization. Falls back to the Q4_K_M entry when no key matches.
    """
    haystack = filename.upper()
    matched = next((name for name in QUANT_OPTIMIZATIONS if name in haystack), None)

    if matched is None:
        # No known tag in the filename: use the Q4_K_M profile.
        logger.warning("[QUANT-DETECT] Unknown quantization, using Q4_K_M defaults")
        return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}

    logger.info(f"[QUANT-DETECT] Found {matched} in filename, applying optimizations")
    return {"type": matched, **QUANT_OPTIMIZATIONS[matched]}
308
+
309
def preprocess_input(self, text: str):
    """Schedule background tokenization of ``text`` one second from now.

    Does nothing when no model is loaded, or when ``text`` is empty or
    shorter than 5 characters. Each call cancels the previously scheduled
    timer, so tokenization only runs after a one-second pause in calls.
    The result is stored in ``self.preprocessed_tokens`` (None on failure).
    """
    if not self.llm:
        return
    if not text or len(text) < 5:
        return

    def _tokenize_later():
        try:
            logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars in background...")
            token_ids = self.llm.tokenize(text.encode("utf-8"))
            self.preprocessed_tokens = token_ids
            logger.info(f"[PREPROCESS] βœ… Ready: {len(token_ids)} tokens cached")
        except Exception as e:
            logger.error(f"[PREPROCESS] Failed: {e}")
            self.preprocessed_tokens = None

    # A newer call supersedes the pending one.
    pending = self.typing_timer
    if pending:
        pending.cancel()

    timer = threading.Timer(1.0, _tokenize_later)
    self.typing_timer = timer
    timer.daemon = True
    timer.start()
332
+
333
def clear_preprocessed(self):
    """Discard any tokens stored by background preprocessing.

    When ``self.preprocessed_tokens`` is truthy it is reset to None, a
    garbage-collection pass is requested via ``force_gc`` and the action is
    logged. Otherwise nothing happens.
    """
    if not self.preprocessed_tokens:
        return

    self.preprocessed_tokens = None
    force_gc()
    logger.info("[PREPROCESS] Cleared cached tokens")
339
+
340
  def start_idle_monitor(self):
341
  """Start background thread to monitor idle timeout"""
342
  def monitor():
 
350
  del self.llm
351
  self.llm = None
352
  self.active_model_info = {"repo": "", "file": ""}
353
+ force_gc() # Aggressive cleanup
354
  logger.info("[IDLE] Model unloaded successfully")
355
  except Exception as e:
356
  logger.error(f"[IDLE] Cleanup error: {e}")
 
390
  return []
391
 
392
  def boot_kernel(self, repo: str, filename: str) -> str:
393
+ """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
394
  try:
395
  if not repo or not filename:
396
  return "πŸ”΄ ERROR: Repository or filename missing"
397
 
398
  logger.info(f"[BOOT] Starting download: {filename} from {repo}")
399
 
400
+ # DETECT QUANTIZATION FROM FILENAME
401
+ quant_config = self.detect_quantization(filename)
402
+
403
  # Download with timeout protection
404
  try:
405
  path = hf_hub_download(
 
413
  logger.error(f"[BOOT] Download failed: {e}")
414
  return f"πŸ”΄ DOWNLOAD FAILED: {str(e)}"
415
 
416
+ # Check if model is cached (for faster subsequent loads)
417
+ is_cached = model_cache.is_cached(path)
418
+ cache_status = "🎯 CACHED" if is_cached else "πŸ†• NEW"
419
+
420
  # Validate before loading
421
  valid, msg = ResourceMonitor.validate_deployment(path)
422
  if not valid:
423
  logger.warning(f"[BOOT] Validation failed: {msg}")
424
  return f"πŸ”΄ VALIDATION FAILED: {msg}"
425
 
426
+ logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations...")
427
 
428
  # Apply NUMA optimization
429
  if NUMA_OPTIMIZE:
 
431
 
432
  # Load model with MAXIMUM PERFORMANCE SETTINGS
433
  with self.kernel_lock:
434
+ # WRECK OLD MODEL - Nuclear option
435
  if self.llm:
436
+ logger.info("[BOOT] πŸ’£ WRECKING old model...")
437
  try:
438
+ # Wreck the cache first
439
+ model_cache.wreck_old_model_cache()
440
+
441
+ # Delete the model
442
  del self.llm
443
  self.llm = None
444
+
445
+ # Nuclear RAM clear
446
+ nuclear_ram_clear()
447
+
448
+ logger.info("[BOOT] βœ… Old model DESTROYED")
449
  except Exception as e:
450
  logger.warning(f"[BOOT] Cleanup warning: {e}")
451
 
452
+ # Calculate optimal batch size based on quantization and available RAM
453
  vm = psutil.virtual_memory()
454
  available_ram_gb = vm.available / (1024**3)
455
+
456
+ # MASSIVE batch sizes for quantized models
457
+ base_batch = int(256 * available_ram_gb / 4)
458
+ optimal_batch = int(base_batch * quant_config["batch_multiplier"])
459
+ optimal_batch = max(512, min(4096, optimal_batch)) # Clamp between 512-4096
460
+
461
+ # Context size based on quantization
462
+ optimal_ctx = quant_config["ctx_size"]
463
+
464
+ # Thread count with quantization-specific boost
465
+ optimal_threads = int(OPTIMAL_THREADS * quant_config["threads_boost"])
466
+ optimal_threads = max(2, min(optimal_threads, psutil.cpu_count(logical=False)))
467
 
468
  try:
469
+ logger.info(f"[BOOT] Initializing {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
470
+
471
+ # Preload cache if available (simulates faster warmup)
472
+ if is_cached:
473
+ model_cache.preload_cache(path)
474
 
475
  # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
476
  self.llm = Llama(
477
  model_path=path,
478
+ n_ctx=optimal_ctx, # Dynamic context based on quant
479
+ n_threads=optimal_threads, # Optimized thread count
480
+ n_threads_batch=optimal_threads, # Batch processing threads
481
+ use_mmap=USE_MMAP, # Memory-mapped weights (fast loading)
482
+ use_mlock=MLOCK_MODEL, # Lock in RAM (prevent swap thrashing)
483
+ n_batch=optimal_batch, # MASSIVE batch size
484
+ n_gpu_layers=0, # CPU-only mode
485
+ flash_attn=FLASH_ATTENTION, # Flash Attention (2x faster)
486
  type_k=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
487
  type_v=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
488
+ rope_scaling_type=0, # Linear RoPE scaling
489
+ rope_freq_scale=ROPE_SCALING, # RoPE frequency scale
490
+ numa=NUMA_OPTIMIZE, # NUMA optimization
491
  verbose=False,
492
+ logits_all=False, # Only compute final logits (faster)
493
+ embedding=False, # Disable embeddings (not needed)
494
+ offload_kqv=OFFLOAD_KQV, # No offload on CPU
495
+ f16_kv=False # Use quantized KV cache instead
496
  )
497
 
498
+ self.active_model_info = {"repo": repo, "file": filename, "quant": quant_config['type']}
499
  self.telemetry.track_load(repo, filename)
500
 
501
+ # Extract and cache TINY signature for faster future loads
502
+ if not is_cached:
503
+ logger.info("[BOOT] Extracting cache signature...")
504
+ signature = model_cache.extract_cache_signature(path)
505
+ if signature:
506
+ model_cache.save_to_cache(path, signature)
507
+
508
  # Warm-up inference to populate caches
509
  logger.info("[BOOT] Warming up model caches...")
510
  try:
511
+ self.llm("Warmup", max_tokens=1, stream=False)
512
+ force_gc() # Clear warmup artifacts
513
  except:
514
  pass
515
 
516
  logger.info("[BOOT] πŸš€ HYPER-OPTIMIZED MODEL READY!")
517
+ return f"🟒 {quant_config['type']} KERNEL {cache_status} | T:{optimal_threads} | B:{optimal_batch} | Ctx:{optimal_ctx}"
518
 
519
  except Exception as e:
520
  logger.error(f"[BOOT] Model loading failed: {e}")
521
  self.llm = None
522
+ nuclear_ram_clear()
523
  return f"πŸ”΄ LOAD FAILED: {str(e)}"
524
 
525
  except Exception as e:
526
  logger.error(f"[BOOT] Unexpected error: {e}")
527
+ nuclear_ram_clear()
528
  return f"πŸ”΄ BOOT FAILURE: {str(e)}"
529
 
530
  def stitch_cache(self, ghost_text: str) -> str:
531
+ """Prime KV cache with ghost context"""
532
  if not self.llm or not ghost_text or self.is_prefilling:
533
  return "Kernel Idle/Busy"
534
 
 
538
  tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
539
  self.llm.eval(tokens)
540
  logger.info(f"Ghost cache primed: {len(tokens)} tokens")
541
+ force_gc() # Clean up after priming
542
  except Exception as e:
543
  logger.error(f"KV Cache priming failed: {e}")
544
  finally:
545
  self.is_prefilling = False
546
 
547
  threading.Thread(target=_bg_eval, daemon=True).start()
548
+ return "⚑ Primed"
549
 
550
  def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
551
  # Update activity timestamp
552
  self.update_activity()
553
 
554
+ # Clear any preprocessed tokens from typing
555
+ self.clear_preprocessed()
556
+
557
  # AUTO-BOOT: If model not loaded, auto-boot default model
558
  if not self.llm:
559
  logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
 
637
  self.perf_stats["peak_tps"] = tps
638
 
639
  # Update history with streaming content + performance metrics
640
+ history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | πŸ’Ύ Cache: {self.perf_stats['cache_hits']}`"
641
  yield history
642
 
643
  # Update global performance stats
 
655
 
656
  self.telemetry.track_generation(tokens_count)
657
 
658
+ # Aggressive GC after generation
659
+ force_gc()
660
+
661
  logger.info(f"βœ… Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
662
 
663
  except Exception as e:
664
  logger.error(f"Inference error: {e}")
665
  history[-1]["content"] = f"πŸ”΄ Runtime Error: {str(e)}"
666
  yield history
667
+ force_gc()
668
 
669
  # --- CUSTOM CSS ---
670
  CUSTOM_CSS = """
 
815
  boot_status = gr.Markdown("Status: `STANDBY`")
816
 
817
  gr.Markdown("---")
818
+ gr.Markdown("### πŸ‘» Ghost Cache (Pre-Context)")
819
  ghost_buffer = gr.Textbox(
820
  label="Background Context",
821
+ placeholder="Add context that will be prepended to all messages...",
822
  lines=3
823
  )
824
+ with gr.Row():
825
+ stitch_btn = gr.Button("PRIME CACHE", variant="secondary", size="sm", scale=1)
826
  stitch_status = gr.Markdown("Cache: `EMPTY`")
 
827
 
828
  log_output = gr.Code(
829
  label="Kernel Logs",
 
887
  [stitch_status]
888
  )
889
 
890
+ # Keyboard input preprocessing (tokenize while typing)
891
+ user_input.change(
892
+ lambda x: kernel.preprocess_input(x),
893
+ [user_input],
894
+ None
895
+ )
896
+
897
  # Auto-boot enabled inference - passes repo and quant for auto-boot
898
  inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
899
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])