turtle170 commited on
Commit
022b660
Β·
verified Β·
1 Parent(s): 0667b53

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -51
app.py CHANGED
@@ -26,11 +26,23 @@ except ImportError:
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
  SPACE_ID = os.environ.get("SPACE_ID")
28
  LOG_FILE = "engine_telemetry.json"
29
- RAM_LIMIT_PCT = 0.85 # Increased from 0.50 to prevent false rejections
30
- SYSTEM_RESERVE_MB = 500 # Increased reserve
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
35
  logger = logging.getLogger(__name__)
36
 
@@ -41,41 +53,21 @@ class TelemetryManager:
41
  self.stats = self._load_initial_stats()
42
 
43
  def _load_initial_stats(self) -> Dict:
44
- if os.path.exists(LOG_FILE):
45
- try:
46
- with open(LOG_FILE, "r", encoding="utf-8") as f:
47
- return json.load(f)
48
- except Exception:
49
- pass
50
  return {
51
  "session_start": str(datetime.now(pytz.utc)),
52
  "load_count": {},
53
- "total_tokens_generated": 0,
54
- "popular_repos": []
55
  }
56
 
57
  def track_load(self, repo: str, filename: str):
58
  key = f"{repo}/{filename}"
59
  self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
60
- self._sync_to_cloud()
61
 
62
  def track_generation(self, tokens: int):
63
  self.stats["total_tokens_generated"] += tokens
64
-
65
- def _sync_to_cloud(self):
66
- if not HF_TOKEN or not SPACE_ID:
67
- return
68
- try:
69
- with open(LOG_FILE, "w", encoding="utf-8") as f:
70
- json.dump(self.stats, f, indent=4)
71
- self.api.upload_file(
72
- path_or_fileobj=LOG_FILE,
73
- path_in_repo=LOG_FILE,
74
- repo_id=SPACE_ID,
75
- repo_type="space"
76
- )
77
- except Exception as e:
78
- logger.error(f"Sync Failure: {e}")
79
 
80
  # --- RESOURCE MONITOR ---
81
  class ResourceMonitor:
@@ -119,6 +111,59 @@ class ZeroEngine:
119
  self.active_model_info = {"repo": "", "file": ""}
120
  self.kernel_lock = threading.Lock()
121
  self.is_prefilling = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  def list_ggufs(self, repo_id: str) -> List[str]:
124
  try:
@@ -131,7 +176,7 @@ class ZeroEngine:
131
  return []
132
 
133
  def boot_kernel(self, repo: str, filename: str) -> str:
134
- """Boot kernel with proper error handling to prevent space crashes"""
135
  try:
136
  if not repo or not filename:
137
  return "πŸ”΄ ERROR: Repository or filename missing"
@@ -157,9 +202,13 @@ class ZeroEngine:
157
  logger.warning(f"[BOOT] Validation failed: {msg}")
158
  return f"πŸ”΄ VALIDATION FAILED: {msg}"
159
 
160
- logger.info("[BOOT] Validation passed, initializing model...")
161
 
162
- # Load model with proper cleanup
 
 
 
 
163
  with self.kernel_lock:
164
  # Clear previous model
165
  if self.llm:
@@ -170,22 +219,51 @@ class ZeroEngine:
170
  except Exception as e:
171
  logger.warning(f"[BOOT] Cleanup warning: {e}")
172
 
173
- # Initialize new model with conservative settings
 
 
 
 
 
174
  try:
175
- logger.info("[BOOT] Loading model into memory...")
 
 
176
  self.llm = Llama(
177
  model_path=path,
178
- n_ctx=2048,
179
- n_threads=2,
180
- use_mmap=True, # Critical: memory map to reduce RAM usage
181
- n_batch=256, # Reduced from 512 to be safer
182
- n_gpu_layers=0, # Force CPU only
183
- verbose=False
 
 
 
 
 
 
 
 
 
 
 
 
184
  )
 
185
  self.active_model_info = {"repo": repo, "file": filename}
186
  self.telemetry.track_load(repo, filename)
187
- logger.info("[BOOT] Model loaded successfully!")
188
- return f"🟒 KERNEL ONLINE: {filename}"
 
 
 
 
 
 
 
 
 
189
  except Exception as e:
190
  logger.error(f"[BOOT] Model loading failed: {e}")
191
  self.llm = None
@@ -213,13 +291,42 @@ class ZeroEngine:
213
  threading.Thread(target=_bg_eval, daemon=True).start()
214
  return "⚑ Ghost Cache Primed"
215
 
216
- def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator:
 
 
 
 
217
  if not self.llm:
218
- history.append({"role": "assistant", "content": "⚠️ Engine offline. BOOT a kernel first."})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  yield history
220
  return
221
 
222
- # Prepare input
223
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
224
  formatted_prompt = f"User: {full_input}\nAssistant: "
225
 
@@ -231,13 +338,26 @@ class ZeroEngine:
231
  response_text = ""
232
  start_time = time.time()
233
  tokens_count = 0
 
234
 
235
  try:
 
236
  stream = self.llm(
237
- formatted_prompt,
238
- max_tokens=1024,
239
- stop=["User:", "<|eot_id|>", "\n\n"],
240
- stream=True
 
 
 
 
 
 
 
 
 
 
 
241
  )
242
 
243
  for chunk in stream:
@@ -245,14 +365,39 @@ class ZeroEngine:
245
  response_text += token
246
  tokens_count += 1
247
 
 
 
 
 
 
248
  elapsed = time.time() - start_time
249
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
250
 
251
- # Update history with streaming content
252
- history[-1]["content"] = f"{response_text}\n\n`[{tps} t/s]`"
 
 
 
 
253
  yield history
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  self.telemetry.track_generation(tokens_count)
 
 
 
256
  except Exception as e:
257
  logger.error(f"Inference error: {e}")
258
  history[-1]["content"] = f"πŸ”΄ Runtime Error: {str(e)}"
@@ -366,7 +511,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
366
  πŸ›°οΈ ZEROENGINE V0.1
367
  </h1>
368
  <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
369
- Gradio 6.5.0 Production Build | Smooth Rounded UI
370
  </p>
371
  </div>
372
  """)
@@ -388,7 +533,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
388
  container=False,
389
  scale=9
390
  )
391
- send_btn = gr.Button("EXE", variant="primary", scale=1)
392
 
393
  with gr.Column(scale=3):
394
  gr.Markdown("### πŸ› οΈ Hardware Status")
@@ -478,7 +623,8 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
478
  [stitch_status]
479
  )
480
 
481
- inference_args = [user_input, chat_box, ghost_buffer]
 
482
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
483
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])
484
  user_input.submit(lambda: "", None, [user_input])
 
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
  SPACE_ID = os.environ.get("SPACE_ID")
28
  LOG_FILE = "engine_telemetry.json"
29
+ RAM_LIMIT_PCT = 0.85
30
+ SYSTEM_RESERVE_MB = 500
31
  DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
32
  DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
33
 
34
+ # --- SPEED OPTIMIZATION CONFIG ---
35
+ FLASH_ATTENTION = True # Enable Flash Attention 2
36
+ KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
37
+ CONTINUOUS_BATCHING = True # Enable continuous batching
38
+ SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
39
+ MLOCK_MODEL = True # Lock model in RAM (prevent swap)
40
+ USE_MMAP = True # Memory-mapped file loading
41
+ OFFLOAD_KQV = False # CPU-only, no offload needed
42
+ OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
43
+ ROPE_SCALING = 1.0 # RoPE frequency scaling
44
+ NUMA_OPTIMIZE = True # NUMA-aware memory allocation
45
+
46
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
47
  logger = logging.getLogger(__name__)
48
 
 
53
  self.stats = self._load_initial_stats()
54
 
55
  def _load_initial_stats(self) -> Dict:
56
+ # Simplified: no file I/O to prevent restart issues
 
 
 
 
 
57
  return {
58
  "session_start": str(datetime.now(pytz.utc)),
59
  "load_count": {},
60
+ "total_tokens_generated": 0
 
61
  }
62
 
63
  def track_load(self, repo: str, filename: str):
64
  key = f"{repo}/{filename}"
65
  self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
66
+ logger.info(f"Model loaded: {key} (count: {self.stats['load_count'][key]})")
67
 
68
  def track_generation(self, tokens: int):
69
  self.stats["total_tokens_generated"] += tokens
70
+ logger.info(f"Total tokens generated: {self.stats['total_tokens_generated']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  # --- RESOURCE MONITOR ---
73
  class ResourceMonitor:
 
111
  self.active_model_info = {"repo": "", "file": ""}
112
  self.kernel_lock = threading.Lock()
113
  self.is_prefilling = False
114
+ self.perf_stats = {
115
+ "total_tokens": 0,
116
+ "total_time": 0.0,
117
+ "avg_tps": 0.0,
118
+ "peak_tps": 0.0,
119
+ "cache_hits": 0
120
+ }
121
+ self.prompt_cache = {} # Cache for repeated prompts
122
+ self.last_activity = time.time()
123
+ self.idle_timeout = 20 # 20 seconds idle timeout
124
+ self.auto_cleanup_thread = None
125
+ self.start_idle_monitor()
126
+
127
+ def start_idle_monitor(self):
128
+ """Start background thread to monitor idle timeout"""
129
+ def monitor():
130
+ while True:
131
+ time.sleep(5) # Check every 5 seconds
132
+ if self.llm and (time.time() - self.last_activity) > self.idle_timeout:
133
+ logger.info(f"[IDLE] No activity for {self.idle_timeout}s, unloading model...")
134
+ with self.kernel_lock:
135
+ if self.llm:
136
+ try:
137
+ del self.llm
138
+ self.llm = None
139
+ self.active_model_info = {"repo": "", "file": ""}
140
+ logger.info("[IDLE] Model unloaded successfully")
141
+ except Exception as e:
142
+ logger.error(f"[IDLE] Cleanup error: {e}")
143
+
144
+ self.auto_cleanup_thread = threading.Thread(target=monitor, daemon=True)
145
+ self.auto_cleanup_thread.start()
146
+ logger.info("[IDLE] Idle monitor started (20s timeout)")
147
+
148
+ def update_activity(self):
149
+ """Update last activity timestamp"""
150
+ self.last_activity = time.time()
151
+
152
+ def optimize_numa(self):
153
+ """NUMA-aware CPU affinity optimization"""
154
+ try:
155
+ import os
156
+ if hasattr(os, 'sched_setaffinity'):
157
+ # Pin to physical cores only
158
+ physical_cores = list(range(0, psutil.cpu_count(logical=False)))
159
+ os.sched_setaffinity(0, physical_cores)
160
+ logger.info(f"NUMA: Pinned to physical cores: {physical_cores}")
161
+ except Exception as e:
162
+ logger.warning(f"NUMA optimization unavailable: {e}")
163
+
164
+ def is_model_loaded(self) -> bool:
165
+ """Check if model is currently loaded"""
166
+ return self.llm is not None
167
 
168
  def list_ggufs(self, repo_id: str) -> List[str]:
169
  try:
 
176
  return []
177
 
178
  def boot_kernel(self, repo: str, filename: str) -> str:
179
+ """HYPER-OPTIMIZED Boot kernel with all speed optimizations enabled"""
180
  try:
181
  if not repo or not filename:
182
  return "πŸ”΄ ERROR: Repository or filename missing"
 
202
  logger.warning(f"[BOOT] Validation failed: {msg}")
203
  return f"πŸ”΄ VALIDATION FAILED: {msg}"
204
 
205
+ logger.info("[BOOT] Validation passed, applying optimizations...")
206
 
207
+ # Apply NUMA optimization
208
+ if NUMA_OPTIMIZE:
209
+ self.optimize_numa()
210
+
211
+ # Load model with MAXIMUM PERFORMANCE SETTINGS
212
  with self.kernel_lock:
213
  # Clear previous model
214
  if self.llm:
 
219
  except Exception as e:
220
  logger.warning(f"[BOOT] Cleanup warning: {e}")
221
 
222
+ # Calculate optimal batch size based on available RAM
223
+ vm = psutil.virtual_memory()
224
+ available_ram_gb = vm.available / (1024**3)
225
+ # Dynamic batch sizing: more RAM = larger batches
226
+ optimal_batch = min(512, int(128 * available_ram_gb / 4))
227
+
228
  try:
229
+ logger.info(f"[BOOT] Initializing with {OPTIMAL_THREADS} threads, batch={optimal_batch}")
230
+
231
+ # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
232
  self.llm = Llama(
233
  model_path=path,
234
+ n_ctx=4096, # Increased context window
235
+ n_threads=OPTIMAL_THREADS, # Optimized thread count
236
+ n_threads_batch=OPTIMAL_THREADS, # Batch processing threads
237
+ use_mmap=USE_MMAP, # Memory-mapped weights (fast loading)
238
+ use_mlock=MLOCK_MODEL, # Lock in RAM (prevent swap thrashing)
239
+ n_batch=optimal_batch, # Dynamic batch size
240
+ n_gpu_layers=0, # CPU-only mode
241
+ flash_attn=FLASH_ATTENTION, # Flash Attention (2x faster)
242
+ type_k=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
243
+ type_v=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
244
+ rope_scaling_type=0, # Linear RoPE scaling
245
+ rope_freq_scale=ROPE_SCALING, # RoPE frequency scale
246
+ numa=NUMA_OPTIMIZE, # NUMA optimization
247
+ verbose=False,
248
+ logits_all=False, # Only compute final logits (faster)
249
+ embedding=False, # Disable embeddings (not needed)
250
+ offload_kqv=OFFLOAD_KQV, # No offload on CPU
251
+ f16_kv=False # Use quantized KV cache instead
252
  )
253
+
254
  self.active_model_info = {"repo": repo, "file": filename}
255
  self.telemetry.track_load(repo, filename)
256
+
257
+ # Warm-up inference to populate caches
258
+ logger.info("[BOOT] Warming up model caches...")
259
+ try:
260
+ self.llm("Test", max_tokens=1, stream=False)
261
+ except:
262
+ pass
263
+
264
+ logger.info("[BOOT] πŸš€ HYPER-OPTIMIZED MODEL READY!")
265
+ return f"🟒 KERNEL ONLINE: {filename} | Threads: {OPTIMAL_THREADS} | Batch: {optimal_batch} | Flash Attn: {FLASH_ATTENTION}"
266
+
267
  except Exception as e:
268
  logger.error(f"[BOOT] Model loading failed: {e}")
269
  self.llm = None
 
291
  threading.Thread(target=_bg_eval, daemon=True).start()
292
  return "⚑ Ghost Cache Primed"
293
 
294
+ def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
295
+ # Update activity timestamp
296
+ self.update_activity()
297
+
298
+ # AUTO-BOOT: If model not loaded, auto-boot default model
299
  if not self.llm:
300
+ logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
301
+ history.append({"role": "assistant", "content": "πŸ”„ Auto-booting model, please wait..."})
302
+ yield history
303
+
304
+ # Use provided repo/quant or fallback to defaults
305
+ boot_repo = repo if repo else DEFAULT_MODEL
306
+ boot_quant = quant if quant else DEFAULT_QUANT
307
+
308
+ boot_result = self.boot_kernel(boot_repo, boot_quant)
309
+
310
+ if "πŸ”΄" in boot_result or "FAILED" in boot_result:
311
+ history[-1]["content"] = f"❌ Auto-boot failed: {boot_result}\n\nPlease manually SCAN and BOOT a model."
312
+ yield history
313
+ return
314
+
315
+ history[-1]["content"] = f"βœ… {boot_result}\n\nProcessing your request..."
316
+ yield history
317
+ time.sleep(0.5) # Brief pause for user to see the message
318
+
319
+ # Check prompt cache for exact matches (instant response)
320
+ cache_key = f"{ghost_context}:{prompt}"
321
+ if cache_key in self.prompt_cache:
322
+ self.perf_stats["cache_hits"] += 1
323
+ logger.info("⚑ CACHE HIT - Instant response!")
324
+ history.append({"role": "user", "content": prompt})
325
+ history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
326
  yield history
327
  return
328
 
329
+ # Prepare input with optimized formatting
330
  full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
331
  formatted_prompt = f"User: {full_input}\nAssistant: "
332
 
 
338
  response_text = ""
339
  start_time = time.time()
340
  tokens_count = 0
341
+ first_token_time = None
342
 
343
  try:
344
+ # HYPER-OPTIMIZED INFERENCE SETTINGS
345
  stream = self.llm(
346
+ formatted_prompt,
347
+ max_tokens=2048, # Increased output length
348
+ stop=["User:", "<|eot_id|>", "\n\n"],
349
+ stream=True,
350
+ temperature=0.7, # Balanced creativity
351
+ top_p=0.95, # Nucleus sampling
352
+ top_k=40, # Top-K sampling
353
+ repeat_penalty=1.1, # Prevent repetition
354
+ frequency_penalty=0.0, # No frequency penalty
355
+ presence_penalty=0.0, # No presence penalty
356
+ tfs_z=1.0, # Tail-free sampling
357
+ typical_p=1.0, # Typical sampling
358
+ mirostat_mode=2, # Mirostat v2 (perplexity control)
359
+ mirostat_tau=5.0, # Target perplexity
360
+ mirostat_eta=0.1, # Learning rate
361
  )
362
 
363
  for chunk in stream:
 
365
  response_text += token
366
  tokens_count += 1
367
 
368
+ # Track first token latency (TTFT - Time To First Token)
369
+ if first_token_time is None:
370
+ first_token_time = time.time() - start_time
371
+ logger.info(f"⚑ First token: {first_token_time*1000:.0f}ms")
372
+
373
  elapsed = time.time() - start_time
374
  tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
375
 
376
+ # Track peak performance
377
+ if tps > self.perf_stats["peak_tps"]:
378
+ self.perf_stats["peak_tps"] = tps
379
+
380
+ # Update history with streaming content + performance metrics
381
+ history[-1]["content"] = f"{response_text}\n\n`⚑ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s`"
382
  yield history
383
 
384
+ # Update global performance stats
385
+ self.perf_stats["total_tokens"] += tokens_count
386
+ self.perf_stats["total_time"] += elapsed
387
+ self.perf_stats["avg_tps"] = self.perf_stats["total_tokens"] / self.perf_stats["total_time"]
388
+
389
+ # Cache the response for future identical queries
390
+ if len(response_text) > 10: # Only cache meaningful responses
391
+ self.prompt_cache[cache_key] = response_text
392
+ # Limit cache size to prevent memory bloat
393
+ if len(self.prompt_cache) > 100:
394
+ oldest_key = next(iter(self.prompt_cache))
395
+ del self.prompt_cache[oldest_key]
396
+
397
  self.telemetry.track_generation(tokens_count)
398
+
399
+ logger.info(f"βœ… Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
400
+
401
  except Exception as e:
402
  logger.error(f"Inference error: {e}")
403
  history[-1]["content"] = f"πŸ”΄ Runtime Error: {str(e)}"
 
511
  πŸ›°οΈ ZEROENGINE V0.1
512
  </h1>
513
  <p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
514
+ Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
515
  </p>
516
  </div>
517
  """)
 
533
  container=False,
534
  scale=9
535
  )
536
+ send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
537
 
538
  with gr.Column(scale=3):
539
  gr.Markdown("### πŸ› οΈ Hardware Status")
 
623
  [stitch_status]
624
  )
625
 
626
+ # Auto-boot enabled inference - passes repo and quant for auto-boot
627
+ inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
628
  user_input.submit(kernel.inference_generator, inference_args, [chat_box])
629
  send_btn.click(kernel.inference_generator, inference_args, [chat_box])
630
  user_input.submit(lambda: "", None, [user_input])