turtle170 committed on
Commit
72e0339
·
verified ·
1 Parent(s): 551f9be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -161
app.py CHANGED
@@ -785,142 +785,6 @@ class ZeroEngine:
785
  logger.error(f"[BOOT] Unexpected error: {e}")
786
  nuclear_ram_clear()
787
  return f"🔴 BOOT FAILURE: {str(e)}"
788
- """HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
789
- try:
790
- if not repo or not filename:
791
- return "🔴 ERROR: Repository or filename missing"
792
-
793
- logger.info(f"[BOOT] Starting download: {filename} from {repo}")
794
-
795
- # DETECT QUANTIZATION FROM FILENAME
796
- quant_config = self.detect_quantization(filename)
797
-
798
- # Download with timeout protection
799
- try:
800
- path = hf_hub_download(
801
- repo_id=repo,
802
- filename=filename,
803
- token=HF_TOKEN,
804
- local_files_only=False
805
- )
806
- logger.info(f"[BOOT] Download complete: {path}")
807
- except Exception as e:
808
- logger.error(f"[BOOT] Download failed: {e}")
809
- return f"🔴 DOWNLOAD FAILED: {str(e)}"
810
-
811
- # Check if model is cached (for faster subsequent loads)
812
- is_cached = model_cache.is_cached(path)
813
- cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
814
-
815
- # Validate before loading
816
- valid, msg = ResourceMonitor.validate_deployment(path)
817
- if not valid:
818
- logger.warning(f"[BOOT] Validation failed: {msg}")
819
- return f"🔴 VALIDATION FAILED: {msg}"
820
-
821
- logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations...")
822
-
823
- # Apply NUMA optimization
824
- if NUMA_OPTIMIZE:
825
- self.optimize_numa()
826
-
827
- # Load model with MAXIMUM PERFORMANCE SETTINGS
828
- with self.kernel_lock:
829
- # WRECK OLD MODEL - Nuclear option
830
- if self.llm:
831
- logger.info("[BOOT] 💣 WRECKING old model...")
832
- try:
833
- # Wreck the cache first
834
- model_cache.wreck_old_model_cache()
835
-
836
- # Delete the model
837
- del self.llm
838
- self.llm = None
839
-
840
- # Nuclear RAM clear
841
- nuclear_ram_clear()
842
-
843
- logger.info("[BOOT] ✅ Old model DESTROYED")
844
- except Exception as e:
845
- logger.warning(f"[BOOT] Cleanup warning: {e}")
846
-
847
- # Calculate optimal batch size based on quantization and available RAM
848
- vm = psutil.virtual_memory()
849
- available_ram_gb = vm.available / (1024**3)
850
-
851
- # MASSIVE batch sizes for quantized models
852
- base_batch = int(256 * available_ram_gb / 4)
853
- optimal_batch = int(base_batch * quant_config["batch_multiplier"])
854
- optimal_batch = max(512, min(4096, optimal_batch)) # Clamp between 512-4096
855
-
856
- # Context size based on quantization
857
- optimal_ctx = quant_config["ctx_size"]
858
-
859
- # Thread count with quantization-specific boost
860
- optimal_threads = int(OPTIMAL_THREADS * quant_config["threads_boost"])
861
- optimal_threads = max(2, min(optimal_threads, psutil.cpu_count(logical=False)))
862
-
863
- try:
864
- logger.info(f"[BOOT] Initializing {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
865
-
866
- # Preload cache if available (simulates faster warmup)
867
- if is_cached:
868
- model_cache.preload_cache(path)
869
-
870
- # ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
871
- self.llm = Llama(
872
- model_path=path,
873
- n_ctx=optimal_ctx, # Dynamic context based on quant
874
- n_threads=optimal_threads, # Optimized thread count
875
- n_threads_batch=optimal_threads, # Batch processing threads
876
- use_mmap=USE_MMAP, # Memory-mapped weights (fast loading)
877
- use_mlock=MLOCK_MODEL, # Lock in RAM (prevent swap thrashing)
878
- n_batch=optimal_batch, # MASSIVE batch size
879
- n_gpu_layers=0, # CPU-only mode
880
- flash_attn=FLASH_ATTENTION, # Flash Attention (2x faster)
881
- type_k=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
882
- type_v=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
883
- rope_scaling_type=0, # Linear RoPE scaling
884
- rope_freq_scale=ROPE_SCALING, # RoPE frequency scale
885
- numa=NUMA_OPTIMIZE, # NUMA optimization
886
- verbose=False,
887
- logits_all=False, # Only compute final logits (faster)
888
- embedding=False, # Disable embeddings (not needed)
889
- offload_kqv=OFFLOAD_KQV, # No offload on CPU
890
- f16_kv=False # Use quantized KV cache instead
891
- )
892
-
893
- self.active_model_info = {"repo": repo, "file": filename, "quant": quant_config['type']}
894
- self.telemetry.track_load(repo, filename)
895
-
896
- # Extract and cache TINY signature for faster future loads
897
- if not is_cached:
898
- logger.info("[BOOT] Extracting cache signature...")
899
- signature = model_cache.extract_cache_signature(path)
900
- if signature:
901
- model_cache.save_to_cache(path, signature)
902
-
903
- # Warm-up inference to populate caches
904
- logger.info("[BOOT] Warming up model caches...")
905
- try:
906
- self.llm("Warmup", max_tokens=1, stream=False)
907
- force_gc() # Clear warmup artifacts
908
- except:
909
- pass
910
-
911
- logger.info("[BOOT] 🚀 HYPER-OPTIMIZED MODEL READY!")
912
- return f"🟢 {quant_config['type']} KERNEL {cache_status} | T:{optimal_threads} | B:{optimal_batch} | Ctx:{optimal_ctx}"
913
-
914
- except Exception as e:
915
- logger.error(f"[BOOT] Model loading failed: {e}")
916
- self.llm = None
917
- nuclear_ram_clear()
918
- return f"🔴 LOAD FAILED: {str(e)}"
919
-
920
- except Exception as e:
921
- logger.error(f"[BOOT] Unexpected error: {e}")
922
- nuclear_ram_clear()
923
- return f"🔴 BOOT FAILURE: {str(e)}"
924
 
925
  def stitch_cache(self, ghost_text: str) -> str:
926
  """Prime KV cache with ghost context"""
@@ -942,7 +806,7 @@ class ZeroEngine:
942
  threading.Thread(target=_bg_eval, daemon=True).start()
943
  return "⚡ Primed"
944
 
945
- def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, username: str) -> Generator:
946
  username = profile.username if profile else "anonymous"
947
  # Update activity timestamp
948
  self.update_activity()
@@ -1291,14 +1155,14 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1291
  # --- UI LOGIC ---
1292
  def update_stats(profile: gr.OAuthProfile | None):
1293
  try:
1294
- m = ResourceMonitor.get_metrics()
1295
- current_user = profile.username if profile else "Guest"
1296
- balance = token_manager.get_balance(current_user)
1297
-
1298
- return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
1299
- except Exception as e:
1300
- logger.error(f"Stats update error: {e}")
1301
- return "Error", "Error", "0.00"
1302
 
1303
  def on_scan(repo):
1304
  try:
@@ -1316,21 +1180,10 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1316
  logger.error(f"Scan error: {e}")
1317
  return gr.update(choices=[], value=None), f"🔴 Scan failed: {str(e)}"
1318
 
1319
- def on_boot(repo, file):
1320
- try:
1321
- if not repo or not file:
1322
- yield "🔴 ERROR: Repository and filename required", gr.update()
1323
- return
1324
-
1325
- yield "⚙️ System: Initiating boot sequence...", gr.update()
1326
- time.sleep(0.5)
1327
-
1328
- result = kernel.boot_kernel(repo, file, session_id)
1329
- yield result, gr.update()
1330
-
1331
- except Exception as e:
1332
- logger.error(f"Boot UI error: {e}")
1333
- yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
1334
 
1335
  def on_batch_upgrade():
1336
  success, msg = token_manager.purchase_batch_upgrade(session_id)
@@ -1359,7 +1212,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1359
 
1360
  # Event handlers
1361
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
1362
- boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
1363
 
1364
  # Token purchases
1365
  batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
 
785
  logger.error(f"[BOOT] Unexpected error: {e}")
786
  nuclear_ram_clear()
787
  return f"🔴 BOOT FAILURE: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
788
 
789
  def stitch_cache(self, ghost_text: str) -> str:
790
  """Prime KV cache with ghost context"""
 
806
  threading.Thread(target=_bg_eval, daemon=True).start()
807
  return "⚡ Primed"
808
 
809
+ def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, profile: gr.OAuthProfile | None) -> Generator:
810
  username = profile.username if profile else "anonymous"
811
  # Update activity timestamp
812
  self.update_activity()
 
1155
  # --- UI LOGIC ---
1156
  def update_stats(profile: gr.OAuthProfile | None):
1157
  try:
1158
+ m = ResourceMonitor.get_metrics()
1159
+ current_user = profile.username if profile else "anonymous"
1160
+ balance = token_manager.get_balance(current_user)
1161
+
1162
+ return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
1163
+ except Exception as e:
1164
+ logger.error(f"Stats update error: {e}")
1165
+ return "Error", "Error", "0.00"
1166
 
1167
  def on_scan(repo):
1168
  try:
 
1180
  logger.error(f"Scan error: {e}")
1181
  return gr.update(choices=[], value=None), f"🔴 Scan failed: {str(e)}"
1182
 
1183
+ def on_boot(repo, file, profile: gr.OAuthProfile | None):
1184
+ username = profile.username if profile else "anonymous"
1185
+ result = kernel.boot_kernel(repo, file, username)
1186
+ return result
 
 
 
 
 
 
 
 
 
 
 
1187
 
1188
  def on_batch_upgrade():
1189
  success, msg = token_manager.purchase_batch_upgrade(session_id)
 
1212
 
1213
  # Event handlers
1214
  scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
1215
+ boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status])
1216
 
1217
  # Token purchases
1218
  batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])