Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -785,142 +785,6 @@ class ZeroEngine:
|
|
| 785 |
logger.error(f"[BOOT] Unexpected error: {e}")
|
| 786 |
nuclear_ram_clear()
|
| 787 |
return f"🔴 BOOT FAILURE: {str(e)}"
|
| 788 |
-
"""HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
|
| 789 |
-
try:
|
| 790 |
-
if not repo or not filename:
|
| 791 |
-
return "🔴 ERROR: Repository or filename missing"
|
| 792 |
-
|
| 793 |
-
logger.info(f"[BOOT] Starting download: {filename} from {repo}")
|
| 794 |
-
|
| 795 |
-
# DETECT QUANTIZATION FROM FILENAME
|
| 796 |
-
quant_config = self.detect_quantization(filename)
|
| 797 |
-
|
| 798 |
-
# Download with timeout protection
|
| 799 |
-
try:
|
| 800 |
-
path = hf_hub_download(
|
| 801 |
-
repo_id=repo,
|
| 802 |
-
filename=filename,
|
| 803 |
-
token=HF_TOKEN,
|
| 804 |
-
local_files_only=False
|
| 805 |
-
)
|
| 806 |
-
logger.info(f"[BOOT] Download complete: {path}")
|
| 807 |
-
except Exception as e:
|
| 808 |
-
logger.error(f"[BOOT] Download failed: {e}")
|
| 809 |
-
return f"🔴 DOWNLOAD FAILED: {str(e)}"
|
| 810 |
-
|
| 811 |
-
# Check if model is cached (for faster subsequent loads)
|
| 812 |
-
is_cached = model_cache.is_cached(path)
|
| 813 |
-
cache_status = "π― CACHED" if is_cached else "π NEW"
|
| 814 |
-
|
| 815 |
-
# Validate before loading
|
| 816 |
-
valid, msg = ResourceMonitor.validate_deployment(path)
|
| 817 |
-
if not valid:
|
| 818 |
-
logger.warning(f"[BOOT] Validation failed: {msg}")
|
| 819 |
-
return f"🔴 VALIDATION FAILED: {msg}"
|
| 820 |
-
|
| 821 |
-
logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations...")
|
| 822 |
-
|
| 823 |
-
# Apply NUMA optimization
|
| 824 |
-
if NUMA_OPTIMIZE:
|
| 825 |
-
self.optimize_numa()
|
| 826 |
-
|
| 827 |
-
# Load model with MAXIMUM PERFORMANCE SETTINGS
|
| 828 |
-
with self.kernel_lock:
|
| 829 |
-
# WRECK OLD MODEL - Nuclear option
|
| 830 |
-
if self.llm:
|
| 831 |
-
logger.info("[BOOT] π£ WRECKING old model...")
|
| 832 |
-
try:
|
| 833 |
-
# Wreck the cache first
|
| 834 |
-
model_cache.wreck_old_model_cache()
|
| 835 |
-
|
| 836 |
-
# Delete the model
|
| 837 |
-
del self.llm
|
| 838 |
-
self.llm = None
|
| 839 |
-
|
| 840 |
-
# Nuclear RAM clear
|
| 841 |
-
nuclear_ram_clear()
|
| 842 |
-
|
| 843 |
-
logger.info("[BOOT] ✅ Old model DESTROYED")
|
| 844 |
-
except Exception as e:
|
| 845 |
-
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 846 |
-
|
| 847 |
-
# Calculate optimal batch size based on quantization and available RAM
|
| 848 |
-
vm = psutil.virtual_memory()
|
| 849 |
-
available_ram_gb = vm.available / (1024**3)
|
| 850 |
-
|
| 851 |
-
# MASSIVE batch sizes for quantized models
|
| 852 |
-
base_batch = int(256 * available_ram_gb / 4)
|
| 853 |
-
optimal_batch = int(base_batch * quant_config["batch_multiplier"])
|
| 854 |
-
optimal_batch = max(512, min(4096, optimal_batch)) # Clamp between 512-4096
|
| 855 |
-
|
| 856 |
-
# Context size based on quantization
|
| 857 |
-
optimal_ctx = quant_config["ctx_size"]
|
| 858 |
-
|
| 859 |
-
# Thread count with quantization-specific boost
|
| 860 |
-
optimal_threads = int(OPTIMAL_THREADS * quant_config["threads_boost"])
|
| 861 |
-
optimal_threads = max(2, min(optimal_threads, psutil.cpu_count(logical=False)))
|
| 862 |
-
|
| 863 |
-
try:
|
| 864 |
-
logger.info(f"[BOOT] Initializing {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
| 865 |
-
|
| 866 |
-
# Preload cache if available (simulates faster warmup)
|
| 867 |
-
if is_cached:
|
| 868 |
-
model_cache.preload_cache(path)
|
| 869 |
-
|
| 870 |
-
# ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
|
| 871 |
-
self.llm = Llama(
|
| 872 |
-
model_path=path,
|
| 873 |
-
n_ctx=optimal_ctx, # Dynamic context based on quant
|
| 874 |
-
n_threads=optimal_threads, # Optimized thread count
|
| 875 |
-
n_threads_batch=optimal_threads, # Batch processing threads
|
| 876 |
-
use_mmap=USE_MMAP, # Memory-mapped weights (fast loading)
|
| 877 |
-
use_mlock=MLOCK_MODEL, # Lock in RAM (prevent swap thrashing)
|
| 878 |
-
n_batch=optimal_batch, # MASSIVE batch size
|
| 879 |
-
n_gpu_layers=0, # CPU-only mode
|
| 880 |
-
flash_attn=FLASH_ATTENTION, # Flash Attention (2x faster)
|
| 881 |
-
type_k=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
|
| 882 |
-
type_v=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
|
| 883 |
-
rope_scaling_type=0, # Linear RoPE scaling
|
| 884 |
-
rope_freq_scale=ROPE_SCALING, # RoPE frequency scale
|
| 885 |
-
numa=NUMA_OPTIMIZE, # NUMA optimization
|
| 886 |
-
verbose=False,
|
| 887 |
-
logits_all=False, # Only compute final logits (faster)
|
| 888 |
-
embedding=False, # Disable embeddings (not needed)
|
| 889 |
-
offload_kqv=OFFLOAD_KQV, # No offload on CPU
|
| 890 |
-
f16_kv=False # Use quantized KV cache instead
|
| 891 |
-
)
|
| 892 |
-
|
| 893 |
-
self.active_model_info = {"repo": repo, "file": filename, "quant": quant_config['type']}
|
| 894 |
-
self.telemetry.track_load(repo, filename)
|
| 895 |
-
|
| 896 |
-
# Extract and cache TINY signature for faster future loads
|
| 897 |
-
if not is_cached:
|
| 898 |
-
logger.info("[BOOT] Extracting cache signature...")
|
| 899 |
-
signature = model_cache.extract_cache_signature(path)
|
| 900 |
-
if signature:
|
| 901 |
-
model_cache.save_to_cache(path, signature)
|
| 902 |
-
|
| 903 |
-
# Warm-up inference to populate caches
|
| 904 |
-
logger.info("[BOOT] Warming up model caches...")
|
| 905 |
-
try:
|
| 906 |
-
self.llm("Warmup", max_tokens=1, stream=False)
|
| 907 |
-
force_gc() # Clear warmup artifacts
|
| 908 |
-
except:
|
| 909 |
-
pass
|
| 910 |
-
|
| 911 |
-
logger.info("[BOOT] π HYPER-OPTIMIZED MODEL READY!")
|
| 912 |
-
return f"🟢 {quant_config['type']} KERNEL {cache_status} | T:{optimal_threads} | B:{optimal_batch} | Ctx:{optimal_ctx}"
|
| 913 |
-
|
| 914 |
-
except Exception as e:
|
| 915 |
-
logger.error(f"[BOOT] Model loading failed: {e}")
|
| 916 |
-
self.llm = None
|
| 917 |
-
nuclear_ram_clear()
|
| 918 |
-
return f"🔴 LOAD FAILED: {str(e)}"
|
| 919 |
-
|
| 920 |
-
except Exception as e:
|
| 921 |
-
logger.error(f"[BOOT] Unexpected error: {e}")
|
| 922 |
-
nuclear_ram_clear()
|
| 923 |
-
return f"🔴 BOOT FAILURE: {str(e)}"
|
| 924 |
|
| 925 |
def stitch_cache(self, ghost_text: str) -> str:
|
| 926 |
"""Prime KV cache with ghost context"""
|
|
@@ -942,7 +806,7 @@ class ZeroEngine:
|
|
| 942 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 943 |
return "⚡ Primed"
|
| 944 |
|
| 945 |
-
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str,
|
| 946 |
username = profile.username if profile else "anonymous"
|
| 947 |
# Update activity timestamp
|
| 948 |
self.update_activity()
|
|
@@ -1291,14 +1155,14 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1291 |
# --- UI LOGIC ---
|
| 1292 |
def update_stats(profile: gr.OAuthProfile | None):
|
| 1293 |
try:
|
| 1294 |
-
|
| 1295 |
-
|
| 1296 |
-
|
| 1297 |
-
|
| 1298 |
-
|
| 1299 |
-
|
| 1300 |
-
|
| 1301 |
-
|
| 1302 |
|
| 1303 |
def on_scan(repo):
|
| 1304 |
try:
|
|
@@ -1316,21 +1180,10 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1316 |
logger.error(f"Scan error: {e}")
|
| 1317 |
return gr.update(choices=[], value=None), f"🔴 Scan failed: {str(e)}"
|
| 1318 |
|
| 1319 |
-
def on_boot(repo, file):
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
-
|
| 1323 |
-
return
|
| 1324 |
-
|
| 1325 |
-
yield "⚙️ System: Initiating boot sequence...", gr.update()
|
| 1326 |
-
time.sleep(0.5)
|
| 1327 |
-
|
| 1328 |
-
result = kernel.boot_kernel(repo, file, session_id)
|
| 1329 |
-
yield result, gr.update()
|
| 1330 |
-
|
| 1331 |
-
except Exception as e:
|
| 1332 |
-
logger.error(f"Boot UI error: {e}")
|
| 1333 |
-
yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
|
| 1334 |
|
| 1335 |
def on_batch_upgrade():
|
| 1336 |
success, msg = token_manager.purchase_batch_upgrade(session_id)
|
|
@@ -1359,7 +1212,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1359 |
|
| 1360 |
# Event handlers
|
| 1361 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
| 1362 |
-
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status
|
| 1363 |
|
| 1364 |
# Token purchases
|
| 1365 |
batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
|
|
|
|
| 785 |
logger.error(f"[BOOT] Unexpected error: {e}")
|
| 786 |
nuclear_ram_clear()
|
| 787 |
return f"🔴 BOOT FAILURE: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
|
| 789 |
def stitch_cache(self, ghost_text: str) -> str:
|
| 790 |
"""Prime KV cache with ghost context"""
|
|
|
|
| 806 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 807 |
return "⚡ Primed"
|
| 808 |
|
| 809 |
+
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, profile: gr.OAuthProfile | None) -> Generator:
|
| 810 |
username = profile.username if profile else "anonymous"
|
| 811 |
# Update activity timestamp
|
| 812 |
self.update_activity()
|
|
|
|
| 1155 |
# --- UI LOGIC ---
|
| 1156 |
def update_stats(profile: gr.OAuthProfile | None):
    """Build the three display strings shown by the stats widgets.

    Args:
        profile: OAuth profile of the current visitor, or None. A missing
            profile is treated as the user "anonymous".

    Returns:
        A 3-tuple of strings: RAM usage as "<used>/<total> GB", CPU usage as
        "<pct>%", and the token balance of the current user. If anything
        raises, the error is logged and ("Error", "Error", "0.00") is
        returned instead.
    """
    try:
        metrics = ResourceMonitor.get_metrics()

        if profile:
            current_user = profile.username
        else:
            current_user = "anonymous"
        balance = token_manager.get_balance(current_user)

        ram_text = f"{metrics['ram_used_gb']}/{metrics['ram_total_gb']} GB"
        cpu_text = f"{metrics['cpu_usage_pct']}%"
        return ram_text, cpu_text, f"{balance}"
    except Exception as e:
        logger.error(f"Stats update error: {e}")
        return "Error", "Error", "0.00"
|
| 1166 |
|
| 1167 |
def on_scan(repo):
|
| 1168 |
try:
|
|
|
|
| 1180 |
logger.error(f"Scan error: {e}")
|
| 1181 |
return gr.update(choices=[], value=None), f"🔴 Scan failed: {str(e)}"
|
| 1182 |
|
| 1183 |
+
def on_boot(repo, file, profile: gr.OAuthProfile | None):
    """Boot the selected model file and return the kernel's status message.

    Args:
        repo: Repository identifier supplied by the repo input component.
        file: Model filename selected in the quantization dropdown.
        profile: OAuth profile of the current visitor, or None. A missing
            profile is treated as the user "anonymous".

    Returns:
        The status string produced by ``kernel.boot_kernel``, or a
        "BOOT ERROR" status string if the call raises.
    """
    try:
        username = profile.username if profile else "anonymous"
        return kernel.boot_kernel(repo, file, username)
    except Exception as e:
        # Same pattern as the neighbouring UI callbacks: log the failure and
        # hand back a status string instead of letting the exception escape
        # into the event handler.
        logger.error(f"Boot UI error: {e}")
        return f"🔴 BOOT ERROR: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1187 |
|
| 1188 |
def on_batch_upgrade():
|
| 1189 |
success, msg = token_manager.purchase_batch_upgrade(session_id)
|
|
|
|
| 1212 |
|
| 1213 |
# Event handlers
|
| 1214 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
| 1215 |
+
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status])
|
| 1216 |
|
| 1217 |
# Token purchases
|
| 1218 |
batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
|