Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,47 +31,32 @@ SYSTEM_RESERVE_MB = 500
|
|
| 31 |
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
|
| 32 |
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
| 33 |
|
| 34 |
-
# --- TOKEN SYSTEM CONFIG ---
|
| 35 |
-
MONTHLY_TOKEN_CREDITS = 100.0
|
| 36 |
-
TOKEN_COST_PER_100MS = 0.001
|
| 37 |
-
BATCH_UPGRADE_BASE_COST = 0.00005 # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
|
| 38 |
-
TOKEN_UPGRADE_COST_PER_1K = 0.0001 # Cost per 1000 extra tokens
|
| 39 |
-
|
| 40 |
# --- SPEED OPTIMIZATION CONFIG ---
|
| 41 |
-
FLASH_ATTENTION =
|
| 42 |
-
KV_CACHE_QUANTIZATION = True #
|
| 43 |
-
CONTINUOUS_BATCHING =
|
| 44 |
-
SPECULATIVE_DECODE = False # CPU
|
| 45 |
-
MLOCK_MODEL = False #
|
| 46 |
-
USE_MMAP = True #
|
| 47 |
-
OFFLOAD_KQV = False # CPU-only
|
| 48 |
-
OPTIMAL_THREADS = psutil.cpu_count(logical=
|
| 49 |
-
ROPE_SCALING = 1.0
|
| 50 |
-
NUMA_OPTIMIZE =
|
| 51 |
-
AGGRESSIVE_GC = True
|
| 52 |
-
|
| 53 |
-
# Quantization detection
|
| 54 |
QUANT_OPTIMIZATIONS = {
|
| 55 |
-
"BF16": {"batch_multiplier": 0.
|
| 56 |
-
"F16": {"batch_multiplier": 0.
|
| 57 |
-
"Q8_0": {"batch_multiplier":
|
| 58 |
-
"Q6_K": {"batch_multiplier":
|
| 59 |
-
"Q5_K_M": {"batch_multiplier": 1.
|
| 60 |
-
"Q5_K_S": {"batch_multiplier": 1.
|
| 61 |
-
"Q4_K_M": {"batch_multiplier":
|
| 62 |
-
"Q4_K_S": {"batch_multiplier":
|
| 63 |
-
"Q4_0": {"batch_multiplier":
|
| 64 |
-
"Q3_K_M": {"batch_multiplier":
|
| 65 |
-
"Q2_K": {"batch_multiplier":
|
| 66 |
-
}
|
| 67 |
-
|
| 68 |
-
# Model format/architecture detection patterns
|
| 69 |
-
MODEL_FORMATS = {
|
| 70 |
-
"llama": {"pattern": ["llama", "mistral", "mixtral"], "template": "llama"},
|
| 71 |
-
"gemma": {"pattern": ["gemma"], "template": "gemma"},
|
| 72 |
-
"phi": {"pattern": ["phi"], "template": "phi"},
|
| 73 |
-
"qwen": {"pattern": ["qwen"], "template": "chatml"},
|
| 74 |
-
"deepseek": {"pattern": ["deepseek"], "template": "deepseek"},
|
| 75 |
}
|
| 76 |
|
| 77 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
|
@@ -222,212 +207,6 @@ class ModelCacheManager:
|
|
| 222 |
logger.error(f"[WRECKER] Failed: {e}")
|
| 223 |
return False
|
| 224 |
|
| 225 |
-
# --- TOKEN MANAGER ---
|
| 226 |
-
class TokenManager:
|
| 227 |
-
def __init__(self):
|
| 228 |
-
self.user_tokens = {} # {username: {"balance": float, "start_time": float, "purchases": {}}}
|
| 229 |
-
self.owner_username = "turtle170" # Owner gets infinite tokens
|
| 230 |
-
|
| 231 |
-
def is_owner(self, username: str) -> bool:
|
| 232 |
-
"""Check if user is the owner"""
|
| 233 |
-
if not username:
|
| 234 |
-
return False
|
| 235 |
-
return username.lower() == self.owner_username.lower()
|
| 236 |
-
|
| 237 |
-
def initialize_user(self, username: str):
|
| 238 |
-
"""Initialize new user with monthly credits (or infinite for owner)"""
|
| 239 |
-
if not username:
|
| 240 |
-
username = "anonymous"
|
| 241 |
-
|
| 242 |
-
if username not in self.user_tokens:
|
| 243 |
-
# Owner gets infinite tokens
|
| 244 |
-
if self.is_owner(username):
|
| 245 |
-
self.user_tokens[username] = {
|
| 246 |
-
"balance": float('inf'),
|
| 247 |
-
"start_time": time.time(),
|
| 248 |
-
"purchases": {"batch_multiplier": 1, "token_limit": 2048},
|
| 249 |
-
"total_spent": 0.0,
|
| 250 |
-
"is_owner": True,
|
| 251 |
-
"username": username
|
| 252 |
-
}
|
| 253 |
-
logger.info(f"[TOKEN] 👑 OWNER {username} initialized with INFINITE tokens!")
|
| 254 |
-
else:
|
| 255 |
-
self.user_tokens[username] = {
|
| 256 |
-
"balance": MONTHLY_TOKEN_CREDITS,
|
| 257 |
-
"start_time": time.time(),
|
| 258 |
-
"purchases": {"batch_multiplier": 1, "token_limit": 2048},
|
| 259 |
-
"total_spent": 0.0,
|
| 260 |
-
"is_owner": False,
|
| 261 |
-
"username": username,
|
| 262 |
-
"last_reset": time.time()
|
| 263 |
-
}
|
| 264 |
-
logger.info(f"[TOKEN] New user {username}: {MONTHLY_TOKEN_CREDITS} tokens")
|
| 265 |
-
|
| 266 |
-
def check_monthly_reset(self, username: str):
|
| 267 |
-
"""Reset tokens if a month has passed"""
|
| 268 |
-
if not username or username not in self.user_tokens:
|
| 269 |
-
return
|
| 270 |
-
|
| 271 |
-
if self.user_tokens[username].get("is_owner", False):
|
| 272 |
-
return # Owner never needs reset
|
| 273 |
-
|
| 274 |
-
last_reset = self.user_tokens[username].get("last_reset", time.time())
|
| 275 |
-
month_in_seconds = 30 * 24 * 60 * 60 # 30 days
|
| 276 |
-
|
| 277 |
-
if time.time() - last_reset > month_in_seconds:
|
| 278 |
-
self.user_tokens[username]["balance"] = MONTHLY_TOKEN_CREDITS
|
| 279 |
-
self.user_tokens[username]["last_reset"] = time.time()
|
| 280 |
-
self.user_tokens[username]["total_spent"] = 0.0
|
| 281 |
-
logger.info(f"[TOKEN] Monthly reset for {username}: {MONTHLY_TOKEN_CREDITS} tokens")
|
| 282 |
-
|
| 283 |
-
def charge_usage(self, username: str, duration_ms: float) -> bool:
|
| 284 |
-
"""Charge user for inference time. Returns True if successful. Owner never charged."""
|
| 285 |
-
if not username:
|
| 286 |
-
username = "anonymous"
|
| 287 |
-
|
| 288 |
-
self.initialize_user(username)
|
| 289 |
-
self.check_monthly_reset(username)
|
| 290 |
-
|
| 291 |
-
# Owner never gets charged
|
| 292 |
-
if self.user_tokens[username].get("is_owner", False):
|
| 293 |
-
return True
|
| 294 |
-
|
| 295 |
-
cost = (duration_ms / 100.0) * TOKEN_COST_PER_100MS
|
| 296 |
-
|
| 297 |
-
# Check if user has enough balance
|
| 298 |
-
if self.user_tokens[username]["balance"] <= 0:
|
| 299 |
-
logger.warning(f"[TOKEN] ❌ {username} has 0 tokens! Access denied.")
|
| 300 |
-
return False
|
| 301 |
-
|
| 302 |
-
if self.user_tokens[username]["balance"] >= cost:
|
| 303 |
-
self.user_tokens[username]["balance"] -= cost
|
| 304 |
-
self.user_tokens[username]["balance"] = max(0, self.user_tokens[username]["balance"]) # Never go below 0
|
| 305 |
-
self.user_tokens[username]["total_spent"] += cost
|
| 306 |
-
logger.info(f"[TOKEN] Charged {cost:.4f} tokens ({duration_ms:.0f}ms) | Remaining: {self.user_tokens[username]['balance']:.2f}")
|
| 307 |
-
return True
|
| 308 |
-
else:
|
| 309 |
-
# Insufficient balance - set to 0 and deny
|
| 310 |
-
self.user_tokens[username]["balance"] = 0
|
| 311 |
-
logger.warning(f"[TOKEN] ❌ Insufficient balance! {username} now at 0 tokens.")
|
| 312 |
-
return False
|
| 313 |
-
|
| 314 |
-
def can_use_engine(self, username: str) -> tuple:
|
| 315 |
-
"""Check if user can use the engine. Returns (bool, message)"""
|
| 316 |
-
if not username:
|
| 317 |
-
username = "anonymous"
|
| 318 |
-
|
| 319 |
-
self.initialize_user(username)
|
| 320 |
-
self.check_monthly_reset(username)
|
| 321 |
-
|
| 322 |
-
if self.user_tokens[username].get("is_owner", False):
|
| 323 |
-
return True, "👑 Owner access granted"
|
| 324 |
-
|
| 325 |
-
balance = self.user_tokens[username]["balance"]
|
| 326 |
-
|
| 327 |
-
if balance <= 0:
|
| 328 |
-
last_reset = self.user_tokens[username].get("last_reset", time.time())
|
| 329 |
-
time_until_reset = 30 * 24 * 60 * 60 - (time.time() - last_reset)
|
| 330 |
-
days_left = int(time_until_reset / (24 * 60 * 60))
|
| 331 |
-
return False, f"❌ Out of tokens! Resets in {days_left} days. Current balance: 0.00"
|
| 332 |
-
|
| 333 |
-
return True, f"✅ Access granted. Balance: {balance:.2f} tokens"
|
| 334 |
-
|
| 335 |
-
def purchase_batch_upgrade(self, username: str) -> tuple:
|
| 336 |
-
"""Purchase batch size upgrade (exponential cost). Free for owner."""
|
| 337 |
-
if not username:
|
| 338 |
-
return False, "❌ Please login first"
|
| 339 |
-
|
| 340 |
-
self.initialize_user(username)
|
| 341 |
-
|
| 342 |
-
# Owner gets free upgrades
|
| 343 |
-
if self.user_tokens[username].get("is_owner", False):
|
| 344 |
-
current_mult = self.user_tokens[username]["purchases"]["batch_multiplier"]
|
| 345 |
-
self.user_tokens[username]["purchases"]["batch_multiplier"] = current_mult * 2
|
| 346 |
-
new_mult = current_mult * 2
|
| 347 |
-
logger.info(f"[TOKEN] 👑 OWNER free batch upgrade: {current_mult}x → {new_mult}x")
|
| 348 |
-
return True, f"👑 FREE UPGRADE! Batch now {new_mult}x!"
|
| 349 |
-
|
| 350 |
-
current_mult = self.user_tokens[username]["purchases"]["batch_multiplier"]
|
| 351 |
-
upgrade_level = int(math.log2(current_mult)) if current_mult > 1 else 0
|
| 352 |
-
cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)
|
| 353 |
-
|
| 354 |
-
if self.user_tokens[username]["balance"] >= cost:
|
| 355 |
-
self.user_tokens[username]["balance"] -= cost
|
| 356 |
-
self.user_tokens[username]["purchases"]["batch_multiplier"] = current_mult * 2
|
| 357 |
-
new_mult = current_mult * 2
|
| 358 |
-
logger.info(f"[TOKEN] Batch upgrade: {current_mult}x → {new_mult}x | Cost: {cost:.5f}")
|
| 359 |
-
return True, f"✅ Batch upgraded to {new_mult}x! (-{cost:.5f} tokens)"
|
| 360 |
-
else:
|
| 361 |
-
return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[username]['balance']:.2f}"
|
| 362 |
-
|
| 363 |
-
def purchase_token_upgrade(self, username: str, extra_tokens: int = 1000) -> tuple:
|
| 364 |
-
"""Purchase extra response token length. Free for owner."""
|
| 365 |
-
if not username:
|
| 366 |
-
return False, "❌ Please login first"
|
| 367 |
-
|
| 368 |
-
self.initialize_user(username)
|
| 369 |
-
|
| 370 |
-
# Owner gets free upgrades
|
| 371 |
-
if self.user_tokens[username].get("is_owner", False):
|
| 372 |
-
self.user_tokens[username]["purchases"]["token_limit"] += extra_tokens
|
| 373 |
-
new_limit = self.user_tokens[username]["purchases"]["token_limit"]
|
| 374 |
-
logger.info(f"[TOKEN] 👑 OWNER free token upgrade: +{extra_tokens} tokens")
|
| 375 |
-
return True, f"👑 FREE UPGRADE! Token limit now {new_limit}!"
|
| 376 |
-
|
| 377 |
-
cost = (extra_tokens / 1000) * TOKEN_UPGRADE_COST_PER_1K
|
| 378 |
-
|
| 379 |
-
if self.user_tokens[username]["balance"] >= cost:
|
| 380 |
-
self.user_tokens[username]["balance"] -= cost
|
| 381 |
-
self.user_tokens[username]["purchases"]["token_limit"] += extra_tokens
|
| 382 |
-
new_limit = self.user_tokens[username]["purchases"]["token_limit"]
|
| 383 |
-
logger.info(f"[TOKEN] Token limit upgrade: +{extra_tokens} tokens | Cost: {cost:.5f}")
|
| 384 |
-
return True, f"✅ Token limit now {new_limit}! (-{cost:.5f} tokens)"
|
| 385 |
-
else:
|
| 386 |
-
return False, f"❌ Insufficient tokens! Need {cost:.5f}, have {self.user_tokens[username]['balance']:.2f}"
|
| 387 |
-
|
| 388 |
-
def get_balance(self, username: str) -> float:
|
| 389 |
-
"""Get user's current token balance"""
|
| 390 |
-
if not username:
|
| 391 |
-
username = "anonymous"
|
| 392 |
-
|
| 393 |
-
self.initialize_user(username)
|
| 394 |
-
self.check_monthly_reset(username)
|
| 395 |
-
|
| 396 |
-
balance = self.user_tokens[username]["balance"]
|
| 397 |
-
|
| 398 |
-
# Show ∞ for owner
|
| 399 |
-
if balance == float('inf'):
|
| 400 |
-
return balance
|
| 401 |
-
|
| 402 |
-
return round(max(0, balance), 2) # Never show negative
|
| 403 |
-
|
| 404 |
-
def get_purchases(self, username: str) -> dict:
|
| 405 |
-
"""Get user's current purchases"""
|
| 406 |
-
if not username:
|
| 407 |
-
username = "anonymous"
|
| 408 |
-
|
| 409 |
-
self.initialize_user(username)
|
| 410 |
-
return self.user_tokens[username]["purchases"]
|
| 411 |
-
|
| 412 |
-
def end_session(self, username: str):
|
| 413 |
-
"""End user session and log stats"""
|
| 414 |
-
if not username:
|
| 415 |
-
return "No active session found."
|
| 416 |
-
|
| 417 |
-
if username in self.user_tokens:
|
| 418 |
-
stats = self.user_tokens[username]
|
| 419 |
-
|
| 420 |
-
if stats.get("is_owner", False):
|
| 421 |
-
return f"👑 Owner session ended. Welcome back anytime, {stats['username']}!"
|
| 422 |
-
|
| 423 |
-
logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
|
| 424 |
-
return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
|
| 425 |
-
return "No active session found."
|
| 426 |
-
|
| 427 |
-
# Global token manager
|
| 428 |
-
import math
|
| 429 |
-
token_manager = TokenManager()
|
| 430 |
-
|
| 431 |
# Global cache manager
|
| 432 |
model_cache = ModelCacheManager()
|
| 433 |
|
|
@@ -493,7 +272,7 @@ class ZeroEngine:
|
|
| 493 |
self.api = HfApi(token=HF_TOKEN)
|
| 494 |
self.telemetry = TelemetryManager(self.api)
|
| 495 |
self.llm: Optional[Llama] = None
|
| 496 |
-
self.active_model_info = {"repo": "", "file": ""
|
| 497 |
self.kernel_lock = threading.Lock()
|
| 498 |
self.is_prefilling = False
|
| 499 |
self.perf_stats = {
|
|
@@ -503,9 +282,9 @@ class ZeroEngine:
|
|
| 503 |
"peak_tps": 0.0,
|
| 504 |
"cache_hits": 0
|
| 505 |
}
|
| 506 |
-
self.prompt_cache = {}
|
| 507 |
self.last_activity = time.time()
|
| 508 |
-
self.idle_timeout = 20
|
| 509 |
self.auto_cleanup_thread = None
|
| 510 |
self.start_idle_monitor()
|
| 511 |
|
|
@@ -514,29 +293,6 @@ class ZeroEngine:
|
|
| 514 |
self.typing_timer = None
|
| 515 |
self.preprocessed_tokens = None
|
| 516 |
|
| 517 |
-
# Custom parameters (user-configurable)
|
| 518 |
-
self.custom_params = {
|
| 519 |
-
"temperature": 0.7,
|
| 520 |
-
"top_p": 0.95,
|
| 521 |
-
"top_k": 40,
|
| 522 |
-
"repeat_penalty": 1.1,
|
| 523 |
-
"batch_size_override": None, # None = auto
|
| 524 |
-
"max_tokens_override": None # None = auto
|
| 525 |
-
}
|
| 526 |
-
|
| 527 |
-
def detect_model_format(self, filename: str, repo: str) -> str:
|
| 528 |
-
"""Auto-detect model format/architecture from filename and repo"""
|
| 529 |
-
combined = f"{repo.lower()} {filename.lower()}"
|
| 530 |
-
|
| 531 |
-
for format_name, format_info in MODEL_FORMATS.items():
|
| 532 |
-
for pattern in format_info["pattern"]:
|
| 533 |
-
if pattern in combined:
|
| 534 |
-
logger.info(f"[FORMAT-DETECT] Detected {format_name.upper()} architecture")
|
| 535 |
-
return format_name
|
| 536 |
-
|
| 537 |
-
logger.warning(f"[FORMAT-DETECT] Unknown format, defaulting to llama")
|
| 538 |
-
return "llama"
|
| 539 |
-
|
| 540 |
def detect_quantization(self, filename: str) -> dict:
|
| 541 |
"""Detect quantization method from filename and return optimizations"""
|
| 542 |
filename_upper = filename.upper()
|
|
@@ -633,158 +389,7 @@ class ZeroEngine:
|
|
| 633 |
logger.error(f"Scan error: {e}")
|
| 634 |
return []
|
| 635 |
|
| 636 |
-
def boot_kernel(self, repo: str, filename: str
|
| 637 |
-
"""HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
|
| 638 |
-
try:
|
| 639 |
-
if not repo or not filename:
|
| 640 |
-
return "🔴 ERROR: Repository or filename missing"
|
| 641 |
-
|
| 642 |
-
logger.info(f"[BOOT] Starting download: {filename} from {repo}")
|
| 643 |
-
|
| 644 |
-
# DETECT QUANTIZATION FROM FILENAME
|
| 645 |
-
quant_config = self.detect_quantization(filename)
|
| 646 |
-
|
| 647 |
-
# DETECT MODEL FORMAT/ARCHITECTURE
|
| 648 |
-
model_format = self.detect_model_format(filename, repo)
|
| 649 |
-
|
| 650 |
-
# Download with timeout protection
|
| 651 |
-
try:
|
| 652 |
-
path = hf_hub_download(
|
| 653 |
-
repo_id=repo,
|
| 654 |
-
filename=filename,
|
| 655 |
-
token=HF_TOKEN,
|
| 656 |
-
local_files_only=False
|
| 657 |
-
)
|
| 658 |
-
logger.info(f"[BOOT] Download complete: {path}")
|
| 659 |
-
except Exception as e:
|
| 660 |
-
logger.error(f"[BOOT] Download failed: {e}")
|
| 661 |
-
return f"🔴 DOWNLOAD FAILED: {str(e)}"
|
| 662 |
-
|
| 663 |
-
# Check if model is cached
|
| 664 |
-
is_cached = model_cache.is_cached(path)
|
| 665 |
-
cache_status = "🎯 CACHED" if is_cached else "🆕 NEW"
|
| 666 |
-
|
| 667 |
-
# Validate before loading
|
| 668 |
-
valid, msg = ResourceMonitor.validate_deployment(path)
|
| 669 |
-
if not valid:
|
| 670 |
-
logger.warning(f"[BOOT] Validation failed: {msg}")
|
| 671 |
-
return f"🔴 VALIDATION FAILED: {msg}"
|
| 672 |
-
|
| 673 |
-
logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
|
| 674 |
-
|
| 675 |
-
# Load model with MAXIMUM PERFORMANCE SETTINGS
|
| 676 |
-
with self.kernel_lock:
|
| 677 |
-
# WRECK OLD MODEL
|
| 678 |
-
if self.llm:
|
| 679 |
-
logger.info("[BOOT] 💣 WRECKING old model...")
|
| 680 |
-
try:
|
| 681 |
-
model_cache.wreck_old_model_cache()
|
| 682 |
-
del self.llm
|
| 683 |
-
self.llm = None
|
| 684 |
-
nuclear_ram_clear()
|
| 685 |
-
logger.info("[BOOT] ✅ Old model DESTROYED")
|
| 686 |
-
except Exception as e:
|
| 687 |
-
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 688 |
-
|
| 689 |
-
# Calculate optimal parameters with token purchases
|
| 690 |
-
vm = psutil.virtual_memory()
|
| 691 |
-
available_ram_gb = vm.available / (1024**3)
|
| 692 |
-
|
| 693 |
-
# CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
|
| 694 |
-
# Base calculation: use more RAM for batching on CPU
|
| 695 |
-
base_batch = int(512 * available_ram_gb / 8) # More aggressive base
|
| 696 |
-
optimal_batch = int(base_batch * quant_config["batch_multiplier"])
|
| 697 |
-
|
| 698 |
-
# Apply user's batch multiplier from token purchases
|
| 699 |
-
if session_id:
|
| 700 |
-
user_batch_mult = token_manager.get_purchases(session_id)["batch_multiplier"]
|
| 701 |
-
optimal_batch = int(optimal_batch * user_batch_mult)
|
| 702 |
-
logger.info(f"[TOKEN] User batch multiplier: {user_batch_mult}x")
|
| 703 |
-
|
| 704 |
-
# CPU can handle larger batches with quantized models
|
| 705 |
-
optimal_batch = max(1024, min(8192, optimal_batch)) # 1024-8192 range for CPU
|
| 706 |
-
|
| 707 |
-
# Context size
|
| 708 |
-
optimal_ctx = quant_config["ctx_size"]
|
| 709 |
-
|
| 710 |
-
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 711 |
-
if model_format == "gemma":
|
| 712 |
-
optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
|
| 713 |
-
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 714 |
-
|
| 715 |
-
# Thread optimization - use ALL threads on CPU (including hyperthreading)
|
| 716 |
-
optimal_threads = psutil.cpu_count(logical=True) # ALL logical cores
|
| 717 |
-
logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")
|
| 718 |
-
|
| 719 |
-
try:
|
| 720 |
-
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
| 721 |
-
|
| 722 |
-
# Preload cache if available
|
| 723 |
-
if is_cached:
|
| 724 |
-
model_cache.preload_cache(path)
|
| 725 |
-
|
| 726 |
-
# ULTRA-OPTIMIZED CPU-ONLY INITIALIZATION
|
| 727 |
-
init_params = {
|
| 728 |
-
"model_path": path,
|
| 729 |
-
"n_ctx": optimal_ctx,
|
| 730 |
-
"n_threads": optimal_threads,
|
| 731 |
-
"n_threads_batch": optimal_threads,
|
| 732 |
-
"use_mmap": USE_MMAP, # Critical for CPU
|
| 733 |
-
"use_mlock": MLOCK_MODEL, # Let OS manage memory
|
| 734 |
-
"n_batch": optimal_batch, # MASSIVE batches for CPU
|
| 735 |
-
"n_gpu_layers": 0, # CPU-only
|
| 736 |
-
"rope_scaling_type": 0,
|
| 737 |
-
"rope_freq_scale": ROPE_SCALING,
|
| 738 |
-
"verbose": False,
|
| 739 |
-
"logits_all": False,
|
| 740 |
-
"embedding": False,
|
| 741 |
-
"f16_kv": False # Use quantized KV cache
|
| 742 |
-
}
|
| 743 |
-
|
| 744 |
-
# Add KV quantization only if not Gemma (Gemma can be finicky)
|
| 745 |
-
if model_format != "gemma" and KV_CACHE_QUANTIZATION:
|
| 746 |
-
init_params["type_k"] = 2
|
| 747 |
-
init_params["type_v"] = 2
|
| 748 |
-
logger.info("[OPTIM] KV cache quantization enabled (Q4)")
|
| 749 |
-
|
| 750 |
-
self.llm = Llama(**init_params)
|
| 751 |
-
|
| 752 |
-
self.active_model_info = {
|
| 753 |
-
"repo": repo,
|
| 754 |
-
"file": filename,
|
| 755 |
-
"quant": quant_config['type'],
|
| 756 |
-
"format": model_format
|
| 757 |
-
}
|
| 758 |
-
self.telemetry.track_load(repo, filename)
|
| 759 |
-
|
| 760 |
-
# Extract and cache signature
|
| 761 |
-
if not is_cached:
|
| 762 |
-
logger.info("[BOOT] Extracting cache signature...")
|
| 763 |
-
signature = model_cache.extract_cache_signature(path)
|
| 764 |
-
if signature:
|
| 765 |
-
model_cache.save_to_cache(path, signature)
|
| 766 |
-
|
| 767 |
-
# Warm-up
|
| 768 |
-
logger.info("[BOOT] Warming up model caches...")
|
| 769 |
-
try:
|
| 770 |
-
self.llm("Warmup", max_tokens=1, stream=False)
|
| 771 |
-
force_gc()
|
| 772 |
-
except:
|
| 773 |
-
pass
|
| 774 |
-
|
| 775 |
-
logger.info("[BOOT] 🚀 CPU-OPTIMIZED MODEL READY!")
|
| 776 |
-
return f"🟢 {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"
|
| 777 |
-
|
| 778 |
-
except Exception as e:
|
| 779 |
-
logger.error(f"[BOOT] Model loading failed: {e}")
|
| 780 |
-
self.llm = None
|
| 781 |
-
nuclear_ram_clear()
|
| 782 |
-
return f"🔴 LOAD FAILED: {str(e)}"
|
| 783 |
-
|
| 784 |
-
except Exception as e:
|
| 785 |
-
logger.error(f"[BOOT] Unexpected error: {e}")
|
| 786 |
-
nuclear_ram_clear()
|
| 787 |
-
return f"🔴 BOOT FAILURE: {str(e)}"
|
| 788 |
"""HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
|
| 789 |
try:
|
| 790 |
if not repo or not filename:
|
|
@@ -942,7 +547,7 @@ class ZeroEngine:
|
|
| 942 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 943 |
return "⚡ Primed"
|
| 944 |
|
| 945 |
-
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str
|
| 946 |
# Update activity timestamp
|
| 947 |
self.update_activity()
|
| 948 |
|
|
@@ -995,28 +600,23 @@ class ZeroEngine:
|
|
| 995 |
first_token_time = None
|
| 996 |
|
| 997 |
try:
|
| 998 |
-
#
|
| 999 |
-
max_tokens = 2048
|
| 1000 |
-
if username:
|
| 1001 |
-
max_tokens = token_manager.get_purchases(username)["token_limit"]
|
| 1002 |
-
|
| 1003 |
-
# HYPER-OPTIMIZED CPU INFERENCE SETTINGS
|
| 1004 |
stream = self.llm(
|
| 1005 |
formatted_prompt,
|
| 1006 |
-
max_tokens=
|
| 1007 |
stop=["User:", "<|eot_id|>", "\n\n"],
|
| 1008 |
stream=True,
|
| 1009 |
-
temperature=
|
| 1010 |
-
top_p=
|
| 1011 |
-
top_k=
|
| 1012 |
-
repeat_penalty=
|
| 1013 |
-
frequency_penalty=0.0,
|
| 1014 |
-
presence_penalty=0.0,
|
| 1015 |
-
tfs_z=1.0,
|
| 1016 |
-
typical_p=1.0,
|
| 1017 |
-
mirostat_mode=2, #
|
| 1018 |
-
mirostat_tau=5.0,
|
| 1019 |
-
mirostat_eta=0.1,
|
| 1020 |
)
|
| 1021 |
|
| 1022 |
for chunk in stream:
|
|
@@ -1036,19 +636,10 @@ class ZeroEngine:
|
|
| 1036 |
if tps > self.perf_stats["peak_tps"]:
|
| 1037 |
self.perf_stats["peak_tps"] = tps
|
| 1038 |
|
| 1039 |
-
# Charge tokens every second
|
| 1040 |
-
if int(elapsed * 1000) % 1000 < 100 and username: # Every ~1 second
|
| 1041 |
-
token_manager.charge_usage(username, elapsed * 1000)
|
| 1042 |
-
|
| 1043 |
# Update history with streaming content + performance metrics
|
| 1044 |
-
|
| 1045 |
-
history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💰 {balance:.2f} tokens`"
|
| 1046 |
yield history
|
| 1047 |
|
| 1048 |
-
# Final token charge for remaining time
|
| 1049 |
-
if username:
|
| 1050 |
-
token_manager.charge_usage(username, elapsed * 1000)
|
| 1051 |
-
|
| 1052 |
# Update global performance stats
|
| 1053 |
self.perf_stats["total_tokens"] += tokens_count
|
| 1054 |
self.perf_stats["total_time"] += elapsed
|
|
@@ -1172,49 +763,27 @@ h1, h2, h3, h4, h5, h6 {
|
|
| 1172 |
# --- UI INTERFACE ---
|
| 1173 |
kernel = ZeroEngine()
|
| 1174 |
|
| 1175 |
-
|
| 1176 |
-
|
| 1177 |
-
|
| 1178 |
-
|
| 1179 |
-
|
| 1180 |
-
|
| 1181 |
-
|
| 1182 |
-
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
</h1>
|
| 1191 |
-
<p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
|
| 1192 |
-
CPU-Optimized | Token System | Custom Parameters | Auto-Format
|
| 1193 |
-
</p>
|
| 1194 |
-
</div>
|
| 1195 |
-
""")
|
| 1196 |
-
with gr.Column(scale=2):
|
| 1197 |
-
# Token Display
|
| 1198 |
-
gr.HTML("""
|
| 1199 |
-
<div style='text-align: center; padding: 20px; border-radius: 20px;
|
| 1200 |
-
background: linear-gradient(135deg, #7b2ff7 0%, #9b59b6 100%);
|
| 1201 |
-
margin-bottom: 20px; box-shadow: 0 8px 20px rgba(123,47,247,0.3);'>
|
| 1202 |
-
<div style='font-size: 2em; margin-bottom: 5px;'>💰</div>
|
| 1203 |
-
<div id='token-display' style='font-size: 1.8em; font-weight: bold; color: white; font-family: Consolas;'>
|
| 1204 |
-
100.00
|
| 1205 |
-
</div>
|
| 1206 |
-
<div style='font-size: 0.9em; color: #ddd; font-family: Consolas;'>TOKENS</div>
|
| 1207 |
-
</div>
|
| 1208 |
-
""")
|
| 1209 |
-
token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
|
| 1210 |
-
end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
|
| 1211 |
-
session_status = gr.Markdown("", visible=False)
|
| 1212 |
|
| 1213 |
with gr.Row():
|
| 1214 |
with gr.Column(scale=8):
|
| 1215 |
chat_box = gr.Chatbot(
|
| 1216 |
label="Main Engine Feedback",
|
| 1217 |
-
height=
|
| 1218 |
show_label=False,
|
| 1219 |
autoscroll=True,
|
| 1220 |
container=True
|
|
@@ -1229,15 +798,12 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1229 |
)
|
| 1230 |
send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
|
| 1231 |
|
| 1232 |
-
with gr.Column(scale=
|
| 1233 |
-
# Hardware Status
|
| 1234 |
gr.Markdown("### 🛠️ Hardware Status")
|
| 1235 |
ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
|
| 1236 |
cpu_metric = gr.Label(label="CPU Load", value="0%")
|
| 1237 |
|
| 1238 |
gr.Markdown("---")
|
| 1239 |
-
|
| 1240 |
-
# Model Control
|
| 1241 |
gr.Markdown("### 📡 Model Control")
|
| 1242 |
repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
|
| 1243 |
quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
|
|
@@ -1249,26 +815,6 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1249 |
boot_status = gr.Markdown("Status: `STANDBY`")
|
| 1250 |
|
| 1251 |
gr.Markdown("---")
|
| 1252 |
-
|
| 1253 |
-
# Custom Parameters
|
| 1254 |
-
gr.Markdown("### ⚙️ Custom Parameters")
|
| 1255 |
-
temperature_slider = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
|
| 1256 |
-
top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")
|
| 1257 |
-
top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-K")
|
| 1258 |
-
repeat_penalty_slider = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repeat Penalty")
|
| 1259 |
-
|
| 1260 |
-
gr.Markdown("---")
|
| 1261 |
-
|
| 1262 |
-
# Token Purchases
|
| 1263 |
-
gr.Markdown("### 💎 Token Upgrades")
|
| 1264 |
-
with gr.Row():
|
| 1265 |
-
batch_upgrade_btn = gr.Button("🚀 Batch x2", size="sm", variant="secondary")
|
| 1266 |
-
token_upgrade_btn = gr.Button("📈 +1K Tokens", size="sm", variant="secondary")
|
| 1267 |
-
purchase_status = gr.Markdown("Ready to upgrade!")
|
| 1268 |
-
|
| 1269 |
-
gr.Markdown("---")
|
| 1270 |
-
|
| 1271 |
-
# Ghost Cache
|
| 1272 |
gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
|
| 1273 |
ghost_buffer = gr.Textbox(
|
| 1274 |
label="Background Context",
|
|
@@ -1282,7 +828,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1282 |
log_output = gr.Code(
|
| 1283 |
label="Kernel Logs",
|
| 1284 |
language="shell",
|
| 1285 |
-
value="[INIT]
|
| 1286 |
lines=5
|
| 1287 |
)
|
| 1288 |
|
|
@@ -1290,11 +836,9 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1290 |
def update_stats():
|
| 1291 |
try:
|
| 1292 |
m = ResourceMonitor.get_metrics()
|
| 1293 |
-
|
| 1294 |
-
return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%", f"{balance}"
|
| 1295 |
except Exception as e:
|
| 1296 |
logger.error(f"Stats update error: {e}")
|
| 1297 |
-
return "Error", "Error", "0.00"
|
| 1298 |
return "Error", "Error"
|
| 1299 |
|
| 1300 |
def on_scan(repo):
|
|
@@ -1320,78 +864,37 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1320 |
return
|
| 1321 |
|
| 1322 |
yield "⚙️ System: Initiating boot sequence...", gr.update()
|
| 1323 |
-
time.sleep(0.5)
|
| 1324 |
|
| 1325 |
-
result = kernel.boot_kernel(repo, file
|
| 1326 |
yield result, gr.update()
|
| 1327 |
|
| 1328 |
except Exception as e:
|
| 1329 |
logger.error(f"Boot UI error: {e}")
|
| 1330 |
yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
|
| 1331 |
-
|
| 1332 |
-
def on_batch_upgrade():
|
| 1333 |
-
success, msg = token_manager.purchase_batch_upgrade(session_id)
|
| 1334 |
-
balance = token_manager.get_balance(session_id)
|
| 1335 |
-
return msg, f"{balance}"
|
| 1336 |
-
|
| 1337 |
-
def on_token_upgrade():
|
| 1338 |
-
success, msg = token_manager.purchase_token_upgrade(session_id, 1000)
|
| 1339 |
-
balance = token_manager.get_balance(session_id)
|
| 1340 |
-
return msg, f"{balance}"
|
| 1341 |
-
|
| 1342 |
-
def on_end_session():
|
| 1343 |
-
msg = token_manager.end_session(session_id)
|
| 1344 |
-
return msg
|
| 1345 |
-
|
| 1346 |
-
def update_custom_params(temp, top_p, top_k, repeat_pen):
|
| 1347 |
-
kernel.custom_params["temperature"] = temp
|
| 1348 |
-
kernel.custom_params["top_p"] = top_p
|
| 1349 |
-
kernel.custom_params["top_k"] = int(top_k)
|
| 1350 |
-
kernel.custom_params["repeat_penalty"] = repeat_pen
|
| 1351 |
-
return "✅ Parameters updated!"
|
| 1352 |
|
| 1353 |
-
# Timer for periodic stats updates
|
| 1354 |
timer = gr.Timer(value=2)
|
| 1355 |
-
timer.tick(update_stats, None, [ram_metric, cpu_metric
|
| 1356 |
|
| 1357 |
# Event handlers
|
| 1358 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
| 1359 |
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
|
| 1360 |
|
| 1361 |
-
# Token purchases
|
| 1362 |
-
batch_upgrade_btn.click(on_batch_upgrade, None, [purchase_status, token_balance])
|
| 1363 |
-
token_upgrade_btn.click(on_token_upgrade, None, [purchase_status, token_balance])
|
| 1364 |
-
end_session_btn.click(on_end_session, None, [session_status])
|
| 1365 |
-
|
| 1366 |
-
# Custom parameter updates
|
| 1367 |
-
temperature_slider.change(update_custom_params,
|
| 1368 |
-
[temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
|
| 1369 |
-
[purchase_status])
|
| 1370 |
-
top_p_slider.change(update_custom_params,
|
| 1371 |
-
[temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
|
| 1372 |
-
[purchase_status])
|
| 1373 |
-
top_k_slider.change(update_custom_params,
|
| 1374 |
-
[temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
|
| 1375 |
-
[purchase_status])
|
| 1376 |
-
repeat_penalty_slider.change(update_custom_params,
|
| 1377 |
-
[temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
|
| 1378 |
-
[purchase_status])
|
| 1379 |
-
|
| 1380 |
-
# Ghost cache
|
| 1381 |
stitch_btn.click(
|
| 1382 |
lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
|
| 1383 |
[ghost_buffer],
|
| 1384 |
[stitch_status]
|
| 1385 |
)
|
| 1386 |
|
| 1387 |
-
# Keyboard input preprocessing
|
| 1388 |
user_input.change(
|
| 1389 |
lambda x: kernel.preprocess_input(x),
|
| 1390 |
[user_input],
|
| 1391 |
None
|
| 1392 |
)
|
| 1393 |
|
| 1394 |
-
# Auto-boot enabled inference
|
| 1395 |
inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
|
| 1396 |
user_input.submit(kernel.inference_generator, inference_args, [chat_box])
|
| 1397 |
send_btn.click(kernel.inference_generator, inference_args, [chat_box])
|
|
|
|
| 31 |
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
|
| 32 |
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# --- SPEED OPTIMIZATION CONFIG ---
|
| 35 |
+
FLASH_ATTENTION = True # Enable Flash Attention 2
|
| 36 |
+
KV_CACHE_QUANTIZATION = True # Quantize KV cache (4-bit)
|
| 37 |
+
CONTINUOUS_BATCHING = True # Enable continuous batching
|
| 38 |
+
SPECULATIVE_DECODE = False # Disabled for CPU (requires draft model)
|
| 39 |
+
MLOCK_MODEL = False # Disabled: prevents swapping but uses more RAM
|
| 40 |
+
USE_MMAP = True # Memory-mapped file loading
|
| 41 |
+
OFFLOAD_KQV = False # CPU-only, no offload needed
|
| 42 |
+
OPTIMAL_THREADS = max(1, psutil.cpu_count(logical=False) - 1) # Physical cores - 1
|
| 43 |
+
ROPE_SCALING = 1.0 # RoPE frequency scaling
|
| 44 |
+
NUMA_OPTIMIZE = True # NUMA-aware memory allocation
|
| 45 |
+
AGGRESSIVE_GC = True # Aggressive garbage collection
|
| 46 |
+
|
| 47 |
+
# Quantization detection and optimization mapping
|
| 48 |
QUANT_OPTIMIZATIONS = {
|
| 49 |
+
"BF16": {"batch_multiplier": 0.3, "ctx_size": 8192, "threads_boost": 1.2},
|
| 50 |
+
"F16": {"batch_multiplier": 0.4, "ctx_size": 8192, "threads_boost": 1.2},
|
| 51 |
+
"Q8_0": {"batch_multiplier": 0.7, "ctx_size": 8192, "threads_boost": 1.0},
|
| 52 |
+
"Q6_K": {"batch_multiplier": 0.8, "ctx_size": 8192, "threads_boost": 1.0},
|
| 53 |
+
"Q5_K_M": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
|
| 54 |
+
"Q5_K_S": {"batch_multiplier": 1.0, "ctx_size": 12288, "threads_boost": 0.9},
|
| 55 |
+
"Q4_K_M": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
|
| 56 |
+
"Q4_K_S": {"batch_multiplier": 1.3, "ctx_size": 16384, "threads_boost": 0.8},
|
| 57 |
+
"Q4_0": {"batch_multiplier": 1.4, "ctx_size": 16384, "threads_boost": 0.8},
|
| 58 |
+
"Q3_K_M": {"batch_multiplier": 1.6, "ctx_size": 20480, "threads_boost": 0.7},
|
| 59 |
+
"Q2_K": {"batch_multiplier": 2.0, "ctx_size": 24576, "threads_boost": 0.7},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
|
|
|
| 207 |
logger.error(f"[WRECKER] Failed: {e}")
|
| 208 |
return False
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# Global cache manager
|
| 211 |
model_cache = ModelCacheManager()
|
| 212 |
|
|
|
|
| 272 |
self.api = HfApi(token=HF_TOKEN)
|
| 273 |
self.telemetry = TelemetryManager(self.api)
|
| 274 |
self.llm: Optional[Llama] = None
|
| 275 |
+
self.active_model_info = {"repo": "", "file": ""}
|
| 276 |
self.kernel_lock = threading.Lock()
|
| 277 |
self.is_prefilling = False
|
| 278 |
self.perf_stats = {
|
|
|
|
| 282 |
"peak_tps": 0.0,
|
| 283 |
"cache_hits": 0
|
| 284 |
}
|
| 285 |
+
self.prompt_cache = {} # Cache for repeated prompts
|
| 286 |
self.last_activity = time.time()
|
| 287 |
+
self.idle_timeout = 20 # 20 seconds idle timeout
|
| 288 |
self.auto_cleanup_thread = None
|
| 289 |
self.start_idle_monitor()
|
| 290 |
|
|
|
|
| 293 |
self.typing_timer = None
|
| 294 |
self.preprocessed_tokens = None
|
| 295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
def detect_quantization(self, filename: str) -> dict:
|
| 297 |
"""Detect quantization method from filename and return optimizations"""
|
| 298 |
filename_upper = filename.upper()
|
|
|
|
| 389 |
logger.error(f"Scan error: {e}")
|
| 390 |
return []
|
| 391 |
|
| 392 |
+
def boot_kernel(self, repo: str, filename: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
"""HYPER-OPTIMIZED Boot kernel with cache manager and old model wrecker"""
|
| 394 |
try:
|
| 395 |
if not repo or not filename:
|
|
|
|
| 547 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 548 |
return "⚡ Primed"
|
| 549 |
|
| 550 |
+
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
|
| 551 |
# Update activity timestamp
|
| 552 |
self.update_activity()
|
| 553 |
|
|
|
|
| 600 |
first_token_time = None
|
| 601 |
|
| 602 |
try:
|
| 603 |
+
# HYPER-OPTIMIZED INFERENCE SETTINGS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
stream = self.llm(
|
| 605 |
formatted_prompt,
|
| 606 |
+
max_tokens=2048, # Increased output length
|
| 607 |
stop=["User:", "<|eot_id|>", "\n\n"],
|
| 608 |
stream=True,
|
| 609 |
+
temperature=0.7, # Balanced creativity
|
| 610 |
+
top_p=0.95, # Nucleus sampling
|
| 611 |
+
top_k=40, # Top-K sampling
|
| 612 |
+
repeat_penalty=1.1, # Prevent repetition
|
| 613 |
+
frequency_penalty=0.0, # No frequency penalty
|
| 614 |
+
presence_penalty=0.0, # No presence penalty
|
| 615 |
+
tfs_z=1.0, # Tail-free sampling
|
| 616 |
+
typical_p=1.0, # Typical sampling
|
| 617 |
+
mirostat_mode=2, # Mirostat v2 (perplexity control)
|
| 618 |
+
mirostat_tau=5.0, # Target perplexity
|
| 619 |
+
mirostat_eta=0.1, # Learning rate
|
| 620 |
)
|
| 621 |
|
| 622 |
for chunk in stream:
|
|
|
|
| 636 |
if tps > self.perf_stats["peak_tps"]:
|
| 637 |
self.perf_stats["peak_tps"] = tps
|
| 638 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
# Update history with streaming content + performance metrics
|
| 640 |
+
history[-1]["content"] = f"{response_text}\n\n`⚡ {tps} t/s | 🎯 Peak: {self.perf_stats['peak_tps']:.1f} t/s | 💾 Cache: {self.perf_stats['cache_hits']}`"
|
|
|
|
| 641 |
yield history
|
| 642 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
# Update global performance stats
|
| 644 |
self.perf_stats["total_tokens"] += tokens_count
|
| 645 |
self.perf_stats["total_time"] += elapsed
|
|
|
|
| 763 |
# --- UI INTERFACE ---
|
| 764 |
kernel = ZeroEngine()
|
| 765 |
|
| 766 |
+
with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
| 767 |
+
gr.HTML("""
|
| 768 |
+
<div style='text-align: center; padding: 30px; border-radius: 24px;
|
| 769 |
+
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
|
| 770 |
+
margin-bottom: 30px; box-shadow: 0 10px 30px rgba(0,0,0,0.3);'>
|
| 771 |
+
<h1 style='margin: 0; font-size: 3em; background: linear-gradient(90deg, #00d4ff, #7b2ff7);
|
| 772 |
+
-webkit-background-clip: text; -webkit-text-fill-color: transparent;
|
| 773 |
+
font-family: Consolas, monospace;'>
|
| 774 |
+
🛰️ ZEROENGINE V0.1
|
| 775 |
+
</h1>
|
| 776 |
+
<p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
|
| 777 |
+
Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
|
| 778 |
+
</p>
|
| 779 |
+
</div>
|
| 780 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
|
| 782 |
with gr.Row():
|
| 783 |
with gr.Column(scale=8):
|
| 784 |
chat_box = gr.Chatbot(
|
| 785 |
label="Main Engine Feedback",
|
| 786 |
+
height=650,
|
| 787 |
show_label=False,
|
| 788 |
autoscroll=True,
|
| 789 |
container=True
|
|
|
|
| 798 |
)
|
| 799 |
send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
|
| 800 |
|
| 801 |
+
with gr.Column(scale=3):
|
|
|
|
| 802 |
gr.Markdown("### 🛠️ Hardware Status")
|
| 803 |
ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
|
| 804 |
cpu_metric = gr.Label(label="CPU Load", value="0%")
|
| 805 |
|
| 806 |
gr.Markdown("---")
|
|
|
|
|
|
|
| 807 |
gr.Markdown("### 📡 Model Control")
|
| 808 |
repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
|
| 809 |
quant_dropdown = gr.Dropdown(label="Available Quants", choices=[], interactive=True)
|
|
|
|
| 815 |
boot_status = gr.Markdown("Status: `STANDBY`")
|
| 816 |
|
| 817 |
gr.Markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
gr.Markdown("### 👻 Ghost Cache (Pre-Context)")
|
| 819 |
ghost_buffer = gr.Textbox(
|
| 820 |
label="Background Context",
|
|
|
|
| 828 |
log_output = gr.Code(
|
| 829 |
label="Kernel Logs",
|
| 830 |
language="shell",
|
| 831 |
+
value="[INIT] System Ready.",
|
| 832 |
lines=5
|
| 833 |
)
|
| 834 |
|
|
|
|
| 836 |
def update_stats():
|
| 837 |
try:
|
| 838 |
m = ResourceMonitor.get_metrics()
|
| 839 |
+
return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"
|
|
|
|
| 840 |
except Exception as e:
|
| 841 |
logger.error(f"Stats update error: {e}")
|
|
|
|
| 842 |
return "Error", "Error"
|
| 843 |
|
| 844 |
def on_scan(repo):
|
|
|
|
| 864 |
return
|
| 865 |
|
| 866 |
yield "⚙️ System: Initiating boot sequence...", gr.update()
|
| 867 |
+
time.sleep(0.5) # Small delay for UI feedback
|
| 868 |
|
| 869 |
+
result = kernel.boot_kernel(repo, file)
|
| 870 |
yield result, gr.update()
|
| 871 |
|
| 872 |
except Exception as e:
|
| 873 |
logger.error(f"Boot UI error: {e}")
|
| 874 |
yield f"🔴 BOOT ERROR: {str(e)}", gr.update()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
+
# Timer for periodic stats updates
|
| 877 |
timer = gr.Timer(value=2)
|
| 878 |
+
timer.tick(update_stats, None, [ram_metric, cpu_metric])
|
| 879 |
|
| 880 |
# Event handlers
|
| 881 |
scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
|
| 882 |
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])
|
| 883 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
stitch_btn.click(
|
| 885 |
lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
|
| 886 |
[ghost_buffer],
|
| 887 |
[stitch_status]
|
| 888 |
)
|
| 889 |
|
| 890 |
+
# Keyboard input preprocessing (tokenize while typing)
|
| 891 |
user_input.change(
|
| 892 |
lambda x: kernel.preprocess_input(x),
|
| 893 |
[user_input],
|
| 894 |
None
|
| 895 |
)
|
| 896 |
|
| 897 |
+
# Auto-boot enabled inference - passes repo and quant for auto-boot
|
| 898 |
inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
|
| 899 |
user_input.submit(kernel.inference_generator, inference_args, [chat_box])
|
| 900 |
send_btn.click(kernel.inference_generator, inference_args, [chat_box])
|