Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -245,7 +245,7 @@ class TokenManager:
|
|
| 245 |
self.user_tokens[username] = {
|
| 246 |
"balance": float('inf'),
|
| 247 |
"start_time": time.time(),
|
| 248 |
-
"purchases": {"
|
| 249 |
"total_spent": 0.0,
|
| 250 |
"is_owner": True,
|
| 251 |
"username": username
|
|
@@ -255,7 +255,7 @@ class TokenManager:
|
|
| 255 |
self.user_tokens[username] = {
|
| 256 |
"balance": MONTHLY_TOKEN_CREDITS,
|
| 257 |
"start_time": time.time(),
|
| 258 |
-
"purchases": {"
|
| 259 |
"total_spent": 0.0,
|
| 260 |
"is_owner": False,
|
| 261 |
"username": username,
|
|
@@ -328,62 +328,93 @@ class TokenManager:
|
|
| 328 |
last_reset = self.user_tokens[username].get("last_reset", time.time())
|
| 329 |
time_until_reset = 30 * 24 * 60 * 60 - (time.time() - last_reset)
|
| 330 |
days_left = int(time_until_reset / (24 * 60 * 60))
|
| 331 |
-
return False, f"
|
| 332 |
|
| 333 |
-
return True, f"
|
| 334 |
|
| 335 |
-
def purchase_batch_upgrade(self, username: str) -> tuple:
|
| 336 |
-
"""Purchase batch size upgrade
|
| 337 |
if not username:
|
| 338 |
-
return False, "
|
| 339 |
|
| 340 |
self.initialize_user(username)
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
# Owner gets free upgrades
|
| 343 |
if self.user_tokens[username].get("is_owner", False):
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
return True, f"
|
| 349 |
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
cost = BATCH_UPGRADE_BASE_COST * (2 ** upgrade_level)
|
| 353 |
|
| 354 |
if self.user_tokens[username]["balance"] >= cost:
|
| 355 |
self.user_tokens[username]["balance"] -= cost
|
| 356 |
-
self.user_tokens[username]["purchases"]["
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
|
|
|
| 360 |
else:
|
| 361 |
-
return False, f"
|
| 362 |
|
| 363 |
-
def purchase_token_upgrade(self, username: str,
|
| 364 |
-
"""Purchase
|
| 365 |
if not username:
|
| 366 |
-
return False, "
|
| 367 |
|
| 368 |
self.initialize_user(username)
|
| 369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
# Owner gets free upgrades
|
| 371 |
if self.user_tokens[username].get("is_owner", False):
|
| 372 |
-
self.user_tokens[username]["purchases"]["
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
|
|
|
| 376 |
|
| 377 |
-
|
|
|
|
| 378 |
|
| 379 |
if self.user_tokens[username]["balance"] >= cost:
|
| 380 |
self.user_tokens[username]["balance"] -= cost
|
| 381 |
-
self.user_tokens[username]["purchases"]["
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
|
|
|
| 385 |
else:
|
| 386 |
-
return False, f"
|
| 387 |
|
| 388 |
def get_balance(self, username: str) -> float:
|
| 389 |
"""Get user's current token balance"""
|
|
@@ -418,7 +449,7 @@ class TokenManager:
|
|
| 418 |
stats = self.user_tokens[username]
|
| 419 |
|
| 420 |
if stats.get("is_owner", False):
|
| 421 |
-
return f"
|
| 422 |
|
| 423 |
logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
|
| 424 |
return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
|
|
@@ -560,7 +591,7 @@ class ZeroEngine:
|
|
| 560 |
logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars in background...")
|
| 561 |
tokens = self.llm.tokenize(text.encode("utf-8"))
|
| 562 |
self.preprocessed_tokens = tokens
|
| 563 |
-
logger.info(f"[PREPROCESS]
|
| 564 |
except Exception as e:
|
| 565 |
logger.error(f"[PREPROCESS] Failed: {e}")
|
| 566 |
self.preprocessed_tokens = None
|
|
@@ -637,7 +668,7 @@ class ZeroEngine:
|
|
| 637 |
"""HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
|
| 638 |
try:
|
| 639 |
if not repo or not filename:
|
| 640 |
-
return "
|
| 641 |
|
| 642 |
logger.info(f"[BOOT] Starting download: {filename} from {repo}")
|
| 643 |
|
|
@@ -658,17 +689,17 @@ class ZeroEngine:
|
|
| 658 |
logger.info(f"[BOOT] Download complete: {path}")
|
| 659 |
except Exception as e:
|
| 660 |
logger.error(f"[BOOT] Download failed: {e}")
|
| 661 |
-
return f"
|
| 662 |
|
| 663 |
# Check if model is cached
|
| 664 |
is_cached = model_cache.is_cached(path)
|
| 665 |
-
cache_status = "
|
| 666 |
|
| 667 |
# Validate before loading
|
| 668 |
valid, msg = ResourceMonitor.validate_deployment(path)
|
| 669 |
if not valid:
|
| 670 |
logger.warning(f"[BOOT] Validation failed: {msg}")
|
| 671 |
-
return f"
|
| 672 |
|
| 673 |
logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
|
| 674 |
|
|
@@ -676,13 +707,13 @@ class ZeroEngine:
|
|
| 676 |
with self.kernel_lock:
|
| 677 |
# WRECK OLD MODEL
|
| 678 |
if self.llm:
|
| 679 |
-
logger.info("[BOOT]
|
| 680 |
try:
|
| 681 |
model_cache.wreck_old_model_cache()
|
| 682 |
del self.llm
|
| 683 |
self.llm = None
|
| 684 |
nuclear_ram_clear()
|
| 685 |
-
logger.info("[BOOT]
|
| 686 |
except Exception as e:
|
| 687 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 688 |
|
|
@@ -693,13 +724,13 @@ class ZeroEngine:
|
|
| 693 |
# CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
|
| 694 |
# Base calculation: use more RAM for batching on CPU
|
| 695 |
base_batch = int(512 * available_ram_gb / 8) # More aggressive base
|
| 696 |
-
optimal_batch =
|
| 697 |
|
| 698 |
-
# Apply user's batch
|
| 699 |
if session_id:
|
| 700 |
-
|
| 701 |
-
optimal_batch =
|
| 702 |
-
logger.info(f"[TOKEN] User batch
|
| 703 |
|
| 704 |
# CPU can handle larger batches with quantized models
|
| 705 |
optimal_batch = max(1024, min(8192, optimal_batch)) # 1024-8192 range for CPU
|
|
@@ -772,19 +803,19 @@ class ZeroEngine:
|
|
| 772 |
except:
|
| 773 |
pass
|
| 774 |
|
| 775 |
-
logger.info("[BOOT]
|
| 776 |
-
return f"
|
| 777 |
|
| 778 |
except Exception as e:
|
| 779 |
logger.error(f"[BOOT] Model loading failed: {e}")
|
| 780 |
self.llm = None
|
| 781 |
nuclear_ram_clear()
|
| 782 |
-
return f"
|
| 783 |
|
| 784 |
except Exception as e:
|
| 785 |
logger.error(f"[BOOT] Unexpected error: {e}")
|
| 786 |
nuclear_ram_clear()
|
| 787 |
-
return f"
|
| 788 |
|
| 789 |
def stitch_cache(self, ghost_text: str) -> str:
|
| 790 |
"""Prime KV cache with ghost context"""
|
|
@@ -794,6 +825,7 @@ class ZeroEngine:
|
|
| 794 |
def _bg_eval():
|
| 795 |
self.is_prefilling = True
|
| 796 |
try:
|
|
|
|
| 797 |
tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
|
| 798 |
self.llm.eval(tokens)
|
| 799 |
logger.info(f"Ghost cache primed: {len(tokens)} tokens")
|
|
@@ -804,7 +836,7 @@ class ZeroEngine:
|
|
| 804 |
self.is_prefilling = False
|
| 805 |
|
| 806 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 807 |
-
return "
|
| 808 |
|
| 809 |
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, profile: gr.OAuthProfile | None) -> Generator:
|
| 810 |
username = profile.username if profile else "anonymous"
|
|
@@ -817,7 +849,7 @@ class ZeroEngine:
|
|
| 817 |
# AUTO-BOOT: If model not loaded, auto-boot default model
|
| 818 |
if not self.llm:
|
| 819 |
logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
|
| 820 |
-
history.append({"role": "assistant", "content": "
|
| 821 |
yield history
|
| 822 |
|
| 823 |
# Use provided repo/quant or fallback to defaults
|
|
@@ -826,12 +858,12 @@ class ZeroEngine:
|
|
| 826 |
|
| 827 |
boot_result = self.boot_kernel(boot_repo, boot_quant)
|
| 828 |
|
| 829 |
-
if "
|
| 830 |
-
history[-1]["content"] = f"
|
| 831 |
yield history
|
| 832 |
return
|
| 833 |
|
| 834 |
-
history[-1]["content"] = f"
|
| 835 |
yield history
|
| 836 |
time.sleep(0.5) # Brief pause for user to see the message
|
| 837 |
|
|
@@ -839,7 +871,7 @@ class ZeroEngine:
|
|
| 839 |
cache_key = f"{ghost_context}:{prompt}"
|
| 840 |
if cache_key in self.prompt_cache:
|
| 841 |
self.perf_stats["cache_hits"] += 1
|
| 842 |
-
logger.info("
|
| 843 |
history.append({"role": "user", "content": prompt})
|
| 844 |
history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
|
| 845 |
yield history
|
|
@@ -863,7 +895,7 @@ class ZeroEngine:
|
|
| 863 |
# Get max tokens from user purchases
|
| 864 |
max_tokens = 2048
|
| 865 |
if username:
|
| 866 |
-
max_tokens = token_manager.get_purchases(username)["
|
| 867 |
|
| 868 |
# HYPER-OPTIMIZED CPU INFERENCE SETTINGS
|
| 869 |
stream = self.llm(
|
|
@@ -1125,12 +1157,14 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1125 |
|
| 1126 |
gr.Markdown("---")
|
| 1127 |
|
| 1128 |
-
#
|
| 1129 |
-
gr.Markdown("### π
|
|
|
|
|
|
|
| 1130 |
with gr.Row():
|
| 1131 |
-
batch_upgrade_btn = gr.Button("π Batch
|
| 1132 |
-
token_upgrade_btn = gr.Button("π
|
| 1133 |
-
purchase_status = gr.Markdown("Ready to
|
| 1134 |
|
| 1135 |
gr.Markdown("---")
|
| 1136 |
|
|
@@ -1185,13 +1219,13 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1185 |
result = kernel.boot_kernel(repo, file, username)
|
| 1186 |
return result
|
| 1187 |
|
| 1188 |
-
def on_batch_upgrade():
|
| 1189 |
-
success, msg = token_manager.purchase_batch_upgrade(session_id)
|
| 1190 |
balance = token_manager.get_balance(session_id)
|
| 1191 |
return msg, f"{balance}"
|
| 1192 |
|
| 1193 |
-
def on_token_upgrade():
|
| 1194 |
-
success, msg = token_manager.purchase_token_upgrade(session_id,
|
| 1195 |
balance = token_manager.get_balance(session_id)
|
| 1196 |
return msg, f"{balance}"
|
| 1197 |
|
|
@@ -1215,8 +1249,8 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
|
|
| 1215 |
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status])
|
| 1216 |
|
| 1217 |
# Token purchases
|
| 1218 |
-
batch_upgrade_btn.click(on_batch_upgrade,
|
| 1219 |
-
token_upgrade_btn.click(on_token_upgrade,
|
| 1220 |
end_session_btn.click(on_end_session, None, [session_status])
|
| 1221 |
|
| 1222 |
# Custom parameter updates
|
|
|
|
| 245 |
self.user_tokens[username] = {
|
| 246 |
"balance": float('inf'),
|
| 247 |
"start_time": time.time(),
|
| 248 |
+
"purchases": {"batch_size": 512, "max_tokens": 2048},
|
| 249 |
"total_spent": 0.0,
|
| 250 |
"is_owner": True,
|
| 251 |
"username": username
|
|
|
|
| 255 |
self.user_tokens[username] = {
|
| 256 |
"balance": MONTHLY_TOKEN_CREDITS,
|
| 257 |
"start_time": time.time(),
|
| 258 |
+
"purchases": {"batch_size": 512, "max_tokens": 2048},
|
| 259 |
"total_spent": 0.0,
|
| 260 |
"is_owner": False,
|
| 261 |
"username": username,
|
|
|
|
| 328 |
last_reset = self.user_tokens[username].get("last_reset", time.time())
|
| 329 |
time_until_reset = 30 * 24 * 60 * 60 - (time.time() - last_reset)
|
| 330 |
days_left = int(time_until_reset / (24 * 60 * 60))
|
| 331 |
+
return False, f" Out of tokens! Resets in {days_left} days. Current balance: 0.00"
|
| 332 |
|
| 333 |
+
return True, f" Access granted. Balance: {balance:.2f} tokens"
|
| 334 |
|
| 335 |
+
def purchase_batch_upgrade(self, username: str, batch_size: int = 512) -> tuple:
    """Set the user's batch size, charging non-owners for the change.

    The requested size is snapped to the nearest power of two (a tie goes
    to the smaller power) and kept within 128-8192.  Owners are not
    charged; other users pay 0.01 tokens per 1000 units of the final
    batch size, deducted from their balance.

    Args:
        username: Account to update.  A falsy value is rejected.
        batch_size: Requested batch size.  Zero or negative values fall
            back to 512 before rounding.

    Returns:
        A ``(success, message)`` tuple; ``success`` is False when no
        username was given or the balance cannot cover the cost.
    """
    if not username:
        return False, " Please login first"

    # presumably guarantees self.user_tokens[username] exists -- defined
    # outside this block, confirm there.
    self.initialize_user(username)

    if batch_size <= 0:
        batch_size = 512

    # 8192 is the largest batch the UI input and the boot-time clamp in
    # this file accept.  Without the cap an oversized request would be
    # stored, and billed, at a size that is never actually applied.
    min_batch, max_batch = 128, 8192

    def round_to_power_of_2(n):
        """Return the power of two nearest to n, limited to the valid range."""
        if n <= min_batch:
            return min_batch
        if n >= max_batch:
            return max_batch
        # Smallest power of two that is >= n.
        power = min_batch
        while power < n:
            power *= 2
        # Prefer the previous power when it is at least as close.
        prev_power = power // 2
        if n - prev_power <= power - n:
            return prev_power
        return power

    rounded_batch = round_to_power_of_2(batch_size)
    account = self.user_tokens[username]

    # Owner gets free upgrades
    if account.get("is_owner", False):
        account["purchases"]["batch_size"] = rounded_batch
        logger.info(f"[TOKEN] OWNER set batch size to: {rounded_batch} (rounded from {batch_size})")
        if rounded_batch != batch_size:
            return True, f" Batch size set to {rounded_batch} (rounded from {batch_size})!"
        return True, f" Batch size set to {rounded_batch}!"

    # Cost is based on the final (rounded and capped) batch size.
    cost = (rounded_batch / 1000) * 0.01  # 0.01 tokens per 1000 batch size

    if account["balance"] >= cost:
        account["balance"] -= cost
        account["purchases"]["batch_size"] = rounded_batch
        logger.info(f"[TOKEN] Batch size set to {rounded_batch} (rounded from {batch_size}) | Cost: {cost:.5f}")
        if rounded_batch != batch_size:
            return True, f" Batch size set to {rounded_batch} (rounded from {batch_size})! (-{cost:.5f} tokens)"
        return True, f" Batch size set to {rounded_batch}! (-{cost:.5f} tokens)"
    else:
        return False, f" Insufficient tokens! Need {cost:.5f}, have {account['balance']:.2f}"
|
| 382 |
|
| 383 |
+
def purchase_token_upgrade(self, username: str, max_tokens: int = 2048) -> tuple:
    """Set the user's max-tokens limit, charging non-owners for the change.

    The requested value is rounded to the nearest multiple of 256 and then
    clamped to 256-8192.  Owners are not charged; other users pay
    ``TOKEN_UPGRADE_COST_PER_1K`` per 1000 tokens of the final value.

    Args:
        username: Account to update.  A falsy value is rejected.
        max_tokens: Requested limit.  Zero or negative values fall back
            to 2048 before rounding.

    Returns:
        A ``(success, message)`` tuple; ``success`` is False when no
        username was given or the balance cannot cover the cost.
    """
    if not username:
        return False, " Please login first"

    # presumably guarantees self.user_tokens[username] exists -- defined
    # outside this block, confirm there.
    self.initialize_user(username)

    # Non-positive requests are replaced by the default before rounding.
    requested = max_tokens if max_tokens > 0 else 2048

    # Adding half a step before the floor division rounds to the nearest
    # multiple of 256; the result is then clamped to 256-8192.
    granted = ((requested + 128) // 256) * 256
    granted = max(256, min(8192, granted))
    was_adjusted = granted != requested

    account = self.user_tokens[username]

    # Owner path: apply the setting without touching the balance.
    if account.get("is_owner", False):
        account["purchases"]["max_tokens"] = granted
        logger.info(f"[TOKEN] OWNER set max tokens to: {granted} (rounded from {requested})")
        if was_adjusted:
            return True, f" Max tokens set to {granted} (rounded from {requested})!"
        return True, f" Max tokens set to {granted}!"

    # Paying path: price scales with the final value.
    price = (granted / 1000) * TOKEN_UPGRADE_COST_PER_1K

    if not (account["balance"] >= price):
        return False, f" Insufficient tokens! Need {price:.5f}, have {account['balance']:.2f}"

    account["balance"] -= price
    account["purchases"]["max_tokens"] = granted
    logger.info(f"[TOKEN] Max tokens set to {granted} (rounded from {requested}) | Cost: {price:.5f}")
    if was_adjusted:
        return True, f" Max tokens set to {granted} (rounded from {requested})! (-{price:.5f} tokens)"
    return True, f" Max tokens set to {granted}! (-{price:.5f} tokens)"
|
| 418 |
|
| 419 |
def get_balance(self, username: str) -> float:
|
| 420 |
"""Get user's current token balance"""
|
|
|
|
| 449 |
stats = self.user_tokens[username]
|
| 450 |
|
| 451 |
if stats.get("is_owner", False):
|
| 452 |
+
return f" OWNER session ended. Welcome back anytime, {stats['username']}!"
|
| 453 |
|
| 454 |
logger.info(f"[TOKEN] Session ended: Spent {stats['total_spent']:.2f}, Remaining {stats['balance']:.2f}")
|
| 455 |
return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
|
|
|
|
| 591 |
logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars in background...")
|
| 592 |
tokens = self.llm.tokenize(text.encode("utf-8"))
|
| 593 |
self.preprocessed_tokens = tokens
|
| 594 |
+
logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
|
| 595 |
except Exception as e:
|
| 596 |
logger.error(f"[PREPROCESS] Failed: {e}")
|
| 597 |
self.preprocessed_tokens = None
|
|
|
|
| 668 |
"""HYPER-OPTIMIZED Boot kernel with format auto-detection and Gemma fixes"""
|
| 669 |
try:
|
| 670 |
if not repo or not filename:
|
| 671 |
+
return " ERROR: Repository or filename missing"
|
| 672 |
|
| 673 |
logger.info(f"[BOOT] Starting download: {filename} from {repo}")
|
| 674 |
|
|
|
|
| 689 |
logger.info(f"[BOOT] Download complete: {path}")
|
| 690 |
except Exception as e:
|
| 691 |
logger.error(f"[BOOT] Download failed: {e}")
|
| 692 |
+
return f" DOWNLOAD FAILED: {str(e)}"
|
| 693 |
|
| 694 |
# Check if model is cached
|
| 695 |
is_cached = model_cache.is_cached(path)
|
| 696 |
+
cache_status = " CACHED" if is_cached else " NEW"
|
| 697 |
|
| 698 |
# Validate before loading
|
| 699 |
valid, msg = ResourceMonitor.validate_deployment(path)
|
| 700 |
if not valid:
|
| 701 |
logger.warning(f"[BOOT] Validation failed: {msg}")
|
| 702 |
+
return f" VALIDATION FAILED: {msg}"
|
| 703 |
|
| 704 |
logger.info(f"[BOOT] Validation passed ({cache_status}), applying {quant_config['type']} optimizations for {model_format.upper()}...")
|
| 705 |
|
|
|
|
| 707 |
with self.kernel_lock:
|
| 708 |
# WRECK OLD MODEL
|
| 709 |
if self.llm:
|
| 710 |
+
logger.info("[BOOT] WRECKING old model...")
|
| 711 |
try:
|
| 712 |
model_cache.wreck_old_model_cache()
|
| 713 |
del self.llm
|
| 714 |
self.llm = None
|
| 715 |
nuclear_ram_clear()
|
| 716 |
+
logger.info("[BOOT] Old model DESTROYED")
|
| 717 |
except Exception as e:
|
| 718 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 719 |
|
|
|
|
| 724 |
# CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
|
| 725 |
# Base calculation: use more RAM for batching on CPU
|
| 726 |
base_batch = int(512 * available_ram_gb / 8) # More aggressive base
|
| 727 |
+
optimal_batch = base_batch
|
| 728 |
|
| 729 |
+
# Apply user's batch size from token purchases
|
| 730 |
if session_id:
|
| 731 |
+
user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
|
| 732 |
+
optimal_batch = user_batch_size
|
| 733 |
+
logger.info(f"[TOKEN] User batch size: {user_batch_size}")
|
| 734 |
|
| 735 |
# CPU can handle larger batches with quantized models
|
| 736 |
optimal_batch = max(1024, min(8192, optimal_batch)) # 1024-8192 range for CPU
|
|
|
|
| 803 |
except:
|
| 804 |
pass
|
| 805 |
|
| 806 |
+
logger.info("[BOOT] CPU-OPTIMIZED MODEL READY!")
|
| 807 |
+
return f" {model_format.upper()} {quant_config['type']} {cache_status} | CPU:{optimal_threads}T | B:{optimal_batch} | Ctx:{optimal_ctx}"
|
| 808 |
|
| 809 |
except Exception as e:
|
| 810 |
logger.error(f"[BOOT] Model loading failed: {e}")
|
| 811 |
self.llm = None
|
| 812 |
nuclear_ram_clear()
|
| 813 |
+
return f" LOAD FAILED: {str(e)}"
|
| 814 |
|
| 815 |
except Exception as e:
|
| 816 |
logger.error(f"[BOOT] Unexpected error: {e}")
|
| 817 |
nuclear_ram_clear()
|
| 818 |
+
return f" BOOT FAILURE: {str(e)}"
|
| 819 |
|
| 820 |
def stitch_cache(self, ghost_text: str) -> str:
|
| 821 |
"""Prime KV cache with ghost context"""
|
|
|
|
| 825 |
def _bg_eval():
|
| 826 |
self.is_prefilling = True
|
| 827 |
try:
|
| 828 |
+
logger.info(f"[PREPROCESS] Tokenizing {len(ghost_text)} chars in background...")
|
| 829 |
tokens = self.llm.tokenize(ghost_text.encode("utf-8"))
|
| 830 |
self.llm.eval(tokens)
|
| 831 |
logger.info(f"Ghost cache primed: {len(tokens)} tokens")
|
|
|
|
| 836 |
self.is_prefilling = False
|
| 837 |
|
| 838 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 839 |
+
return " Primed"
|
| 840 |
|
| 841 |
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str, profile: gr.OAuthProfile | None) -> Generator:
|
| 842 |
username = profile.username if profile else "anonymous"
|
|
|
|
| 849 |
# AUTO-BOOT: If model not loaded, auto-boot default model
|
| 850 |
if not self.llm:
|
| 851 |
logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
|
| 852 |
+
history.append({"role": "assistant", "content": " Auto-booting model, please wait..."})
|
| 853 |
yield history
|
| 854 |
|
| 855 |
# Use provided repo/quant or fallback to defaults
|
|
|
|
| 858 |
|
| 859 |
boot_result = self.boot_kernel(boot_repo, boot_quant)
|
| 860 |
|
| 861 |
+
if " " in boot_result or "FAILED" in boot_result:
|
| 862 |
+
history[-1]["content"] = f" Auto-boot failed: {boot_result}\n\nPlease manually SCAN and BOOT a model."
|
| 863 |
yield history
|
| 864 |
return
|
| 865 |
|
| 866 |
+
history[-1]["content"] = f" {boot_result}\n\nProcessing your request..."
|
| 867 |
yield history
|
| 868 |
time.sleep(0.5) # Brief pause for user to see the message
|
| 869 |
|
|
|
|
| 871 |
cache_key = f"{ghost_context}:{prompt}"
|
| 872 |
if cache_key in self.prompt_cache:
|
| 873 |
self.perf_stats["cache_hits"] += 1
|
| 874 |
+
logger.info(" CACHE HIT - Instant response!")
|
| 875 |
history.append({"role": "user", "content": prompt})
|
| 876 |
history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
|
| 877 |
yield history
|
|
|
|
| 895 |
# Get max tokens from user purchases
|
| 896 |
max_tokens = 2048
|
| 897 |
if username:
|
| 898 |
+
max_tokens = token_manager.get_purchases(username)["max_tokens"]
|
| 899 |
|
| 900 |
# HYPER-OPTIMIZED CPU INFERENCE SETTINGS
|
| 901 |
stream = self.llm(
|
|
|
|
| 1157 |
|
| 1158 |
gr.Markdown("---")
|
| 1159 |
|
| 1160 |
+
# Performance Settings
|
| 1161 |
+
gr.Markdown("### π Performance Settings")
|
| 1162 |
+
batch_size_input = gr.Number(label="Batch Size", value=512, minimum=128, maximum=8192, step=128)
|
| 1163 |
+
max_tokens_input = gr.Number(label="Max Tokens", value=2048, minimum=512, maximum=8192, step=256)
|
| 1164 |
with gr.Row():
|
| 1165 |
+
batch_upgrade_btn = gr.Button("π Set Batch Size", size="sm", variant="secondary")
|
| 1166 |
+
token_upgrade_btn = gr.Button("π Set Max Tokens", size="sm", variant="secondary")
|
| 1167 |
+
purchase_status = gr.Markdown("Ready to configure!")
|
| 1168 |
|
| 1169 |
gr.Markdown("---")
|
| 1170 |
|
|
|
|
| 1219 |
result = kernel.boot_kernel(repo, file, username)
|
| 1220 |
return result
|
| 1221 |
|
| 1222 |
+
def on_batch_upgrade(batch_size):
    """Handle the "Set Batch Size" button click.

    Args:
        batch_size: Value of the batch-size number field.  It may be
            ``None`` when the field is empty.

    Returns:
        ``(status_message, balance_text)`` for the purchase-status and
        token-balance outputs.
    """
    # An empty or non-finite field cannot be converted to int; report it
    # instead of letting the handler raise, and do not attempt a purchase.
    try:
        requested = int(batch_size)
    except (TypeError, ValueError, OverflowError):
        return "Please enter a valid batch size", f"{token_manager.get_balance(session_id)}"

    _, msg = token_manager.purchase_batch_upgrade(session_id, requested)
    balance = token_manager.get_balance(session_id)
    return msg, f"{balance}"
|
| 1226 |
|
| 1227 |
+
def on_token_upgrade(max_tokens):
    """Handle the "Set Max Tokens" button click.

    Args:
        max_tokens: Value of the max-tokens number field.  It may be
            ``None`` when the field is empty.

    Returns:
        ``(status_message, balance_text)`` for the purchase-status and
        token-balance outputs.
    """
    # An empty or non-finite field cannot be converted to int; report it
    # instead of letting the handler raise, and do not attempt a purchase.
    try:
        requested = int(max_tokens)
    except (TypeError, ValueError, OverflowError):
        return "Please enter a valid max tokens value", f"{token_manager.get_balance(session_id)}"

    _, msg = token_manager.purchase_token_upgrade(session_id, requested)
    balance = token_manager.get_balance(session_id)
    return msg, f"{balance}"
|
| 1231 |
|
|
|
|
| 1249 |
boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status])
|
| 1250 |
|
| 1251 |
# Token purchases
|
| 1252 |
+
batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
|
| 1253 |
+
token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
|
| 1254 |
end_session_btn.click(on_end_session, None, [session_status])
|
| 1255 |
|
| 1256 |
# Custom parameter updates
|