turtle170 committed on
Commit
78214c4
·
verified ·
1 Parent(s): d82c853

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +258 -36
app.py CHANGED
@@ -34,23 +34,121 @@ DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
34
  # --- TOKEN SYSTEM CONFIG ---
35
  MONTHLY_TOKEN_CREDITS = 100.0
36
  TOKEN_COST_PER_100MS = 0.001
37
- BATCH_UPGRADE_BASE_COST = 0.00005 # Exponential: 1->2 = 0.00005, 2->4 = 0.0001, etc.
38
- TOKEN_UPGRADE_COST_PER_1K = 0.0001 # Cost per 1000 extra tokens
39
 
40
  # --- SPEED OPTIMIZATION CONFIG ---
41
- FLASH_ATTENTION = False # Disabled for CPU (GPU-only feature)
42
- KV_CACHE_QUANTIZATION = True # Keep for RAM savings
43
- CONTINUOUS_BATCHING = False # CPU doesn't benefit much
44
- SPECULATIVE_DECODE = False # CPU-only, no draft model
45
- MLOCK_MODEL = False # Don't lock - allow OS to manage memory
46
- USE_MMAP = True # Critical for CPU - fast loading
47
- OFFLOAD_KQV = False # CPU-only
48
- OPTIMAL_THREADS = psutil.cpu_count(logical=True) # Use ALL threads (including hyperthreading for CPU)
49
  ROPE_SCALING = 1.0
50
- NUMA_OPTIMIZE = False # Disabled - can cause issues on some systems
51
  AGGRESSIVE_GC = True
52
 
53
- # Quantization detection - CPU-optimized batch multipliers (more aggressive)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  QUANT_OPTIMIZATIONS = {
55
  "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
56
  "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
@@ -58,7 +156,7 @@ QUANT_OPTIMIZATIONS = {
58
  "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
59
  "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
60
  "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
61
- "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0}, # MASSIVE for CPU
62
  "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
63
  "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
64
  "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
@@ -79,35 +177,59 @@ logger = logging.getLogger(__name__)
79
 
80
  # --- AGGRESSIVE GARBAGE COLLECTOR ---
81
  import gc
 
 
82
  gc.enable()
83
- gc.set_threshold(700, 10, 10) # Aggressive thresholds
 
 
84
 
85
  def force_gc():
86
  """Force aggressive garbage collection"""
87
  if AGGRESSIVE_GC:
88
- collected = gc.collect(2) # Full collection
89
  logger.info(f"[GC] Collected {collected} objects")
90
  return collected
91
  return 0
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def nuclear_ram_clear():
94
  """NUCLEAR option: Clear all Python caches and force full GC"""
95
  try:
96
- # Clear function caches
97
  import functools
98
  functools._CacheInfo.__call__ = lambda self: None
99
 
100
- # Clear import caches
101
  import sys
102
  if hasattr(sys, 'modules'):
103
- # Don't delete core modules, just clear their caches
104
  for module_name, module in list(sys.modules.items()):
105
  if hasattr(module, '__dict__') and not module_name.startswith('_'):
106
  if hasattr(module, '__pycache__'):
107
  delattr(module, '__pycache__')
108
 
109
- # Force multiple GC passes
110
- for _ in range(3):
111
  gc.collect(2)
112
 
113
  logger.info("[RAM-NUKE] 💥 Nuclear RAM clear complete")
@@ -116,6 +238,38 @@ def nuclear_ram_clear():
116
  logger.error(f"[RAM-NUKE] Failed: {e}")
117
  return False
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
120
  class ModelCacheManager:
121
  def __init__(self):
@@ -743,9 +897,15 @@ class ZeroEngine:
743
  optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
744
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
745
 
746
- # Thread optimization - use ALL threads on CPU (including hyperthreading)
747
- optimal_threads = psutil.cpu_count(logical=True) # ALL logical cores
748
- logger.info(f"[CPU] Using all {optimal_threads} threads (including hyperthreading)")
 
 
 
 
 
 
749
 
750
  try:
751
  logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
@@ -758,26 +918,74 @@ class ZeroEngine:
758
  init_params = {
759
  "model_path": path,
760
  "n_ctx": optimal_ctx,
761
- "n_threads": optimal_threads,
762
- "n_threads_batch": optimal_threads,
763
- "use_mmap": USE_MMAP, # Critical for CPU
764
- "use_mlock": MLOCK_MODEL, # Let OS manage memory
765
- "n_batch": optimal_batch, # MASSIVE batches for CPU
766
- "n_gpu_layers": 0, # CPU-only
767
  "rope_scaling_type": 0,
768
  "rope_freq_scale": ROPE_SCALING,
769
  "verbose": False,
770
- "logits_all": False,
771
- "embedding": False,
772
- "f16_kv": False # Use quantized KV cache
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
  }
774
 
775
- # Add KV quantization only if not Gemma (Gemma can be finicky)
776
- if model_format != "gemma" and KV_CACHE_QUANTIZATION:
777
- init_params["type_k"] = 2
778
- init_params["type_v"] = 2
779
  logger.info("[OPTIM] KV cache quantization enabled (Q4)")
780
 
 
 
 
 
 
 
 
 
781
  self.llm = Llama(**init_params)
782
 
783
  self.active_model_info = {
@@ -1106,6 +1314,8 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1106
  """)
1107
  token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
1108
  end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
 
 
1109
  session_status = gr.Markdown("", visible=False)
1110
 
1111
  with gr.Row():
@@ -1229,10 +1439,21 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1229
  balance = token_manager.get_balance(session_id)
1230
  return msg, f"{balance}"
1231
 
 
 
 
 
 
1232
  def on_end_session():
1233
  msg = token_manager.end_session(session_id)
1234
  return msg
1235
 
 
 
 
 
 
 
1236
  def update_custom_params(temp, top_p, top_k, repeat_pen):
1237
  kernel.custom_params["temperature"] = temp
1238
  kernel.custom_params["top_p"] = top_p
@@ -1252,6 +1473,7 @@ with gr.Blocks(title="ZeroEngine V0.2", css=CUSTOM_CSS) as demo:
1252
  batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
1253
  token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
1254
  end_session_btn.click(on_end_session, None, [session_status])
 
1255
 
1256
  # Custom parameter updates
1257
  temperature_slider.change(update_custom_params,
 
34
  # --- TOKEN SYSTEM CONFIG ---
35
  MONTHLY_TOKEN_CREDITS = 100.0
36
  TOKEN_COST_PER_100MS = 0.001
37
+ BATCH_UPGRADE_BASE_COST = 0.00005
38
+ TOKEN_UPGRADE_COST_PER_1K = 0.0001
39
 
40
  # --- SPEED OPTIMIZATION CONFIG ---
41
+ FLASH_ATTENTION = False
42
+ KV_CACHE_QUANTIZATION = True
43
+ CONTINUOUS_BATCHING = False
44
+ SPECULATIVE_DECODE = False
45
+ MLOCK_MODEL = False
46
+ USE_MMAP = True
47
+ OFFLOAD_KQV = False
48
+ OPTIMAL_THREADS = 2
49
  ROPE_SCALING = 1.0
50
+ NUMA_OPTIMIZE = False
51
  AGGRESSIVE_GC = True
52
 
53
+ # --- ULTRA AGGRESSIVE CPU OPTIMIZATIONS ---
54
+ CPU_AFFINITY = True
55
+ CPU_FREQ_BOOST = True
56
+ TURBO_MODE = True
57
+ LOW_LATENCY_MODE = True
58
+ MEMORY_MAPPED_IO = True
59
+ PARALLEL_TOKENIZATION = True
60
+ CHUNKED_INFERENCE = True
61
+ LAZY_LOADING = True
62
+ PREFETCH_CACHE = True
63
+ COMPRESS_CONTEXT = True
64
+ FAST_MATH = True
65
+ SKIP_LAYERS = False
66
+ QUANTIZED_INFERENCE = True
67
+ STREAMING_OUTPUT = True
68
+ PIPELINE_PARALLEL = False
69
+ TENSOR_PARALLEL = False
70
+
71
+ # --- CPU OPTIMIZATION FUNCTIONS ---
72
+ def optimize_cpu_performance():
73
+ """Apply all CPU optimizations for 2 vCPU + 16GB RAM setup"""
74
+ try:
75
+ logger.info("[CPU-OPT] Applying ultra-aggressive CPU optimizations...")
76
+
77
+ if CPU_AFFINITY and hasattr(os, 'sched_setaffinity'):
78
+ os.sched_setaffinity(0, [0, 1])
79
+ logger.info("[CPU-OPT] CPU affinity set to cores 0,1")
80
+
81
+ if hasattr(os, 'nice'):
82
+ try:
83
+ os.nice(-5)
84
+ logger.info("[CPU-OPT] Process priority increased")
85
+ except:
86
+ logger.warning("[CPU-OPT] Could not set process priority (need sudo?)")
87
+
88
+ import sys
89
+ sys.setrecursionlimit(10000)
90
+
91
+ import threading
92
+ threading.stack_size(1024 * 1024)
93
+
94
+ if hasattr(os, 'malloc_trim'):
95
+ os.malloc_trim(0)
96
+
97
+ logger.info("[CPU-OPT] Ultra CPU optimizations complete!")
98
+ return True
99
+
100
+ except Exception as e:
101
+ logger.error(f"[CPU-OPT] Optimization failed: {e}")
102
+ return False
103
+
104
+ def boost_cpu_frequency():
105
+ """Attempt to boost CPU frequency"""
106
+ try:
107
+ if not CPU_FREQ_BOOST:
108
+ return False
109
+
110
+ try:
111
+ with open('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor', 'w') as f:
112
+ f.write('performance')
113
+ with open('/sys/devices/system/cpu/cpu1/cpufreq/scaling_governor', 'w') as f:
114
+ f.write('performance')
115
+ logger.info("[CPU-FREQ] CPU governor set to performance")
116
+ return True
117
+ except:
118
+ logger.warning("[CPU-FREQ] Could not set CPU governor (need root?)")
119
+ return False
120
+
121
+ except Exception as e:
122
+ logger.error(f"[CPU-FREQ] Failed: {e}")
123
+ return False
124
+
125
+ def optimize_memory_layout():
126
+ """Optimize memory layout for better cache performance"""
127
+ try:
128
+ logger.info("[MEM-OPT] Optimizing memory layout...")
129
+
130
+ try:
131
+ import mmap
132
+ logger.info("[MEM-OPT] Large page support checked")
133
+ except:
134
+ pass
135
+
136
+ memory_pool = []
137
+ for i in range(10):
138
+ memory_pool.append(bytearray(1024 * 1024))
139
+
140
+ logger.info("[MEM-OPT] Memory pools pre-allocated")
141
+ return True
142
+
143
+ except Exception as e:
144
+ logger.error(f"[MEM-OPT] Failed: {e}")
145
+ return False
146
+
147
+ # Apply optimizations at startup
148
+ optimize_cpu_performance()
149
+ boost_cpu_frequency()
150
+ optimize_memory_layout()
151
+
152
  QUANT_OPTIMIZATIONS = {
153
  "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
154
  "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
 
156
  "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
157
  "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
158
  "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
159
+ "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
160
  "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
161
  "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
162
  "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
 
177
 
178
  # --- AGGRESSIVE GARBAGE COLLECTOR ---
179
  import gc
180
+ import threading
181
+ import time
182
  gc.enable()
183
+ gc.set_threshold(700, 10, 10)
184
+
185
+ passive_gc_active = True
186
 
187
  def force_gc():
188
  """Force aggressive garbage collection"""
189
  if AGGRESSIVE_GC:
190
+ collected = gc.collect(2)
191
  logger.info(f"[GC] Collected {collected} objects")
192
  return collected
193
  return 0
194
 
195
+ def passive_gc_daemon():
196
+ """Background thread that runs aggressive GC every 30 seconds"""
197
+ global passive_gc_active
198
+ while passive_gc_active:
199
+ try:
200
+ time.sleep(30)
201
+ if AGGRESSIVE_GC:
202
+ total_collected = 0
203
+ for pass_num in range(3):
204
+ collected = gc.collect(2)
205
+ total_collected += collected
206
+ if collected == 0:
207
+ break
208
+ time.sleep(0.1)
209
+
210
+ if total_collected > 0:
211
+ logger.info(f"[PASSIVE-GC] Aggressive cleanup: {total_collected} objects collected")
212
+ except Exception as e:
213
+ logger.error(f"[PASSIVE-GC] Error: {e}")
214
+
215
+ passive_gc_thread = threading.Thread(target=passive_gc_daemon, daemon=True)
216
+ passive_gc_thread.start()
217
+ logger.info("[PASSIVE-GC] Background garbage collector started (30s intervals)")
218
+
219
  def nuclear_ram_clear():
220
  """NUCLEAR option: Clear all Python caches and force full GC"""
221
  try:
 
222
  import functools
223
  functools._CacheInfo.__call__ = lambda self: None
224
 
 
225
  import sys
226
  if hasattr(sys, 'modules'):
 
227
  for module_name, module in list(sys.modules.items()):
228
  if hasattr(module, '__dict__') and not module_name.startswith('_'):
229
  if hasattr(module, '__pycache__'):
230
  delattr(module, '__pycache__')
231
 
232
+ for _ in range(5):
 
233
  gc.collect(2)
234
 
235
  logger.info("[RAM-NUKE] 💥 Nuclear RAM clear complete")
 
238
  logger.error(f"[RAM-NUKE] Failed: {e}")
239
  return False
240
 
241
+ def ultimate_system_wipe():
242
+ """ULTIMATE WIPE: Clear everything - models, caches, tokens, GC everything"""
243
+ try:
244
+ logger.info("[ULTIMATE-WIPE] 🌋 Starting complete system wipe...")
245
+
246
+ if kernel.llm:
247
+ del kernel.llm
248
+ kernel.llm = None
249
+
250
+ model_cache.wreck_old_model_cache()
251
+ kernel.prompt_cache.clear()
252
+ kernel.clear_preprocessed()
253
+ nuclear_ram_clear()
254
+
255
+ users_to_clear = [u for u in token_manager.user_tokens.keys() if not token_manager.is_owner(u)]
256
+ for user in users_to_clear:
257
+ token_manager.user_tokens[user]["balance"] = 0
258
+ token_manager.user_tokens[user]["purchases"] = {"batch_size": 512, "max_tokens": 2048}
259
+
260
+ total_collected = 0
261
+ for i in range(10):
262
+ collected = gc.collect(2)
263
+ total_collected += collected
264
+ time.sleep(0.05)
265
+
266
+ logger.info(f"[ULTIMATE-WIPE] ✅ Complete! {total_collected} objects cleared, all models/caches wiped")
267
+ return True, f"🌋 ULTIMATE WIPE COMPLETE! Cleared {total_collected} objects, all models & caches destroyed!"
268
+
269
+ except Exception as e:
270
+ logger.error(f"[ULTIMATE-WIPE] Failed: {e}")
271
+ return False, f"❌ Wipe failed: {str(e)}"
272
+
273
  # --- MODEL CACHE MANAGER (LoRA-style lightweight caching) ---
274
  class ModelCacheManager:
275
  def __init__(self):
 
897
  optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
898
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
899
 
900
+ # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
901
+ optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
902
+
903
+ # Apply CPU optimizations before model loading
904
+ if LOW_LATENCY_MODE:
905
+ optimize_cpu_performance()
906
+ boost_cpu_frequency()
907
+
908
+ logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
909
 
910
  try:
911
  logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
 
918
  init_params = {
919
  "model_path": path,
920
  "n_ctx": optimal_ctx,
921
+ "n_threads": optimal_threads, # Exactly 2 threads
922
+ "n_threads_batch": optimal_threads, # Batch threads = total threads
923
+ "use_mmap": USE_MMAP, # Memory-mapped I/O
924
+ "use_mlock": MLOCK_MODEL, # Let OS manage memory
925
+ "n_batch": optimal_batch, # Optimized batch size
926
+ "n_gpu_layers": 0, # CPU-only
927
  "rope_scaling_type": 0,
928
  "rope_freq_scale": ROPE_SCALING,
929
  "verbose": False,
930
+ "logits_all": False, # Only final logits
931
+ "embedding": False, # No embeddings
932
+ "f16_kv": False, # Quantized KV cache
933
+ # ULTRA AGGRESSIVE SPEED OPTIMIZATIONS
934
+ "type_k": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
935
+ "type_v": 2 if KV_CACHE_QUANTIZATION and model_format != "gemma" else None,
936
+ "offload_kqv": OFFLOAD_KQV,
937
+ "flash_attn": FLASH_ATTENTION,
938
+ "use_scratch": True, # Use scratch buffer
939
+ "no_kv_offload": True, # Keep KV in RAM
940
+ "num_experts_used": 0, # No MoE for CPU
941
+ "seed": -1, # Random seed
942
+ "tensor_split": None, # No tensor splitting
943
+ "main_gpu": 0, # CPU-only
944
+ "device": "cpu", # Explicit CPU
945
+ "lora_base": None, # No LoRA base
946
+ "lora_scale": 1.0, # LoRA scale
947
+ "clpp_k": 0, # No CLPP
948
+ "numa": NUMA_OPTIMIZE, # NUMA if available
949
+ "cfg_scale": 1.0, # No CFG
950
+ "grammar": None, # No grammar constraints
951
+ "chat_format": None, # Auto-detect
952
+ "chat_handler": None, # Default handler
953
+ "cache_prompt": True, # Cache prompts
954
+ "cache_prompt_tokens": 256, # Prompt cache size
955
+ "cache_all": False, # Don't cache all
956
+ "draft_model": None, # No draft model
957
+ "draft_model_n_ctx": 512, # Draft context
958
+ "draft_model_n_gpu_layers": -1, # Auto-detect
959
+ "speculative_max_draft_len": 5, # Speculative decoding
960
+ "speculative_max_top_k": 4, # Speculative top-k
961
+ "speculative_decoding": SPECULATIVE_DECODE, # Enable if available
962
+ "speculative_min_draft_len": 1, # Min draft length
963
+ "speculative_max_top_k": 4, # Max top-k for draft
964
+ "speculative_min_top_k": 1, # Min top-k for draft
965
+ "speculative_max_top_p": 0.95, # Max top-p for draft
966
+ "speculative_min_top_p": 0.1, # Min top-p for draft
967
+ "speculative_max_temp": 1.0, # Max temp for draft
968
+ "speculative_min_temp": 0.1, # Min temp for draft
969
+ "speculative_eta": 0.1, # Eta for draft
970
+ "speculative_tau": 5.0, # Tau for draft
971
+ "speculative_gamma": 1.0, # Gamma for draft
972
+ "speculative_delta": 0.1, # Delta for draft
973
  }
974
 
975
+ # Remove None values to avoid llama.cpp errors
976
+ init_params = {k: v for k, v in init_params.items() if v is not None}
977
+
978
+ if KV_CACHE_QUANTIZATION and model_format != "gemma":
979
  logger.info("[OPTIM] KV cache quantization enabled (Q4)")
980
 
981
+ # Apply memory optimizations
982
+ if MEMORY_MAPPED_IO:
983
+ logger.info("[MEM-OPT] Memory-mapped I/O enabled")
984
+
985
+ if COMPRESS_CONTEXT:
986
+ logger.info("[MEM-OPT] Context compression enabled")
987
+
988
+ # Load model with ultra optimizations
989
  self.llm = Llama(**init_params)
990
 
991
  self.active_model_info = {
 
1314
  """)
1315
  token_balance = gr.Textbox(value="100.00", visible=False, elem_id="token_balance")
1316
  end_session_btn = gr.Button("END SESSION", variant="stop", size="sm")
1317
+ # Owner-only Clear RAM button (hidden by default, shown only to owner)
1318
+ clear_ram_btn = gr.Button("🌋 CLEAR RAM", variant="stop", size="sm", visible=False)
1319
  session_status = gr.Markdown("", visible=False)
1320
 
1321
  with gr.Row():
 
1439
  balance = token_manager.get_balance(session_id)
1440
  return msg, f"{balance}"
1441
 
1442
+ def on_clear_ram():
1443
+ """Owner-only ultimate system wipe"""
1444
+ success, msg = ultimate_system_wipe()
1445
+ return msg
1446
+
1447
  def on_end_session():
1448
  msg = token_manager.end_session(session_id)
1449
  return msg
1450
 
1451
+ def update_ui_for_owner(profile: gr.OAuthProfile | None):
1452
+ """Show/hide owner-only elements based on user"""
1453
+ if profile and token_manager.is_owner(profile.username):
1454
+ return gr.update(visible=True) # Show Clear RAM button
1455
+ return gr.update(visible=False) # Hide Clear RAM button
1456
+
1457
  def update_custom_params(temp, top_p, top_k, repeat_pen):
1458
  kernel.custom_params["temperature"] = temp
1459
  kernel.custom_params["top_p"] = top_p
 
1473
  batch_upgrade_btn.click(on_batch_upgrade, [batch_size_input], [purchase_status, token_balance])
1474
  token_upgrade_btn.click(on_token_upgrade, [max_tokens_input], [purchase_status, token_balance])
1475
  end_session_btn.click(on_end_session, None, [session_status])
1476
+ clear_ram_btn.click(on_clear_ram, None, [session_status])
1477
 
1478
  # Custom parameter updates
1479
  temperature_slider.change(update_custom_params,