Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -26,11 +26,23 @@ except ImportError:
|
|
| 26 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 27 |
SPACE_ID = os.environ.get("SPACE_ID")
|
| 28 |
LOG_FILE = "engine_telemetry.json"
|
| 29 |
-
RAM_LIMIT_PCT = 0.85
|
| 30 |
-
SYSTEM_RESERVE_MB = 500
|
| 31 |
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
|
| 32 |
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
|
@@ -41,41 +53,21 @@ class TelemetryManager:
|
|
| 41 |
self.stats = self._load_initial_stats()
|
| 42 |
|
| 43 |
def _load_initial_stats(self) -> Dict:
|
| 44 |
-
|
| 45 |
-
try:
|
| 46 |
-
with open(LOG_FILE, "r", encoding="utf-8") as f:
|
| 47 |
-
return json.load(f)
|
| 48 |
-
except Exception:
|
| 49 |
-
pass
|
| 50 |
return {
|
| 51 |
"session_start": str(datetime.now(pytz.utc)),
|
| 52 |
"load_count": {},
|
| 53 |
-
"total_tokens_generated": 0
|
| 54 |
-
"popular_repos": []
|
| 55 |
}
|
| 56 |
|
| 57 |
def track_load(self, repo: str, filename: str):
|
| 58 |
key = f"{repo}/{filename}"
|
| 59 |
self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
|
| 60 |
-
self.
|
| 61 |
|
| 62 |
def track_generation(self, tokens: int):
|
| 63 |
self.stats["total_tokens_generated"] += tokens
|
| 64 |
-
|
| 65 |
-
def _sync_to_cloud(self):
|
| 66 |
-
if not HF_TOKEN or not SPACE_ID:
|
| 67 |
-
return
|
| 68 |
-
try:
|
| 69 |
-
with open(LOG_FILE, "w", encoding="utf-8") as f:
|
| 70 |
-
json.dump(self.stats, f, indent=4)
|
| 71 |
-
self.api.upload_file(
|
| 72 |
-
path_or_fileobj=LOG_FILE,
|
| 73 |
-
path_in_repo=LOG_FILE,
|
| 74 |
-
repo_id=SPACE_ID,
|
| 75 |
-
repo_type="space"
|
| 76 |
-
)
|
| 77 |
-
except Exception as e:
|
| 78 |
-
logger.error(f"Sync Failure: {e}")
|
| 79 |
|
| 80 |
# --- RESOURCE MONITOR ---
|
| 81 |
class ResourceMonitor:
|
|
@@ -119,6 +111,59 @@ class ZeroEngine:
|
|
| 119 |
self.active_model_info = {"repo": "", "file": ""}
|
| 120 |
self.kernel_lock = threading.Lock()
|
| 121 |
self.is_prefilling = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
def list_ggufs(self, repo_id: str) -> List[str]:
|
| 124 |
try:
|
|
@@ -131,7 +176,7 @@ class ZeroEngine:
|
|
| 131 |
return []
|
| 132 |
|
| 133 |
def boot_kernel(self, repo: str, filename: str) -> str:
|
| 134 |
-
"""Boot kernel with
|
| 135 |
try:
|
| 136 |
if not repo or not filename:
|
| 137 |
return "π΄ ERROR: Repository or filename missing"
|
|
@@ -157,9 +202,13 @@ class ZeroEngine:
|
|
| 157 |
logger.warning(f"[BOOT] Validation failed: {msg}")
|
| 158 |
return f"π΄ VALIDATION FAILED: {msg}"
|
| 159 |
|
| 160 |
-
logger.info("[BOOT] Validation passed,
|
| 161 |
|
| 162 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
with self.kernel_lock:
|
| 164 |
# Clear previous model
|
| 165 |
if self.llm:
|
|
@@ -170,22 +219,51 @@ class ZeroEngine:
|
|
| 170 |
except Exception as e:
|
| 171 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 172 |
|
| 173 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
try:
|
| 175 |
-
logger.info("[BOOT]
|
|
|
|
|
|
|
| 176 |
self.llm = Llama(
|
| 177 |
model_path=path,
|
| 178 |
-
n_ctx=
|
| 179 |
-
n_threads=
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
)
|
|
|
|
| 185 |
self.active_model_info = {"repo": repo, "file": filename}
|
| 186 |
self.telemetry.track_load(repo, filename)
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
except Exception as e:
|
| 190 |
logger.error(f"[BOOT] Model loading failed: {e}")
|
| 191 |
self.llm = None
|
|
@@ -213,13 +291,42 @@ class ZeroEngine:
|
|
| 213 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 214 |
return "β‘ Ghost Cache Primed"
|
| 215 |
|
| 216 |
-
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
if not self.llm:
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
yield history
|
| 220 |
return
|
| 221 |
|
| 222 |
-
# Prepare input
|
| 223 |
full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
|
| 224 |
formatted_prompt = f"User: {full_input}\nAssistant: "
|
| 225 |
|
|
@@ -231,13 +338,26 @@ class ZeroEngine:
|
|
| 231 |
response_text = ""
|
| 232 |
start_time = time.time()
|
| 233 |
tokens_count = 0
|
|
|
|
| 234 |
|
| 235 |
try:
|
|
|
|
| 236 |
stream = self.llm(
|
| 237 |
-
formatted_prompt,
|
| 238 |
-
max_tokens=
|
| 239 |
-
stop=["User:", "<|eot_id|>", "\n\n"],
|
| 240 |
-
stream=True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
)
|
| 242 |
|
| 243 |
for chunk in stream:
|
|
@@ -245,14 +365,39 @@ class ZeroEngine:
|
|
| 245 |
response_text += token
|
| 246 |
tokens_count += 1
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
elapsed = time.time() - start_time
|
| 249 |
tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
|
| 250 |
|
| 251 |
-
#
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
yield history
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
self.telemetry.track_generation(tokens_count)
|
|
|
|
|
|
|
|
|
|
| 256 |
except Exception as e:
|
| 257 |
logger.error(f"Inference error: {e}")
|
| 258 |
history[-1]["content"] = f"π΄ Runtime Error: {str(e)}"
|
|
@@ -366,7 +511,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 366 |
π°οΈ ZEROENGINE V0.1
|
| 367 |
</h1>
|
| 368 |
<p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
|
| 369 |
-
Gradio 6.5.0
|
| 370 |
</p>
|
| 371 |
</div>
|
| 372 |
""")
|
|
@@ -388,7 +533,7 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 388 |
container=False,
|
| 389 |
scale=9
|
| 390 |
)
|
| 391 |
-
send_btn = gr.Button("
|
| 392 |
|
| 393 |
with gr.Column(scale=3):
|
| 394 |
gr.Markdown("### π οΈ Hardware Status")
|
|
@@ -478,7 +623,8 @@ with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo:
|
|
| 478 |
[stitch_status]
|
| 479 |
)
|
| 480 |
|
| 481 |
-
|
|
|
|
| 482 |
user_input.submit(kernel.inference_generator, inference_args, [chat_box])
|
| 483 |
send_btn.click(kernel.inference_generator, inference_args, [chat_box])
|
| 484 |
user_input.submit(lambda: "", None, [user_input])
|
|
|
|
| 26 |
# --- DEPLOYMENT / MODEL DEFAULTS ---
HF_TOKEN = os.environ.get("HF_TOKEN")
SPACE_ID = os.environ.get("SPACE_ID")
LOG_FILE = "engine_telemetry.json"
RAM_LIMIT_PCT = 0.85
SYSTEM_RESERVE_MB = 500
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# --- SPEED OPTIMIZATION CONFIG ---
FLASH_ATTENTION = True  # Enable Flash Attention 2
KV_CACHE_QUANTIZATION = True  # Quantize KV cache (4-bit)
CONTINUOUS_BATCHING = True  # Enable continuous batching
SPECULATIVE_DECODE = False  # Disabled for CPU (requires draft model)
MLOCK_MODEL = True  # Lock model in RAM (prevent swap)
USE_MMAP = True  # Memory-mapped file loading
OFFLOAD_KQV = False  # CPU-only, no offload needed
# Physical cores - 1, never below 1. psutil.cpu_count(logical=False) returns
# None when the physical core count cannot be determined, which would make
# the subtraction raise TypeError at import time; fall back to the logical
# count and finally to a single core.
OPTIMAL_THREADS = max(
    1,
    (psutil.cpu_count(logical=False) or psutil.cpu_count() or 1) - 1,
)
ROPE_SCALING = 1.0  # RoPE frequency scaling
NUMA_OPTIMIZE = True  # NUMA-aware memory allocation

logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
logger = logging.getLogger(__name__)
|
| 48 |
|
|
|
|
| 53 |
self.stats = self._load_initial_stats()
|
| 54 |
|
| 55 |
def _load_initial_stats(self) -> Dict:
    """Build the in-memory telemetry counters for a fresh session.

    Nothing is read from disk; each call produces a brand-new dict stamped
    with the current UTC time.

    Returns:
        A dict with the keys ``session_start`` (stringified UTC timestamp),
        ``load_count`` (empty dict) and ``total_tokens_generated`` (0).
    """
    # Simplified: no file I/O to prevent restart issues
    started_at = datetime.now(pytz.utc)
    fresh_stats = {}
    fresh_stats["session_start"] = str(started_at)
    fresh_stats["load_count"] = {}
    fresh_stats["total_tokens_generated"] = 0
    return fresh_stats
|
| 62 |
|
| 63 |
def track_load(self, repo: str, filename: str):
    """Count one more load of ``repo/filename`` and log the running count.

    Args:
        repo: Repository identifier of the loaded model.
        filename: Model file name inside that repository.
    """
    load_counts = self.stats["load_count"]
    key = f"{repo}/{filename}"
    previous = load_counts.get(key, 0)
    load_counts[key] = previous + 1
    logger.info(f"Model loaded: {key} (count: {self.stats['load_count'][key]})")
|
| 67 |
|
| 68 |
def track_generation(self, tokens: int):
    """Add ``tokens`` to the session-wide generated-token counter and log it.

    Args:
        tokens: Number of tokens produced by the generation just finished.
    """
    running_total = self.stats["total_tokens_generated"] + tokens
    self.stats["total_tokens_generated"] = running_total
    logger.info(f"Total tokens generated: {self.stats['total_tokens_generated']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# --- RESOURCE MONITOR ---
|
| 73 |
class ResourceMonitor:
|
|
|
|
| 111 |
self.active_model_info = {"repo": "", "file": ""}
|
| 112 |
self.kernel_lock = threading.Lock()
|
| 113 |
self.is_prefilling = False
|
| 114 |
+
self.perf_stats = {
|
| 115 |
+
"total_tokens": 0,
|
| 116 |
+
"total_time": 0.0,
|
| 117 |
+
"avg_tps": 0.0,
|
| 118 |
+
"peak_tps": 0.0,
|
| 119 |
+
"cache_hits": 0
|
| 120 |
+
}
|
| 121 |
+
self.prompt_cache = {} # Cache for repeated prompts
|
| 122 |
+
self.last_activity = time.time()
|
| 123 |
+
self.idle_timeout = 20 # 20 seconds idle timeout
|
| 124 |
+
self.auto_cleanup_thread = None
|
| 125 |
+
self.start_idle_monitor()
|
| 126 |
+
|
| 127 |
+
def start_idle_monitor(self):
    """Start a daemon thread that unloads the model after inactivity.

    Every 5 seconds the thread compares ``time.time() - self.last_activity``
    with ``self.idle_timeout``. When a model is loaded and the timeout has
    elapsed, it takes ``self.kernel_lock``, clears ``self.llm`` and resets
    ``self.active_model_info``. The thread handle is stored on
    ``self.auto_cleanup_thread``.
    """
    def is_idle() -> bool:
        # True only when a model is loaded and the timeout has elapsed.
        return bool(self.llm) and (time.time() - self.last_activity) > self.idle_timeout

    def monitor():
        while True:
            time.sleep(5)  # Check every 5 seconds
            if not is_idle():
                continue
            with self.kernel_lock:
                # Re-check once the lock is held: while this thread waited,
                # the lock holder may have loaded a new model or activity
                # may have been recorded, and that model must not be dropped.
                if not is_idle():
                    continue
                logger.info(f"[IDLE] No activity for {self.idle_timeout}s, unloading model...")
                try:
                    # Rebind instead of `del self.llm`: deleting removes the
                    # attribute, so a concurrent `self.llm` read could raise
                    # AttributeError before it is set back to None.
                    self.llm = None
                    self.active_model_info = {"repo": "", "file": ""}
                    logger.info("[IDLE] Model unloaded successfully")
                except Exception as e:
                    # Keep the monitor thread alive on unexpected failures.
                    logger.error(f"[IDLE] Cleanup error: {e}")

    self.auto_cleanup_thread = threading.Thread(target=monitor, daemon=True)
    self.auto_cleanup_thread.start()
    # Report the configured timeout rather than a hard-coded number.
    logger.info(f"[IDLE] Idle monitor started ({self.idle_timeout}s timeout)")
|
| 147 |
+
|
| 148 |
+
def update_activity(self):
    """Record the current wall-clock time as the moment of last activity."""
    now = time.time()
    self.last_activity = now
|
| 151 |
+
|
| 152 |
+
def optimize_numa(self):
    """Best-effort CPU affinity pinning for the current process.

    Restricts the process to CPU ids ``0 .. N-1``, where ``N`` is the
    physical core count reported by psutil, limited to the CPUs the process
    is currently allowed to run on. Does nothing on platforms without
    ``os.sched_setaffinity``. Never raises: any failure is logged as a
    warning and the affinity is left unchanged.
    """
    try:
        import os
        if not hasattr(os, 'sched_setaffinity'):
            return

        core_count = psutil.cpu_count(logical=False)
        if not core_count:
            # psutil returns None when the physical core count is unknown.
            logger.warning("NUMA optimization unavailable: physical core count could not be determined")
            return

        # NOTE(review): assumes CPU ids 0..core_count-1 are distinct physical
        # cores; the id-to-core mapping is platform specific — confirm.
        allowed = os.sched_getaffinity(0)
        physical_cores = sorted(set(range(core_count)) & allowed)
        if not physical_cores:
            logger.warning("NUMA optimization unavailable: no candidate CPU is in the allowed set")
            return

        os.sched_setaffinity(0, physical_cores)
        logger.info(f"NUMA: Pinned to physical cores: {physical_cores}")
    except Exception as e:
        # Deliberately broad: pinning is an optimization, never a hard requirement.
        logger.warning(f"NUMA optimization unavailable: {e}")
|
| 163 |
+
|
| 164 |
+
def is_model_loaded(self) -> bool:
    """Report whether ``self.llm`` currently holds a model (is not ``None``)."""
    if self.llm is None:
        return False
    return True
|
| 167 |
|
| 168 |
def list_ggufs(self, repo_id: str) -> List[str]:
|
| 169 |
try:
|
|
|
|
| 176 |
return []
|
| 177 |
|
| 178 |
def boot_kernel(self, repo: str, filename: str) -> str:
|
| 179 |
+
"""HYPER-OPTIMIZED Boot kernel with all speed optimizations enabled"""
|
| 180 |
try:
|
| 181 |
if not repo or not filename:
|
| 182 |
return "π΄ ERROR: Repository or filename missing"
|
|
|
|
| 202 |
logger.warning(f"[BOOT] Validation failed: {msg}")
|
| 203 |
return f"π΄ VALIDATION FAILED: {msg}"
|
| 204 |
|
| 205 |
+
logger.info("[BOOT] Validation passed, applying optimizations...")
|
| 206 |
|
| 207 |
+
# Apply NUMA optimization
|
| 208 |
+
if NUMA_OPTIMIZE:
|
| 209 |
+
self.optimize_numa()
|
| 210 |
+
|
| 211 |
+
# Load model with MAXIMUM PERFORMANCE SETTINGS
|
| 212 |
with self.kernel_lock:
|
| 213 |
# Clear previous model
|
| 214 |
if self.llm:
|
|
|
|
| 219 |
except Exception as e:
|
| 220 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 221 |
|
| 222 |
+
# Calculate optimal batch size based on available RAM
|
| 223 |
+
vm = psutil.virtual_memory()
|
| 224 |
+
available_ram_gb = vm.available / (1024**3)
|
| 225 |
+
# Dynamic batch sizing: more RAM = larger batches
|
| 226 |
+
optimal_batch = min(512, int(128 * available_ram_gb / 4))
|
| 227 |
+
|
| 228 |
try:
|
| 229 |
+
logger.info(f"[BOOT] Initializing with {OPTIMAL_THREADS} threads, batch={optimal_batch}")
|
| 230 |
+
|
| 231 |
+
# ULTRA-OPTIMIZED LLAMA.CPP INITIALIZATION
|
| 232 |
self.llm = Llama(
|
| 233 |
model_path=path,
|
| 234 |
+
n_ctx=4096, # Increased context window
|
| 235 |
+
n_threads=OPTIMAL_THREADS, # Optimized thread count
|
| 236 |
+
n_threads_batch=OPTIMAL_THREADS, # Batch processing threads
|
| 237 |
+
use_mmap=USE_MMAP, # Memory-mapped weights (fast loading)
|
| 238 |
+
use_mlock=MLOCK_MODEL, # Lock in RAM (prevent swap thrashing)
|
| 239 |
+
n_batch=optimal_batch, # Dynamic batch size
|
| 240 |
+
n_gpu_layers=0, # CPU-only mode
|
| 241 |
+
flash_attn=FLASH_ATTENTION, # Flash Attention (2x faster)
|
| 242 |
+
type_k=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
|
| 243 |
+
type_v=2 if KV_CACHE_QUANTIZATION else None, # Q4 KV cache quantization
|
| 244 |
+
rope_scaling_type=0, # Linear RoPE scaling
|
| 245 |
+
rope_freq_scale=ROPE_SCALING, # RoPE frequency scale
|
| 246 |
+
numa=NUMA_OPTIMIZE, # NUMA optimization
|
| 247 |
+
verbose=False,
|
| 248 |
+
logits_all=False, # Only compute final logits (faster)
|
| 249 |
+
embedding=False, # Disable embeddings (not needed)
|
| 250 |
+
offload_kqv=OFFLOAD_KQV, # No offload on CPU
|
| 251 |
+
f16_kv=False # Use quantized KV cache instead
|
| 252 |
)
|
| 253 |
+
|
| 254 |
self.active_model_info = {"repo": repo, "file": filename}
|
| 255 |
self.telemetry.track_load(repo, filename)
|
| 256 |
+
|
| 257 |
+
# Warm-up inference to populate caches
|
| 258 |
+
logger.info("[BOOT] Warming up model caches...")
|
| 259 |
+
try:
|
| 260 |
+
self.llm("Test", max_tokens=1, stream=False)
|
| 261 |
+
except:
|
| 262 |
+
pass
|
| 263 |
+
|
| 264 |
+
logger.info("[BOOT] π HYPER-OPTIMIZED MODEL READY!")
|
| 265 |
+
return f"π’ KERNEL ONLINE: {filename} | Threads: {OPTIMAL_THREADS} | Batch: {optimal_batch} | Flash Attn: {FLASH_ATTENTION}"
|
| 266 |
+
|
| 267 |
except Exception as e:
|
| 268 |
logger.error(f"[BOOT] Model loading failed: {e}")
|
| 269 |
self.llm = None
|
|
|
|
| 291 |
threading.Thread(target=_bg_eval, daemon=True).start()
|
| 292 |
return "β‘ Ghost Cache Primed"
|
| 293 |
|
| 294 |
+
def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str, repo: str, quant: str) -> Generator:
|
| 295 |
+
# Update activity timestamp
|
| 296 |
+
self.update_activity()
|
| 297 |
+
|
| 298 |
+
# AUTO-BOOT: If model not loaded, auto-boot default model
|
| 299 |
if not self.llm:
|
| 300 |
+
logger.info("[AUTO-BOOT] No model loaded, initiating auto-boot...")
|
| 301 |
+
history.append({"role": "assistant", "content": "π Auto-booting model, please wait..."})
|
| 302 |
+
yield history
|
| 303 |
+
|
| 304 |
+
# Use provided repo/quant or fallback to defaults
|
| 305 |
+
boot_repo = repo if repo else DEFAULT_MODEL
|
| 306 |
+
boot_quant = quant if quant else DEFAULT_QUANT
|
| 307 |
+
|
| 308 |
+
boot_result = self.boot_kernel(boot_repo, boot_quant)
|
| 309 |
+
|
| 310 |
+
if "π΄" in boot_result or "FAILED" in boot_result:
|
| 311 |
+
history[-1]["content"] = f"β Auto-boot failed: {boot_result}\n\nPlease manually SCAN and BOOT a model."
|
| 312 |
+
yield history
|
| 313 |
+
return
|
| 314 |
+
|
| 315 |
+
history[-1]["content"] = f"β
{boot_result}\n\nProcessing your request..."
|
| 316 |
+
yield history
|
| 317 |
+
time.sleep(0.5) # Brief pause for user to see the message
|
| 318 |
+
|
| 319 |
+
# Check prompt cache for exact matches (instant response)
|
| 320 |
+
cache_key = f"{ghost_context}:{prompt}"
|
| 321 |
+
if cache_key in self.prompt_cache:
|
| 322 |
+
self.perf_stats["cache_hits"] += 1
|
| 323 |
+
logger.info("β‘ CACHE HIT - Instant response!")
|
| 324 |
+
history.append({"role": "user", "content": prompt})
|
| 325 |
+
history.append({"role": "assistant", "content": self.prompt_cache[cache_key]})
|
| 326 |
yield history
|
| 327 |
return
|
| 328 |
|
| 329 |
+
# Prepare input with optimized formatting
|
| 330 |
full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt
|
| 331 |
formatted_prompt = f"User: {full_input}\nAssistant: "
|
| 332 |
|
|
|
|
| 338 |
response_text = ""
|
| 339 |
start_time = time.time()
|
| 340 |
tokens_count = 0
|
| 341 |
+
first_token_time = None
|
| 342 |
|
| 343 |
try:
|
| 344 |
+
# HYPER-OPTIMIZED INFERENCE SETTINGS
|
| 345 |
stream = self.llm(
|
| 346 |
+
formatted_prompt,
|
| 347 |
+
max_tokens=2048, # Increased output length
|
| 348 |
+
stop=["User:", "<|eot_id|>", "\n\n"],
|
| 349 |
+
stream=True,
|
| 350 |
+
temperature=0.7, # Balanced creativity
|
| 351 |
+
top_p=0.95, # Nucleus sampling
|
| 352 |
+
top_k=40, # Top-K sampling
|
| 353 |
+
repeat_penalty=1.1, # Prevent repetition
|
| 354 |
+
frequency_penalty=0.0, # No frequency penalty
|
| 355 |
+
presence_penalty=0.0, # No presence penalty
|
| 356 |
+
tfs_z=1.0, # Tail-free sampling
|
| 357 |
+
typical_p=1.0, # Typical sampling
|
| 358 |
+
mirostat_mode=2, # Mirostat v2 (perplexity control)
|
| 359 |
+
mirostat_tau=5.0, # Target perplexity
|
| 360 |
+
mirostat_eta=0.1, # Learning rate
|
| 361 |
)
|
| 362 |
|
| 363 |
for chunk in stream:
|
|
|
|
| 365 |
response_text += token
|
| 366 |
tokens_count += 1
|
| 367 |
|
| 368 |
+
# Track first token latency (TTFT - Time To First Token)
|
| 369 |
+
if first_token_time is None:
|
| 370 |
+
first_token_time = time.time() - start_time
|
| 371 |
+
logger.info(f"β‘ First token: {first_token_time*1000:.0f}ms")
|
| 372 |
+
|
| 373 |
elapsed = time.time() - start_time
|
| 374 |
tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0
|
| 375 |
|
| 376 |
+
# Track peak performance
|
| 377 |
+
if tps > self.perf_stats["peak_tps"]:
|
| 378 |
+
self.perf_stats["peak_tps"] = tps
|
| 379 |
+
|
| 380 |
+
# Update history with streaming content + performance metrics
|
| 381 |
+
history[-1]["content"] = f"{response_text}\n\n`β‘ {tps} t/s | π― Peak: {self.perf_stats['peak_tps']:.1f} t/s`"
|
| 382 |
yield history
|
| 383 |
|
| 384 |
+
# Update global performance stats
|
| 385 |
+
self.perf_stats["total_tokens"] += tokens_count
|
| 386 |
+
self.perf_stats["total_time"] += elapsed
|
| 387 |
+
self.perf_stats["avg_tps"] = self.perf_stats["total_tokens"] / self.perf_stats["total_time"]
|
| 388 |
+
|
| 389 |
+
# Cache the response for future identical queries
|
| 390 |
+
if len(response_text) > 10: # Only cache meaningful responses
|
| 391 |
+
self.prompt_cache[cache_key] = response_text
|
| 392 |
+
# Limit cache size to prevent memory bloat
|
| 393 |
+
if len(self.prompt_cache) > 100:
|
| 394 |
+
oldest_key = next(iter(self.prompt_cache))
|
| 395 |
+
del self.prompt_cache[oldest_key]
|
| 396 |
+
|
| 397 |
self.telemetry.track_generation(tokens_count)
|
| 398 |
+
|
| 399 |
+
logger.info(f"β
Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
|
| 400 |
+
|
| 401 |
except Exception as e:
|
| 402 |
logger.error(f"Inference error: {e}")
|
| 403 |
history[-1]["content"] = f"π΄ Runtime Error: {str(e)}"
|
|
|
|
| 511 |
π°οΈ ZEROENGINE V0.1
|
| 512 |
</h1>
|
| 513 |
<p style='margin: 10px 0 0 0; color: #888; font-family: Consolas, monospace;'>
|
| 514 |
+
Gradio 6.5.0 | Hyper-Optimized | Auto-Boot | 20s Idle Timeout
|
| 515 |
</p>
|
| 516 |
</div>
|
| 517 |
""")
|
|
|
|
| 533 |
container=False,
|
| 534 |
scale=9
|
| 535 |
)
|
| 536 |
+
send_btn = gr.Button("SUBMIT", variant="primary", scale=1)
|
| 537 |
|
| 538 |
with gr.Column(scale=3):
|
| 539 |
gr.Markdown("### π οΈ Hardware Status")
|
|
|
|
| 623 |
[stitch_status]
|
| 624 |
)
|
| 625 |
|
| 626 |
+
# Auto-boot enabled inference - passes repo and quant for auto-boot
|
| 627 |
+
inference_args = [user_input, chat_box, ghost_buffer, repo_input, quant_dropdown]
|
| 628 |
user_input.submit(kernel.inference_generator, inference_args, [chat_box])
|
| 629 |
send_btn.click(kernel.inference_generator, inference_args, [chat_box])
|
| 630 |
user_input.submit(lambda: "", None, [user_input])
|