Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,115 @@ from typing import List, Dict, Optional, Generator
|
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
from huggingface_hub import HfApi, hf_hub_download
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# Initialize logger early for startup functions
|
| 15 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
|
@@ -610,7 +719,7 @@ class TokenManager:
|
|
| 610 |
return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
|
| 611 |
return "No active session found."
|
| 612 |
|
| 613 |
-
|
| 614 |
import math
|
| 615 |
token_manager = TokenManager()
|
| 616 |
|
|
@@ -737,13 +846,20 @@ class ZeroEngine:
|
|
| 737 |
return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
|
| 738 |
|
| 739 |
def preprocess_input(self, text: str):
|
| 740 |
-
"""Pre-process keyboard input
|
| 741 |
-
if not
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
return
|
| 743 |
|
| 744 |
def _preprocess():
|
| 745 |
try:
|
| 746 |
-
logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars
|
| 747 |
tokens = self.llm.tokenize(text.encode("utf-8"))
|
| 748 |
self.preprocessed_tokens = tokens
|
| 749 |
logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
|
|
@@ -751,15 +867,13 @@ class ZeroEngine:
|
|
| 751 |
logger.error(f"[PREPROCESS] Failed: {e}")
|
| 752 |
self.preprocessed_tokens = None
|
| 753 |
|
| 754 |
-
# Cancel previous timer if user is still typing
|
| 755 |
if self.typing_timer:
|
| 756 |
self.typing_timer.cancel()
|
| 757 |
|
| 758 |
-
# Start new timer - preprocess after 1 second of no typing
|
| 759 |
self.typing_timer = threading.Timer(1.0, _preprocess)
|
| 760 |
self.typing_timer.daemon = True
|
| 761 |
self.typing_timer.start()
|
| 762 |
-
|
| 763 |
def clear_preprocessed(self):
|
| 764 |
"""Clear preprocessed tokens and force GC"""
|
| 765 |
if self.preprocessed_tokens:
|
|
@@ -1085,12 +1199,11 @@ class ZeroEngine:
|
|
| 1085 |
time.sleep(0.5) # Brief pause for user to see the message
|
| 1086 |
|
| 1087 |
# Check prompt cache for exact matches (instant response)
|
| 1088 |
-
|
| 1089 |
-
if
|
| 1090 |
-
|
| 1091 |
-
logger.info(" CACHE HIT - Instant response!")
|
| 1092 |
history.append({"role": "user", "content": prompt})
|
| 1093 |
-
history.append({"role": "assistant", "content":
|
| 1094 |
yield history
|
| 1095 |
return
|
| 1096 |
|
|
@@ -1180,6 +1293,12 @@ class ZeroEngine:
|
|
| 1180 |
|
| 1181 |
# Aggressive GC after generation
|
| 1182 |
force_gc()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1183 |
|
| 1184 |
logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
|
| 1185 |
|
|
|
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
from huggingface_hub import HfApi, hf_hub_download
|
| 13 |
+
from gradio_client import Client
|
| 14 |
+
import hashlib
|
| 15 |
+
|
| 16 |
+
# Backend processor connection
|
| 17 |
+
BACKEND_URL = "turtle170/ZeroEngine-Backend"
|
| 18 |
+
|
| 19 |
+
class BackendProcessor:
    """Client for the ZeroEngine-Backend Gradio Space.

    Wraps a lazily-created ``gradio_client.Client`` and exposes
    fire-and-forget helpers for tokenization, response caching, and
    token charging. Every network call is best-effort: failures are
    logged and never propagate to the caller.
    """

    def __init__(self):
        self.client = None              # gradio_client.Client once connected
        self.connected = False
        self.last_connect_attempt = 0   # epoch seconds of last connect try
        self.connect_cooldown = 30      # seconds to wait before retrying a failed connect
        # connect() is invoked concurrently from the daemon threads spawned by
        # tokenize_async/cache_response/charge_tokens_async; guard the shared
        # connection state so only one thread creates the Client.
        self._connect_lock = threading.Lock()

    def connect(self):
        """Lazily connect with a retry cooldown (thread-safe).

        Returns:
            bool: True if a usable connection exists, False otherwise
            (including while the cooldown after a failed attempt is active).
        """
        with self._connect_lock:
            if self.connected:
                return True

            current_time = time.time()
            if current_time - self.last_connect_attempt < self.connect_cooldown:
                # Still cooling down after a failed attempt — don't hammer the backend.
                return False

            try:
                self.last_connect_attempt = current_time
                self.client = Client(BACKEND_URL)
                self.connected = True
                logger.info("[BACKEND] ✅ Connected to ZeroEngine-Backend")
                return True
            except Exception as e:
                logger.error(f"[BACKEND] ❌ Connection failed: {e}")
                self.connected = False
                return False

    def tokenize_async(self, text: str):
        """Send *text* to the backend for tokenization in a daemon thread.

        No-op for empty or very short (<5 chars) input. Result is only logged.
        """
        if not text or len(text) < 5:
            return

        def _background():
            try:
                if self.connect():
                    result = self.client.predict(text, api_name="/predict")
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Tokenized: ~{data['estimated_tokens']} tokens")
            except Exception as e:
                logger.warning(f"[BACKEND] Tokenize failed: {e}")

        threading.Thread(target=_background, daemon=True).start()

    def cache_response(self, prompt: str, response: str):
        """Cache *response* under an MD5-derived key of *prompt* (async, best-effort)."""
        # md5 is used as a cheap non-cryptographic cache key, not for security.
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:16]

        def _background():
            try:
                if self.connect():
                    result = self.client.predict(
                        prompt_hash,
                        response,
                        api_name="/predict_3"
                    )
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Cached response: {prompt_hash}")
            except Exception as e:
                logger.warning(f"[BACKEND] Cache failed: {e}")

        threading.Thread(target=_background, daemon=True).start()

    def get_cached_response(self, prompt: str) -> Optional[str]:
        """Look up a previously cached response for *prompt*.

        Synchronous (blocks the caller for one backend round-trip).

        Returns:
            The cached response text, or None on miss / failure / no connection.
        """
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:16]

        try:
            if self.connect():
                result = self.client.predict(
                    prompt_hash,
                    api_name="/predict_4"
                )
                data = json.loads(result)
                if data.get("success"):
                    logger.info(f"[BACKEND] ⚡ CACHE HIT: {prompt_hash}")
                    return data["response"]
        except Exception as e:
            logger.warning(f"[BACKEND] Cache retrieval failed: {e}")

        return None

    def charge_tokens_async(self, username: str, duration_ms: float):
        """Charge *username* for *duration_ms* of generation time (async, best-effort)."""
        def _background():
            try:
                if self.connect():
                    result = self.client.predict(
                        username,
                        duration_ms,
                        api_name="/predict_5"
                    )
                    data = json.loads(result)
                    if data.get("success"):
                        logger.info(f"[BACKEND] Charged {username}: {data['cost']} tokens")
            except Exception as e:
                logger.warning(f"[BACKEND] Charge failed: {e}")

        threading.Thread(target=_background, daemon=True).start()
|
| 122 |
|
| 123 |
# Initialize logger early for startup functions
|
| 124 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s')
|
|
|
|
| 719 |
return f"Session ended. You spent {stats['total_spent']:.2f} tokens this session. Balance: {stats['balance']:.2f}"
|
| 720 |
return "No active session found."
|
| 721 |
|
| 722 |
+
backend = BackendProcessor()
|
| 723 |
import math
|
| 724 |
token_manager = TokenManager()
|
| 725 |
|
|
|
|
| 846 |
return {"type": "Q4_K_M", **QUANT_OPTIMIZATIONS["Q4_K_M"]}
|
| 847 |
|
| 848 |
def preprocess_input(self, text: str):
|
| 849 |
+
"""Pre-process keyboard input with backend support"""
|
| 850 |
+
if not text or len(text) < 5:
|
| 851 |
+
return
|
| 852 |
+
|
| 853 |
+
# Send to backend for async tokenization
|
| 854 |
+
backend.tokenize_async(text)
|
| 855 |
+
|
| 856 |
+
# Also do local preprocessing if model loaded
|
| 857 |
+
if not self.llm:
|
| 858 |
return
|
| 859 |
|
| 860 |
def _preprocess():
|
| 861 |
try:
|
| 862 |
+
logger.info(f"[PREPROCESS] Tokenizing {len(text)} chars locally...")
|
| 863 |
tokens = self.llm.tokenize(text.encode("utf-8"))
|
| 864 |
self.preprocessed_tokens = tokens
|
| 865 |
logger.info(f"[PREPROCESS] Ready: {len(tokens)} tokens cached")
|
|
|
|
| 867 |
logger.error(f"[PREPROCESS] Failed: {e}")
|
| 868 |
self.preprocessed_tokens = None
|
| 869 |
|
|
|
|
| 870 |
if self.typing_timer:
|
| 871 |
self.typing_timer.cancel()
|
| 872 |
|
|
|
|
| 873 |
self.typing_timer = threading.Timer(1.0, _preprocess)
|
| 874 |
self.typing_timer.daemon = True
|
| 875 |
self.typing_timer.start()
|
| 876 |
+
|
| 877 |
def clear_preprocessed(self):
|
| 878 |
"""Clear preprocessed tokens and force GC"""
|
| 879 |
if self.preprocessed_tokens:
|
|
|
|
| 1199 |
time.sleep(0.5) # Brief pause for user to see the message
|
| 1200 |
|
| 1201 |
# Check prompt cache for exact matches (instant response)
|
| 1202 |
+
cached_response = backend.get_cached_response(full_input)
|
| 1203 |
+
if cached_response:
|
| 1204 |
+
logger.info("⚡ BACKEND CACHE HIT - Instant response!")
|
|
|
|
| 1205 |
history.append({"role": "user", "content": prompt})
|
| 1206 |
+
history.append({"role": "assistant", "content": cached_response})
|
| 1207 |
yield history
|
| 1208 |
return
|
| 1209 |
|
|
|
|
| 1293 |
|
| 1294 |
# Aggressive GC after generation
|
| 1295 |
force_gc()
|
| 1296 |
+
# Cache this response in backend for future use
|
| 1297 |
+
backend.cache_response(full_input, response_text)
|
| 1298 |
+
|
| 1299 |
+
# Send token charge to backend (async)
|
| 1300 |
+
if username:
|
| 1301 |
+
backend.charge_tokens_async(username, elapsed * 1000)
|
| 1302 |
|
| 1303 |
logger.info(f"✅ Generation complete: {tokens_count} tokens @ {tps:.1f} t/s (TTFT: {first_token_time*1000:.0f}ms)")
|
| 1304 |
|