Spaces:
Running
Running
| import gradio as gr | |
| import json | |
| import time | |
| import hashlib | |
| import logging | |
| import datetime | |
| import pytz | |
| import psutil | |
| import threading | |
| import gc | |
| from typing import Dict, Optional | |
| from functools import lru_cache | |
| import concurrent.futures | |
| import os | |
# Backend logging: one root configuration tagged "BACKEND".
# force=True replaces any handlers that were installed on the root logger
# before this module was imported.
import warnings

logging.basicConfig(
    format='%(asctime)s - BACKEND - %(message)s',
    level=logging.INFO,
    force=True,
)
logger = logging.getLogger(__name__)

# Hide RuntimeWarnings whose message mentions asyncio (seen during shutdown).
warnings.filterwarnings(
    action="ignore",
    message=".*asyncio.*",
    category=RuntimeWarning,
)
| # ============================================================================ | |
| # ZEROENGINE-BACKEND: Background Processing Service - SPEED OPTIMIZED | |
| # ============================================================================ | |
| # This space handles: | |
| # - Tokenization pre-processing | |
| # - Prompt caching | |
| # - Token accounting calculations | |
| # - Response caching | |
| # ============================================================================ | |
# ---- Cache capacity limits (entries), sized for the 16GB usable budget ----
MAX_PROMPT_CACHE_SIZE = 50_000
MAX_RESPONSE_CACHE_SIZE = 10_000
# NOTE(review): no code visible in this file enforces this limit - confirm.
MAX_TOKEN_LEDGER_SIZE = 10_000

# ---- Hard-coded Hugging Face Space RAM figures (same as main app) ----
TOTAL_RAM_GB = 18.0   # whole container
USABLE_RAM_GB = 16.0  # budget for this backend; the remaining 2GB is reserved

# ---- Shared in-memory state, mutated by the request handlers ----
prompt_cache = dict()    # key -> {"cached_at": ..., plus "tokens"/"text" or "value"}
response_cache = dict()  # prompt hash -> {"response": ..., "cached_at": ...}
token_ledger = dict()    # username -> per-user accounting record
backend_start_time = time.time()

# ---- Request counters reported by the stats endpoint ----
performance_stats = dict(
    total_requests=0,
    cache_hits=0,
    cache_misses=0,
    avg_response_time=0.0,
    memory_usage_mb=0.0,
)

# Loop flag polled by the background cleanup thread.
cleanup_thread_running = True
def background_cleanup():
    """Periodically expire stale cache entries; runs until the stop flag clears.

    Every 5 minutes the loop removes prompt-cache entries older than 1 hour
    (at most 100 per pass) and response-cache entries older than 2 hours
    (at most 50 per pass), then forces a garbage collection. Any exception in
    a pass is logged and the loop continues with the next pass.
    """
    sweep_interval_s = 300
    while cleanup_thread_running:
        try:
            # Clean up old entries every 5 minutes
            time.sleep(sweep_interval_s)
            current_time = time.time()
            removed_counts = []
            for cache, max_age_s, batch_limit in (
                (prompt_cache, 3600, 100),   # prompts: older than 1 hour
                (response_cache, 7200, 50),  # responses: older than 2 hours
            ):
                # Snapshot the items first: request handlers insert and evict
                # entries concurrently, and iterating the live dict would
                # raise "dictionary changed size during iteration".
                expired_keys = [
                    key for key, data in list(cache.items())
                    if current_time - data.get("cached_at", 0) > max_age_s
                ]
                removed = 0
                for key in expired_keys[:batch_limit]:  # Limit cleanup batch size
                    # pop() tolerates a key another thread already evicted.
                    if cache.pop(key, None) is not None:
                        removed += 1
                removed_counts.append(removed)
            # Force garbage collection
            gc.collect()
            # Report what was actually removed, not how many expired keys were found.
            logger.info(f"[CLEANUP] Removed {removed_counts[0]} old prompts, {removed_counts[1]} old responses")
        except Exception as e:
            logger.error(f"[CLEANUP] Background cleanup error: {e}")
# Launch the cache sweeper as a daemon thread so it never blocks interpreter exit.
cleanup_thread = threading.Thread(daemon=True, target=background_cleanup)
cleanup_thread.start()
logger.info("[INIT] Background cleanup thread started")

# Report the fixed RAM figures this service assumes.
ram_summary = "[RAM] HARD-CODED: Total: {:.1f}GB, Usable: {:.1f}GB (Hugging Face Space)".format(
    TOTAL_RAM_GB, USABLE_RAM_GB
)
logger.info(ram_summary)
logger.info("[RAM] (Ignoring host system memory - using container limits)")
def fast_hash(text: str) -> str:
    """Return the 32-character hexadecimal MD5 digest of *text* (UTF-8 encoded).

    The digest is used only as a cache key, not for security. No result
    caching is performed; the hash is recomputed on every call.

    Lone surrogate code points (which JSON input can produce) are encoded
    with ``surrogatepass`` instead of raising ``UnicodeEncodeError``; the
    digest of any well-formed string is unchanged.
    """
    return hashlib.md5(text.encode("utf-8", errors="surrogatepass")).hexdigest()
def get_memory_usage() -> float:
    """Return the resident set size of the current process in MB.

    Best-effort: returns 0.0 when the measurement fails for any ordinary
    reason. KeyboardInterrupt and SystemExit are not swallowed.
    """
    try:
        return psutil.Process().memory_info().rss / 1024 / 1024
    except Exception:
        return 0.0
def tokenize_text(text: str) -> str:
    """Estimate the token count of *text* and return the outcome as JSON.

    The estimate is ``words + chars // 4 + 2 * (words longer than 8 chars)``.
    Results are memoised in ``prompt_cache`` under the first 16 hex characters
    of the text's MD5 digest. ``prompt_cache`` is shared with ``cache_prompt``,
    whose entries carry ``"value"`` instead of ``"tokens"``; such entries are
    never treated as a tokenizer hit and are never overwritten here.

    Returns:
        A JSON string. On success it contains ``success``, ``estimated_tokens``,
        timing and size fields, and ``cache_hit``. On failure it contains
        ``success: false`` with ``error`` and ``error_type``.
    """
    start_time = time.time()
    # Update performance stats
    performance_stats["total_requests"] += 1
    try:
        # Check cache first for instant response
        text_hash = fast_hash(text)[:16]
        cached_result = prompt_cache.get(text_hash)
        # Only entries written by this function carry "tokens".
        cache_hit = cached_result is not None and "tokens" in cached_result

        words = text.split()
        word_count = len(words)
        char_count = len(text)

        if cache_hit:
            performance_stats["cache_hits"] += 1
            estimated_tokens = cached_result["tokens"]
        else:
            performance_stats["cache_misses"] += 1
            long_word_count = sum(1 for w in words if len(w) > 8)
            estimated_tokens = word_count + (char_count // 4) + (long_word_count * 2)

        processing_time = time.time() - start_time
        result = {
            "success": True,
            "estimated_tokens": estimated_tokens,
            "processing_time_ms": round(processing_time * 1000, 2),
            "text_length": char_count,
            "word_count": word_count,
            "char_count": char_count,
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"{text}{time.time()}".encode()).hexdigest()[:8],
            "cache_hit": cache_hit
        }

        if cache_hit:
            logger.info(f"[TOKENIZE] β‘ CACHE HIT: {estimated_tokens} tokens in {processing_time*1000:.1f}ms")
            return json.dumps(result, indent=2)

        # Cache the result for future requests, unless the slot is occupied by
        # an entry that belongs to cache_prompt().
        if cached_result is None:
            prompt_cache[text_hash] = {
                "text": (text[:100] + "...") if len(text) > 100 else text,
                "tokens": estimated_tokens,
                "cached_at": time.time()
            }
            # Limit cache size by evicting the entry with the oldest timestamp.
            if len(prompt_cache) > MAX_PROMPT_CACHE_SIZE:
                # Work on a snapshot: the cleanup thread may delete entries
                # while we search, and pop() tolerates a vanished key.
                snapshot = list(prompt_cache.items())
                oldest_key = min(snapshot, key=lambda item: item[1]["cached_at"])[0]
                prompt_cache.pop(oldest_key, None)

        logger.info(f"[TOKENIZE] β CALCULATED: {estimated_tokens} tokens in {processing_time*1000:.1f}ms")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[TOKENIZE] β Failed after {processing_time*1000:.1f}ms: {e}")
        return json.dumps({
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
def cache_prompt(key: str, value: str) -> str:
    """Store *value* in the prompt cache under *key* and return a JSON receipt.

    Keys longer than 32 characters are replaced by their MD5 hex digest; the
    key actually used is reported back as ``"key"``. When the cache grows past
    ``MAX_PROMPT_CACHE_SIZE`` the 1000 entries with the oldest timestamps are
    evicted in one batch.

    Returns:
        A JSON string with ``success``, the effective ``key``, sizes and
        timing, or ``success: false`` with ``error``/``error_type`` on failure.
    """
    start_time = time.time()
    try:
        # Use fast hash for key
        cache_key = fast_hash(key) if len(key) > 32 else key
        prompt_cache[cache_key] = {
            "value": value,
            "cached_at": time.time()
        }
        # Limit cache size with batch eviction
        if len(prompt_cache) > MAX_PROMPT_CACHE_SIZE:
            # Sort a snapshot rather than the live dict: the cleanup thread
            # may remove entries concurrently, which would otherwise raise
            # KeyError in the sort key or in the deletion below.
            snapshot = list(prompt_cache.items())
            snapshot.sort(key=lambda item: item[1]["cached_at"])
            for old_key, _ in snapshot[:1000]:
                prompt_cache.pop(old_key, None)
        processing_time = time.time() - start_time
        result = {
            "success": True,
            "key": cache_key,
            "value_length": len(value),
            "cache_size": len(prompt_cache),
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"{cache_key}{time.time()}".encode()).hexdigest()[:8]
        }
        logger.info(f"[CACHE-PROMPT] β‘ Stored: {len(value)} chars in {processing_time*1000:.1f}ms")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[CACHE-PROMPT] β Failed after {processing_time*1000:.1f}ms: {e}")
        return json.dumps({
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
def get_cached_prompt(key: str) -> str:
    """Look up a prompt stored by ``cache_prompt`` and return the result as JSON.

    The key is normalised the same way as on storage (keys longer than 32
    characters are replaced by their MD5 hex digest). ``prompt_cache`` also
    holds tokenizer entries that have no ``"value"`` field; those are reported
    as "not found" rather than raising.

    Returns:
        A JSON string with ``found``/``cache_hit`` flags and, on a hit, the
        stored ``value``; or ``success: false`` with ``error``/``error_type``
        on failure.
    """
    start_time = time.time()
    try:
        # Use fast hash for key
        cache_key = fast_hash(key) if len(key) > 32 else key
        cached_value = prompt_cache.get(cache_key)
        processing_time = time.time() - start_time
        # Entries written by the tokenizer share this dict but carry no "value".
        if cached_value is not None and "value" in cached_value:
            stored_value = cached_value["value"]
            result = {
                "success": True,
                "found": True,
                "key": cache_key,
                "value": stored_value,
                "value_length": len(stored_value),
                "cache_size": len(prompt_cache),
                "processing_time_ms": round(processing_time * 1000, 2),
                "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
                "request_id": hashlib.md5(f"{cache_key}{time.time()}".encode()).hexdigest()[:8],
                "cache_hit": True
            }
            logger.info(f"[GET-PROMPT] β‘ HIT: {len(stored_value)} chars in {processing_time*1000:.1f}ms")
        else:
            result = {
                "success": True,
                "found": False,
                "key": cache_key,
                "value": None,
                "cache_size": len(prompt_cache),
                "processing_time_ms": round(processing_time * 1000, 2),
                "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
                "request_id": hashlib.md5(f"{cache_key}{time.time()}".encode()).hexdigest()[:8],
                "cache_hit": False
            }
            logger.info(f"[GET-PROMPT] β οΈ MISS: {cache_key} in {processing_time*1000:.1f}ms")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[GET-PROMPT] β Failed after {processing_time*1000:.1f}ms: {e}")
        return json.dumps({
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
def cache_response(prompt_hash: str, response: str) -> str:
    """Store *response* in the response cache under *prompt_hash*; return JSON.

    When the cache grows past ``MAX_RESPONSE_CACHE_SIZE`` the 500 entries with
    the oldest timestamps are evicted in one batch.

    Returns:
        A JSON string with ``success``, ``cached_hash``, sizes and timing, or
        ``success: false`` with ``error``/``error_type`` on failure.
    """
    start_time = time.time()
    try:
        response_cache[prompt_hash] = {
            "response": response,
            "cached_at": time.time()
        }
        # Limit cache size with batch eviction
        if len(response_cache) > MAX_RESPONSE_CACHE_SIZE:
            # Sort a snapshot rather than the live dict: the cleanup thread
            # may remove entries concurrently, which would otherwise raise
            # KeyError in the sort key or in the deletion below.
            snapshot = list(response_cache.items())
            snapshot.sort(key=lambda item: item[1]["cached_at"])
            for old_key, _ in snapshot[:500]:
                response_cache.pop(old_key, None)
        processing_time = time.time() - start_time
        result = {
            "success": True,
            "cached_hash": prompt_hash,
            "response_length": len(response),
            "cache_size": len(response_cache),
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8]
        }
        logger.info(f"[CACHE-RESPONSE] β‘ Stored: {len(response)} chars in {processing_time*1000:.1f}ms")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[CACHE-RESPONSE] β Failed after {processing_time*1000:.1f}ms: {e}")
        return json.dumps({
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
def get_cached_response(prompt_hash: str) -> str:
    """Fetch the response cached under *prompt_hash* and describe it as JSON.

    Returns:
        A JSON string. A hit carries the response text, its length, its age
        in seconds and the UTC time it was cached; a miss carries
        ``found: false`` and ``response: null``. Any exception is reported as
        ``success: false`` with ``error`` and ``error_type``.
    """
    started = time.time()
    try:
        entry = response_cache.get(prompt_hash)
        elapsed = time.time() - started
        elapsed_ms = round(elapsed * 1000, 2)

        # Miss: report and leave early.
        if entry is None:
            payload = {
                "success": True,
                "found": False,
                "hash": prompt_hash,
                "response": None,
                "cache_size": len(response_cache),
                "processing_time_ms": elapsed_ms,
                "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
                "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8],
                "cache_hit": False
            }
            logger.info(f"[GET-RESPONSE] β οΈ MISS: {prompt_hash} in {elapsed*1000:.1f}ms")
            return json.dumps(payload, indent=2)

        # Hit: include the body plus how long it has been cached.
        body = entry["response"]
        stored_at = entry["cached_at"]
        age = round(time.time() - stored_at, 2)
        payload = {
            "success": True,
            "found": True,
            "hash": prompt_hash,
            "response": body,
            "response_length": len(body),
            "age_seconds": age,
            "cache_size": len(response_cache),
            "processing_time_ms": elapsed_ms,
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8],
            "cache_hit": True,
            "cached_at": datetime.datetime.fromtimestamp(stored_at, pytz.UTC).isoformat()
        }
        logger.info(f"[GET-RESPONSE] β‘ HIT: {len(body)} chars in {elapsed*1000:.1f}ms")
        return json.dumps(payload, indent=2)
    except Exception as exc:
        elapsed = time.time() - started
        logger.error(f"[GET-RESPONSE] β Failed after {elapsed*1000:.1f}ms: {exc}")
        failure = {
            "success": False,
            "error": str(exc),
            "error_type": type(exc).__name__,
            "processing_time_ms": round(elapsed * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }
        return json.dumps(failure, indent=2)
def calculate_token_cost(username: str, duration_ms: float) -> str:
    """Charge *username* for *duration_ms* of work and return the ledger state.

    The cost is 0.001 tokens per 100 ms. The per-user record in
    ``token_ledger`` (created on first use) accumulates total cost, total
    duration and request count.

    Args:
        username: Ledger key for the user being charged.
        duration_ms: Non-negative, finite duration in milliseconds.

    Returns:
        A JSON string with the request cost and the user's running totals and
        averages. Invalid input (a negative, NaN or infinite duration, or
        arguments of the wrong type) and any other failure are reported as
        ``success: false`` with ``error``/``error_type``; the ledger is not
        modified in that case.
    """
    import math

    start_time = time.time()
    try:
        # Diagnostics live inside the try so malformed arguments (e.g. a None
        # username) produce a JSON error instead of an unhandled exception.
        logger.info(f"[TOKEN-COST] ===== TOKEN COST REQUEST START =====")
        logger.info(f"[TOKEN-COST] Username: '{username}'")
        logger.info(f"[TOKEN-COST] Username length: {len(username)} characters")
        logger.info(f"[TOKEN-COST] Duration: {duration_ms}ms")
        logger.info(f"[TOKEN-COST] Current users tracked: {len(token_ledger)}")
        logger.info(f"[TOKEN-COST] User ledger keys: {list(token_ledger.keys())[:10]}{'...' if len(token_ledger) > 10 else ''}")
        if username in token_ledger:
            user_data = token_ledger[username]
            logger.info(f"[TOKEN-COST] Existing user data found:")
            logger.info(f"[TOKEN-COST] - Total cost: {user_data['total_cost']} tokens")
            logger.info(f"[TOKEN-COST] - Total duration: {user_data['total_duration_ms']}ms")
            logger.info(f"[TOKEN-COST] - Previous requests: {user_data['requests']}")
        else:
            logger.info(f"[TOKEN-COST] New user - creating ledger entry")

        # A negative duration would credit tokens back, and NaN/inf would
        # permanently poison the running totals (and emit invalid JSON).
        # math.isfinite raises TypeError itself for non-numeric input.
        if not math.isfinite(duration_ms) or duration_ms < 0:
            raise ValueError(f"duration_ms must be a non-negative finite number, got {duration_ms!r}")

        # Time only the calculation itself, as before.
        start_time = time.time()
        cost = (duration_ms / 100.0) * 0.001  # 0.001 tokens per 100ms
        processing_time = time.time() - start_time
        # Track in ledger (for analytics)
        if username not in token_ledger:
            token_ledger[username] = {
                "total_cost": 0.0,
                "total_duration_ms": 0.0,
                "requests": 0,
                "first_seen": time.time(),
                "last_seen": time.time()
            }
        # Update user data
        user_data = token_ledger[username]
        user_data["total_cost"] += cost
        user_data["total_duration_ms"] += duration_ms
        user_data["requests"] += 1
        user_data["last_seen"] = time.time()
        avg_cost_per_request = user_data["total_cost"] / user_data["requests"]
        avg_duration_per_request = user_data["total_duration_ms"] / user_data["requests"]
        account_age_seconds = round(time.time() - user_data["first_seen"], 2)
        result = {
            "success": True,
            "username": username,
            "duration_ms": duration_ms,
            "cost": round(cost, 6),
            "total_cost": round(user_data["total_cost"], 4),
            "total_requests": user_data["requests"],
            "total_duration_ms": round(user_data["total_duration_ms"], 2),
            "avg_cost_per_request": round(avg_cost_per_request, 6),
            "avg_duration_per_request": round(avg_duration_per_request, 2),
            "account_age_seconds": account_age_seconds,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"{username}{duration_ms}{time.time()}".encode()).hexdigest()[:8]
        }
        logger.info(f"[TOKEN-COST] β Token cost calculated successfully")
        logger.info(f"[TOKEN-COST] Request cost: {cost} tokens")
        logger.info(f"[TOKEN-COST] User total cost: {user_data['total_cost']} tokens")
        logger.info(f"[TOKEN-COST] User total requests: {user_data['requests']}")
        logger.info(f"[TOKEN-COST] User avg cost per request: {avg_cost_per_request} tokens")
        logger.info(f"[TOKEN-COST] User avg duration per request: {avg_duration_per_request}ms")
        logger.info(f"[TOKEN-COST] User account age: {account_age_seconds} seconds")
        logger.info(f"[TOKEN-COST] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
        logger.info(f"[TOKEN-COST] Request ID: {result['request_id']}")
        logger.info(f"[TOKEN-COST] ===== TOKEN COST REQUEST END =====")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[TOKEN-COST] β Token cost calculation failed after {processing_time:.4f}s: {e}")
        logger.error(f"[TOKEN-COST] Error type: {type(e).__name__}")
        logger.error(f"[TOKEN-COST] Error details: {str(e)}")
        logger.error(f"[TOKEN-COST] Username that caused error: '{username}'")
        logger.error(f"[TOKEN-COST] Duration that caused error: {duration_ms}ms")
        logger.error(f"[TOKEN-COST] ===== TOKEN COST REQUEST END (ERROR) =====")
        return json.dumps({
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
def get_cache_stats() -> str:
    """Summarise cache, ledger, request-counter and RAM figures as JSON.

    The ``*_memory_bytes`` fields are character counts of the cached payloads
    (``len(str(entry))`` for prompts, ``len(response)`` for responses), not
    measured memory. RAM percentages are computed against the hard-coded
    ``USABLE_RAM_GB`` budget using the process RSS from ``get_memory_usage``.

    Returns:
        A JSON string with the statistics, or ``success: false`` with
        ``error``/``error_type`` on failure.
    """
    start_time = time.time()
    try:
        # Snapshot the shared dicts once: request handlers and the cleanup
        # thread mutate them concurrently, and iterating the live views can
        # raise "dictionary changed size during iteration".
        prompt_entries = list(prompt_cache.values())
        response_entries = list(response_cache.values())
        ledger_entries = list(token_ledger.values())
        user_count = len(ledger_entries)

        # Calculate detailed statistics
        total_prompt_memory = sum(len(str(v)) for v in prompt_entries)
        total_response_memory = sum(len(v['response']) for v in response_entries)
        total_requests = sum(u['requests'] for u in ledger_entries)
        total_tokens = sum(u['total_cost'] for u in ledger_entries)
        total_duration = sum(u['total_duration_ms'] for u in ledger_entries)
        # User statistics
        now = time.time()
        active_users = sum(
            1 for u in ledger_entries
            if now - u.get('last_seen', u.get('first_seen', 0)) < 3600
        )
        avg_requests_per_user = total_requests / user_count if user_count > 0 else 0
        avg_tokens_per_user = total_tokens / user_count if user_count > 0 else 0
        # Performance metrics
        tracked_requests = performance_stats["total_requests"]
        cache_hit_rate = (performance_stats["cache_hits"] / tracked_requests * 100) if tracked_requests > 0 else 0
        memory_usage_mb = get_memory_usage()
        uptime_seconds = round(time.time() - backend_start_time, 2)
        # HARD-CODED: Use Hugging Face Space RAM limits
        total_ram_mb = TOTAL_RAM_GB * 1024
        usable_ram_mb = USABLE_RAM_GB * 1024
        used_ram_mb = memory_usage_mb
        available_ram_mb = usable_ram_mb - used_ram_mb
        ram_usage_pct = (used_ram_mb / usable_ram_mb) * 100
        processing_time = time.time() - start_time
        result = {
            "success": True,
            "prompt_cache_size": len(prompt_entries),
            "response_cache_size": len(response_entries),
            "users_tracked": user_count,
            "active_users_last_hour": active_users,
            "total_requests": total_requests,
            "total_tokens_spent": round(total_tokens, 4),
            "total_duration_ms": round(total_duration, 2),
            "avg_requests_per_user": round(avg_requests_per_user, 2),
            "avg_tokens_per_user": round(avg_tokens_per_user, 4),
            "prompt_cache_memory_bytes": total_prompt_memory,
            "response_cache_memory_bytes": total_response_memory,
            "total_cache_memory_bytes": total_prompt_memory + total_response_memory,
            # PERFORMANCE METRICS
            # Report a copy carrying the memory figure just measured; the
            # stored "memory_usage_mb" counter is otherwise left at its
            # initial 0.0 and would contradict the top-level field below.
            "performance_stats": {**performance_stats, "memory_usage_mb": round(memory_usage_mb, 2)},
            "cache_hit_rate_pct": round(cache_hit_rate, 2),
            "memory_usage_mb": round(memory_usage_mb, 2),
            "uptime_seconds": uptime_seconds,
            "requests_per_second": round(total_requests / uptime_seconds, 2) if uptime_seconds > 0 else 0,
            # HARD-CODED RAM INFO
            "ram_info": {
                "total_ram_gb": TOTAL_RAM_GB,
                "usable_ram_gb": USABLE_RAM_GB,
                "used_ram_mb": round(used_ram_mb, 2),
                "available_ram_mb": round(available_ram_mb, 2),
                "total_ram_mb": total_ram_mb,
                "ram_usage_pct": round(ram_usage_pct, 2),
                "hardcoded": True
            },
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"stats{time.time()}".encode()).hexdigest()[:8]
        }
        logger.info(f"[CACHE-STATS] β‘ Retrieved in {processing_time*1000:.1f}ms - {cache_hit_rate:.1f}% hit rate | RAM: {used_ram_mb:.1f}/{usable_ram_mb:.1f}MB ({ram_usage_pct:.1f}%)")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[CACHE-STATS] β Failed after {processing_time*1000:.1f}ms: {e}")
        return json.dumps({
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
def get_backend_health() -> str:
    """Report backend health ("healthy" or "degraded") with supporting counts.

    The status becomes "degraded" when any of these exceeds 80% of its
    configured budget:

    * total cached entries vs ``MAX_PROMPT_CACHE_SIZE + MAX_RESPONSE_CACHE_SIZE``
    * tracked users vs ``MAX_TOKEN_LEDGER_SIZE``
    * cached payload size (a character count) vs ``USABLE_RAM_GB``

    Returns:
        A JSON string with ``status``, ``issues`` and the counters, or
        ``success: false`` / ``status: "error"`` on failure.
    """
    start_time = time.time()
    try:
        # Snapshot the shared dicts: other threads mutate them concurrently,
        # and iterating the live views can raise RuntimeError.
        prompt_entries = list(prompt_cache.values())
        response_entries = list(response_cache.values())
        ledger_entries = list(token_ledger.values())
        prompt_count = len(prompt_entries)
        response_count = len(response_entries)
        user_count = len(ledger_entries)
        total_requests = sum(u['requests'] for u in ledger_entries)

        # Logged inside the try so a failure here yields a JSON error.
        logger.info(f"[BACKEND-HEALTH] Checking backend health status...")
        logger.info(f"[BACKEND-HEALTH] Current prompt cache size: {prompt_count} entries")
        logger.info(f"[BACKEND-HEALTH] Current response cache size: {response_count} entries")
        logger.info(f"[BACKEND-HEALTH] Current users tracked: {user_count}")
        logger.info(f"[BACKEND-HEALTH] Total requests processed: {total_requests}")

        # Calculate health metrics
        total_cache_size = prompt_count + response_count
        total_memory_usage = sum(len(str(v)) for v in prompt_entries) + sum(len(v['response']) for v in response_entries)

        # Thresholds scale with the configured limits. Fixed values far below
        # the cache capacities would flag a lightly used service as degraded.
        # NOTE(review): the 80% fraction is a judgement call - tune as needed.
        degraded_fraction = 0.8
        cache_entry_threshold = degraded_fraction * (MAX_PROMPT_CACHE_SIZE + MAX_RESPONSE_CACHE_SIZE)
        user_threshold = degraded_fraction * MAX_TOKEN_LEDGER_SIZE
        memory_threshold = degraded_fraction * USABLE_RAM_GB * 1024 ** 3

        # Determine health status
        health_status = "healthy"
        issues = []
        if total_cache_size > cache_entry_threshold:
            health_status = "degraded"
            issues.append("High cache usage")
        if user_count > user_threshold:
            health_status = "degraded"
            issues.append("High user count")
        if total_memory_usage > memory_threshold:
            health_status = "degraded"
            issues.append("High memory usage")
        processing_time = time.time() - start_time
        result = {
            "success": True,
            "status": health_status,
            "issues": issues,
            "prompt_cache_size": prompt_count,
            "response_cache_size": response_count,
            "total_cache_size": total_cache_size,
            "users_tracked": user_count,
            "total_requests": total_requests,
            "total_memory_usage_bytes": total_memory_usage,
            "uptime_seconds": round(time.time() - backend_start_time, 2) if 'backend_start_time' in globals() else 0,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
            "request_id": hashlib.md5(f"health{time.time()}".encode()).hexdigest()[:8]
        }
        logger.info(f"[BACKEND-HEALTH] β Backend health check completed successfully")
        logger.info(f"[BACKEND-HEALTH] Health status: {health_status}")
        if issues:
            logger.warning(f"[BACKEND-HEALTH] Issues detected: {', '.join(issues)}")
        logger.info(f"[BACKEND-HEALTH] Total cache size: {total_cache_size} entries")
        logger.info(f"[BACKEND-HEALTH] Users tracked: {user_count}")
        logger.info(f"[BACKEND-HEALTH] Total requests: {total_requests}")
        logger.info(f"[BACKEND-HEALTH] Memory usage: {total_memory_usage} bytes")
        logger.info(f"[BACKEND-HEALTH] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
        logger.info(f"[BACKEND-HEALTH] Request ID: {result['request_id']}")
        logger.info(f"[BACKEND-HEALTH] ===== BACKEND HEALTH REQUEST END =====")
        return json.dumps(result, indent=2)
    except Exception as e:
        processing_time = time.time() - start_time
        logger.error(f"[BACKEND-HEALTH] β Backend health check failed after {processing_time:.4f}s: {e}")
        logger.error(f"[BACKEND-HEALTH] Error type: {type(e).__name__}")
        logger.error(f"[BACKEND-HEALTH] Error details: {str(e)}")
        logger.error(f"[BACKEND-HEALTH] ===== BACKEND HEALTH REQUEST END (ERROR) =====")
        return json.dumps({
            "success": False,
            "status": "error",
            "error": str(e),
            "error_type": type(e).__name__,
            "processing_time_ms": round(processing_time * 1000, 2),
            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
        }, indent=2)
| # ============================================================================ | |
| # GRADIO INTERFACE | |
| # ============================================================================ | |
if __name__ == "__main__":
    # Script entry point: install shutdown hooks, build the Gradio UI that
    # exposes each backend function on its own tab, then serve on port 7860.
    import atexit
    import signal
    import sys
    import traceback

    _cleanup_done = False

    def cleanup_on_exit():
        """Clear all in-memory state on shutdown; safe to call more than once.

        It is reached both from the signal handler and from the atexit hook
        (sys.exit() triggers atexit), so a guard keeps it from running twice.
        """
        global cleanup_thread_running, _cleanup_done
        if _cleanup_done:
            return
        _cleanup_done = True
        # Let the background sweeper leave its loop after its current sleep.
        cleanup_thread_running = False
        logger.info("[CLEANUP] Backend shutting down...")
        # Clear caches
        logger.info(f"[CLEANUP] Clearing {len(prompt_cache)} prompt cache entries")
        logger.info(f"[CLEANUP] Clearing {len(response_cache)} response cache entries")
        logger.info(f"[CLEANUP] Clearing {len(token_ledger)} user token records")
        prompt_cache.clear()
        response_cache.clear()
        token_ledger.clear()
        logger.info("[CLEANUP] Backend shutdown complete")

    # Register cleanup functions
    atexit.register(cleanup_on_exit)

    def signal_handler(signum, frame):
        """Handle SIGTERM/SIGINT: run cleanup, then exit with status 0."""
        logger.info(f"[CLEANUP] Received signal {signum}")
        cleanup_on_exit()
        sys.exit(0)

    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    logger.info("[INIT] ===== BACKEND APPLICATION STARTUP =====")
    logger.info(f"[INIT] ZeroEngine-Backend starting up...")
    logger.info(f"[INIT] Backend start time: {datetime.datetime.fromtimestamp(backend_start_time, pytz.UTC).isoformat()}")
    logger.info(f"[INIT] Python version: {sys.version}")
    logger.info(f"[INIT] Gradio version: {gr.__version__}")
    logger.info(f"[INIT] Cache sizes - Prompt: {len(prompt_cache)}, Response: {len(response_cache)}")
    logger.info(f"[INIT] Users tracked: {len(token_ledger)}")
    logger.info(f"[INIT] Server will launch on port 7860")
    logger.info(f"[INIT] ===== BACKEND APPLICATION STARTUP END =====")
    logger.info("[INIT] Creating Gradio interface...")
    try:
        # NOTE(review): the nesting of rows/columns below is reconstructed;
        # confirm the layout against the deployed UI.
        with gr.Blocks(title="ZeroEngine-Backend") as demo:
            logger.info("[INIT] Gradio Blocks created successfully")
            # Apply theme after Blocks creation for Gradio 6.5.0 compatibility
            if hasattr(demo, 'theme'):
                logger.info("[INIT] Applying theme...")
                demo.theme = gr.themes.Monochrome()
                logger.info("[INIT] Theme applied successfully")
            else:
                logger.warning("[INIT] Theme attribute not found, skipping theme application")
            logger.info("[INIT] Creating HTML header...")
            gr.HTML("""
            <div style='text-align: center; padding: 20px;'>
            <h1>π§ ZeroEngine-Backend</h1>
            <p style='color: #888;'>Background Processing Service for ZeroEngine</p>
            </div>
            """)
            logger.info("[INIT] HTML header created")
            logger.info("[INIT] Creating tabs...")

            with gr.Tab("π’ Tokenize"):
                logger.info("[INIT] Tokenize tab created")
                gr.Markdown("### Fast Tokenization Pre-Processing")
                with gr.Row():
                    with gr.Column():
                        tokenize_input = gr.Textbox(
                            label="Text to Tokenize",
                            placeholder="Enter text here...",
                            lines=5
                        )
                        tokenize_btn = gr.Button("Tokenize", variant="primary")
                    with gr.Column():
                        tokenize_output = gr.Code(label="Result (JSON)", language="json")
                tokenize_btn.click(tokenize_text, [tokenize_input], [tokenize_output], api_name="/predict")
                logger.info("[INIT] Tokenize tab components configured")

            with gr.Tab("πΎ Prompt Cache"):
                logger.info("[INIT] Prompt Cache tab created")
                gr.Markdown("### Store and Retrieve Prompts")
                with gr.Row():
                    with gr.Column():
                        cache_key_input = gr.Textbox(label="Cache Key")
                        cache_value_input = gr.Textbox(label="Value to Cache", lines=3)
                        cache_store_btn = gr.Button("Store", variant="primary")
                        cache_store_output = gr.Code(label="Result", language="json")
                    with gr.Column():
                        cache_get_input = gr.Textbox(label="Key to Retrieve")
                        cache_get_btn = gr.Button("Retrieve", variant="secondary")
                        cache_get_output = gr.Code(label="Result", language="json")
                cache_store_btn.click(cache_prompt, [cache_key_input, cache_value_input], [cache_store_output], api_name="/predict_2")
                cache_get_btn.click(get_cached_prompt, [cache_get_input], [cache_get_output], api_name="/predict_3")
                logger.info("[INIT] Prompt Cache tab components configured")

            with gr.Tab("β‘ Response Cache"):
                logger.info("[INIT] Response Cache tab created")
                gr.Markdown("### Cache Complete Responses")
                with gr.Row():
                    with gr.Column():
                        resp_hash_input = gr.Textbox(label="Prompt Hash")
                        resp_value_input = gr.Textbox(label="Response to Cache", lines=5)
                        resp_cache_btn = gr.Button("Cache Response", variant="primary")
                        resp_cache_output = gr.Code(label="Result", language="json")
                    with gr.Column():
                        resp_get_input = gr.Textbox(label="Hash to Retrieve")
                        resp_get_btn = gr.Button("Get Response", variant="secondary")
                        resp_get_output = gr.Code(label="Result", language="json")
                resp_cache_btn.click(cache_response, [resp_hash_input, resp_value_input], [resp_cache_output], api_name="/predict_4")
                resp_get_btn.click(get_cached_response, [resp_get_input], [resp_get_output], api_name="/predict_5")
                logger.info("[INIT] Response Cache tab components configured")

            with gr.Tab("π° Token Accounting"):
                logger.info("[INIT] Token Accounting tab created")
                gr.Markdown("### Calculate Token Costs")
                with gr.Row():
                    username_input = gr.Textbox(label="Username", value="turtle170")
                    duration_input = gr.Number(label="Duration (ms)", value=5000)
                calc_btn = gr.Button("Calculate Cost", variant="primary")
                calc_output = gr.Code(label="Result (JSON)", language="json")
                calc_btn.click(calculate_token_cost, [username_input, duration_input], [calc_output], api_name="/predict_6")
                logger.info("[INIT] Token Accounting tab components configured")

            with gr.Tab("π Stats"):
                logger.info("[INIT] Stats tab created")
                gr.Markdown("### Cache Statistics")
                stats_btn = gr.Button("Get Stats", variant="primary")
                stats_output = gr.Code(label="Statistics (JSON)", language="json")
                stats_btn.click(get_cache_stats, None, [stats_output], api_name="/predict_7")
                logger.info("[INIT] Stats tab components configured")

            with gr.Tab("π₯ Health"):
                logger.info("[INIT] Health tab created")
                gr.Markdown("### Backend Health Status")
                health_btn = gr.Button("Check Health", variant="primary")
                health_output = gr.Code(label="Health Status (JSON)", language="json")
                health_btn.click(get_backend_health, None, [health_output], api_name="/predict_8")
                logger.info("[INIT] Health tab components configured")

            logger.info("[INIT] All tabs created successfully")

        logger.info("[INIT] Launching Gradio demo...")
        demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
        # NOTE(review): presumably launch() blocks while serving, so this line
        # would only be reached once the server has stopped - confirm.
        logger.info("[INIT] Gradio demo launched successfully")
    except Exception as e:
        logger.error(f"[INIT] Failed to create Gradio interface: {e}")
        logger.error(f"[INIT] Error type: {type(e).__name__}")
        logger.error(f"[INIT] Error details: {str(e)}")
        logger.error(f"[INIT] Traceback: {traceback.format_exc()}")
        raise