Spaces:

turtle170
/

ZeroEngine-Backend

Running

App Files Files Community

turtle170 committed on Feb 1

Commit

47f6bdb

verified ·

1 Parent(s): 41c6199

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -243

app.py CHANGED Viewed

@@ -5,7 +5,13 @@ import hashlib
 import logging
 import datetime
 import pytz
 from typing import Dict, Optional
 # Initialize logging for backend
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - BACKEND - %(message)s', force=True)
@@ -16,7 +22,7 @@ import warnings
 warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*asyncio.*")
 # ============================================================================
-# ZEROENGINE-BACKEND: Background Processing Service
 # ============================================================================
 # This space handles:
 # - Tokenization pre-processing
@@ -25,24 +31,129 @@ warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*asyncio.*"
 # - Response caching
 # ============================================================================
-# In-memory caches (will reset on space restart)
 prompt_cache = {}
 response_cache = {}
 token_ledger = {}
 backend_start_time = time.time()
 def tokenize_text(text: str) -> str:
-    """Enhanced tokenization with extremely detailed logging"""
-    logger.info(f"[TOKENIZE] ===== TOKENIZE REQUEST START =====")
-    logger.info(f"[TOKENIZE] Input text length: {len(text)} characters")
-    logger.info(f"[TOKENIZE] Input text preview: '{text[:100]}{'...' if len(text) > 100 else ''}'")
-    logger.info(f"[TOKENIZE] Input text hash: {hashlib.md5(text.encode()).hexdigest()[:16]}")
     start_time = time.time()
     try:
-        # Simple character-based estimation (can be enhanced with proper tokenizer)
-        estimated_tokens = len(text.split()) + len(text) // 4
         processing_time = time.time() - start_time
         result = {
@@ -50,40 +161,31 @@ def tokenize_text(text: str) -> str:
             "estimated_tokens": estimated_tokens,
             "processing_time_ms": round(processing_time * 1000, 2),
             "text_length": len(text),
-            "word_count": len(text.split()),
-            "char_count": len(text),
             "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
-            "request_id": hashlib.md5(f"{text}{time.time()}".encode()).hexdigest()[:8]
         }
-        logger.info(f"[TOKENIZE] ✅ Tokenization completed successfully")
-        logger.info(f"[TOKENIZE] Estimated tokens: {estimated_tokens}")
-        logger.info(f"[TOKENIZE] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-        logger.info(f"[TOKENIZE] Word count: {len(text.split())}")
-        logger.info(f"[TOKENIZE] Character count: {len(text)}")
-        logger.info(f"[TOKENIZE] Request ID: {result['request_id']}")
-        logger.info(f"[TOKENIZE] ===== TOKENIZE REQUEST END =====")
-        # Create cache key
-        text_hash = hashlib.md5(text.encode()).hexdigest()[:16]
         prompt_cache[text_hash] = {
             "text": text[:100] + "..." if len(text) > 100 else text,
             "tokens": estimated_tokens,
             "cached_at": time.time()
         }
-        logger.info(f"[TOKENIZE] Cached tokenization result for key: {text_hash}")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
-        logger.error(f"[TOKENIZE] ❌ Tokenization failed after {processing_time:.4f}s: {e}")
-        logger.error(f"[TOKENIZE] Error type: {type(e).__name__}")
-        logger.error(f"[TOKENIZE] Error details: {str(e)}")
-        logger.error(f"[TOKENIZE] Input text that caused error: '{text[:200]}{'...' if len(text) > 200 else ''}'")
-        logger.error(f"[TOKENIZE] ===== TOKENIZE REQUEST END (ERROR) =====")
         return json.dumps({
             "success": False,
@@ -94,61 +196,44 @@ def tokenize_text(text: str) -> str:
         }, indent=2)
 def cache_prompt(key: str, value: str) -> str:
-    """Store prompt in cache with extremely detailed logging"""
-    logger.info(f"[CACHE-PROMPT] ===== CACHE PROMPT REQUEST START =====")
-    logger.info(f"[CACHE-PROMPT] Requested key: '{key}'")
-    logger.info(f"[CACHE-PROMPT] Key length: {len(key)} characters")
-    logger.info(f"[CACHE-PROMPT] Key hash: {hashlib.md5(key.encode()).hexdigest()[:16]}")
-    logger.info(f"[CACHE-PROMPT] Value length: {len(value)} characters")
-    logger.info(f"[CACHE-PROMPT] Value preview: '{value[:100]}{'...' if len(value) > 100 else ''}'")
-    logger.info(f"[CACHE-PROMPT] Current cache size: {len(prompt_cache)} entries")
-    logger.info(f"[CACHE-PROMPT] Current cache memory usage: {sum(len(v) for v in prompt_cache.values())} characters")
-    logger.info(f"[CACHE-PROMPT] Available keys: {list(prompt_cache.keys())[:10]}{'...' if len(prompt_cache) > 10 else ''}")
     start_time = time.time()
     try:
-        prompt_cache[key] = {
             "value": value,
-            "timestamp": time.time()
         }
-        processing_time = time.time() - start_time
-        # Limit cache size to 100 entries
-        if len(prompt_cache) > 100:
-            oldest_key = min(prompt_cache.keys(), key=lambda k: prompt_cache[k]["timestamp"])
-            del prompt_cache[oldest_key]
-            logger.info(f"[CACHE-PROMPT] Removed oldest entry: {oldest_key}")
         result = {
             "success": True,
-            "key": key,
             "value_length": len(value),
             "cache_size": len(prompt_cache),
             "processing_time_ms": round(processing_time * 1000, 2),
             "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
-            "request_id": hashlib.md5(f"{key}{time.time()}".encode()).hexdigest()[:8]
         }
-        logger.info(f"[CACHE-PROMPT] ✅ Prompt cached successfully")
-        logger.info(f"[CACHE-PROMPT] Cached key: '{key}'")
-        logger.info(f"[CACHE-PROMPT] Value length: {len(value)} characters")
-        logger.info(f"[CACHE-PROMPT] Value preview: '{value[:100]}{'...' if len(value) > 100 else ''}'")
-        logger.info(f"[CACHE-PROMPT] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-        logger.info(f"[CACHE-PROMPT] Request ID: {result['request_id']}")
-        logger.info(f"[CACHE-PROMPT] ===== CACHE PROMPT REQUEST END =====")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
-        logger.error(f"[CACHE-PROMPT] ❌ Cache prompt failed after {processing_time:.4f}s: {e}")
-        logger.error(f"[CACHE-PROMPT] Error type: {type(e).__name__}")
-        logger.error(f"[CACHE-PROMPT] Error details: {str(e)}")
-        logger.error(f"[CACHE-PROMPT] Key that caused error: '{key}'")
-        logger.error(f"[CACHE-PROMPT] Value that caused error: '{value[:200]}{'...' if len(value) > 200 else ''}'")
-        logger.error(f"[CACHE-PROMPT] ===== CACHE PROMPT REQUEST END (ERROR) =====")
         return json.dumps({
             "success": False,
@@ -159,69 +244,50 @@ def cache_prompt(key: str, value: str) -> str:
         }, indent=2)
 def get_cached_prompt(key: str) -> str:
-    """Retrieve a cached prompt with extremely detailed logging"""
-    logger.info(f"[GET-PROMPT] ===== GET CACHED PROMPT REQUEST START =====")
-    logger.info(f"[GET-PROMPT] Requested key: '{key}'")
-    logger.info(f"[GET-PROMPT] Key length: {len(key)} characters")
-    logger.info(f"[GET-PROMPT] Key hash: {hashlib.md5(key.encode()).hexdigest()[:16]}")
-    logger.info(f"[GET-PROMPT] Current cache size: {len(prompt_cache)} entries")
-    logger.info(f"[GET-PROMPT] Current cache memory usage: {sum(len(v) for v in prompt_cache.values())} characters")
-    logger.info(f"[GET-PROMPT] Available keys: {list(prompt_cache.keys())[:10]}{'...' if len(prompt_cache) > 10 else ''}")
     start_time = time.time()
     try:
-        cached_value = prompt_cache.get(key)
         processing_time = time.time() - start_time
         if cached_value is not None:
             result = {
                 "success": True,
                 "found": True,
-                "key": key,
-                "value": cached_value,
-                "value_length": len(cached_value),
                 "cache_size": len(prompt_cache),
                 "processing_time_ms": round(processing_time * 1000, 2),
                 "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
-                "request_id": hashlib.md5(f"{key}{time.time()}".encode()).hexdigest()[:8],
                 "cache_hit": True
             }
-            logger.info(f"[GET-PROMPT] ✅ Cache HIT - prompt found")
-            logger.info(f"[GET-PROMPT] Found key: '{key}'")
-            logger.info(f"[GET-PROMPT] Value length: {len(cached_value)} characters")
-            logger.info(f"[GET-PROMPT] Value preview: '{cached_value[:100]}{'...' if len(cached_value) > 100 else ''}'")
-            logger.info(f"[GET-PROMPT] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-            logger.info(f"[GET-PROMPT] Request ID: {result['request_id']}")
         else:
             result = {
                 "success": True,
                 "found": False,
-                "key": key,
                 "value": None,
                 "cache_size": len(prompt_cache),
                 "processing_time_ms": round(processing_time * 1000, 2),
                 "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
-                "request_id": hashlib.md5(f"{key}{time.time()}".encode()).hexdigest()[:8],
                 "cache_hit": False
             }
-            logger.warning(f"[GET-PROMPT] ⚠️ Cache MISS - prompt not found")
-            logger.warning(f"[GET-PROMPT] Missing key: '{key}'")
-            logger.warning(f"[GET-PROMPT] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-            logger.warning(f"[GET-PROMPT] Request ID: {result['request_id']}")
-        logger.info(f"[GET-PROMPT] ===== GET CACHED PROMPT REQUEST END =====")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
-        logger.error(f"[GET-PROMPT] ❌ Get cached prompt failed after {processing_time:.4f}s: {e}")
-        logger.error(f"[GET-PROMPT] Error type: {type(e).__name__}")
-        logger.error(f"[GET-PROMPT] Error details: {str(e)}")
-        logger.error(f"[GET-PROMPT] Key that caused error: '{key}'")
-        logger.error(f"[GET-PROMPT] ===== GET CACHED PROMPT REQUEST END (ERROR) =====")
         return json.dumps({
             "success": False,
@@ -232,31 +298,24 @@ def get_cached_prompt(key: str) -> str:
         }, indent=2)
 def cache_response(prompt_hash: str, response: str) -> str:
-    """Cache a complete response with extremely detailed logging"""
-    logger.info(f"[CACHE-RESPONSE] ===== CACHE RESPONSE REQUEST START =====")
-    logger.info(f"[CACHE-RESPONSE] Prompt hash: '{prompt_hash}'")
-    logger.info(f"[CACHE-RESPONSE] Hash length: {len(prompt_hash)} characters")
-    logger.info(f"[CACHE-RESPONSE] Response length: {len(response)} characters")
-    logger.info(f"[CACHE-RESPONSE] Response preview: '{response[:150]}{'...' if len(response) > 150 else ''}'")
-    logger.info(f"[CACHE-RESPONSE] Current response cache size: {len(response_cache)} entries")
-    logger.info(f"[CACHE-RESPONSE] Current cache memory usage: {sum(len(v['response']) for v in response_cache.values())} characters")
-    logger.info(f"[CACHE-RESPONSE] Available hashes: {list(response_cache.keys())[:10]}{'...' if len(response_cache) > 10 else ''}")
     start_time = time.time()
     try:
         response_cache[prompt_hash] = {
             "response": response,
-            "timestamp": time.time()
         }
-        processing_time = time.time() - start_time
-        # Limit cache size to 50 entries
-        if len(response_cache) > 50:
-            oldest_key = min(response_cache.keys(), key=lambda k: response_cache[k]["timestamp"])
-            del response_cache[oldest_key]
-            logger.info(f"[CACHE-RESPONSE] Removed oldest entry: {oldest_key}")
         result = {
             "success": True,
@@ -265,29 +324,15 @@ def cache_response(prompt_hash: str, response: str) -> str:
             "cache_size": len(response_cache),
             "processing_time_ms": round(processing_time * 1000, 2),
             "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
-            "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8],
-            "cache_memory_usage": sum(len(v['response']) for v in response_cache.values())
         }
-        logger.info(f"[CACHE-RESPONSE] ✅ Response cached successfully")
-        logger.info(f"[CACHE-RESPONSE] Cached hash: '{prompt_hash}'")
-        logger.info(f"[CACHE-RESPONSE] Response length: {len(response)} characters")
-        logger.info(f"[CACHE-RESPONSE] New cache size: {len(response_cache)} entries")
-        logger.info(f"[CACHE-RESPONSE] New cache memory usage: {result['cache_memory_usage']} characters")
-        logger.info(f"[CACHE-RESPONSE] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-        logger.info(f"[CACHE-RESPONSE] Request ID: {result['request_id']}")
-        logger.info(f"[CACHE-RESPONSE] ===== CACHE RESPONSE REQUEST END =====")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
-        logger.error(f"[CACHE-RESPONSE] ❌ Cache response failed after {processing_time:.4f}s: {e}")
-        logger.error(f"[CACHE-RESPONSE] Error type: {type(e).__name__}")
-        logger.error(f"[CACHE-RESPONSE] Error details: {str(e)}")
-        logger.error(f"[CACHE-RESPONSE] Hash that caused error: '{prompt_hash}'")
-        logger.error(f"[CACHE-RESPONSE] Response preview that caused error: '{response[:300]}{'...' if len(response) > 300 else ''}'")
-        logger.error(f"[CACHE-RESPONSE] ===== CACHE RESPONSE REQUEST END (ERROR) =====")
         return json.dumps({
             "success": False,
@@ -298,14 +343,7 @@ def cache_response(prompt_hash: str, response: str) -> str:
         }, indent=2)
 def get_cached_response(prompt_hash: str) -> str:
-    """Retrieve cached response with extremely detailed logging"""
-    logger.info(f"[GET-RESPONSE] ===== GET CACHED RESPONSE REQUEST START =====")
-    logger.info(f"[GET-RESPONSE] Requested hash: '{prompt_hash}'")
-    logger.info(f"[GET-RESPONSE] Hash length: {len(prompt_hash)} characters")
-    logger.info(f"[GET-RESPONSE] Current response cache size: {len(response_cache)} entries")
-    logger.info(f"[GET-RESPONSE] Current cache memory usage: {sum(len(v['response']) for v in response_cache.values())} characters")
-    logger.info(f"[GET-RESPONSE] Available hashes: {list(response_cache.keys())[:10]}{'...' if len(response_cache) > 10 else ''}")
     start_time = time.time()
     try:
@@ -314,7 +352,7 @@ def get_cached_response(prompt_hash: str) -> str:
         if cached_data is not None:
             response = cached_data["response"]
-            age_seconds = round(time.time() - cached_data["timestamp"], 2)
             result = {
                 "success": True,
@@ -328,17 +366,10 @@ def get_cached_response(prompt_hash: str) -> str:
                 "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
                 "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8],
                 "cache_hit": True,
-                "cached_at": datetime.datetime.fromtimestamp(cached_data["timestamp"], pytz.UTC).isoformat()
             }
-            logger.info(f"[GET-RESPONSE] ✅ Cache HIT - response found")
-            logger.info(f"[GET-RESPONSE] Found hash: '{prompt_hash}'")
-            logger.info(f"[GET-RESPONSE] Response length: {len(response)} characters")
-            logger.info(f"[GET-RESPONSE] Response preview: '{response[:150]}{'...' if len(response) > 150 else ''}'")
-            logger.info(f"[GET-RESPONSE] Response age: {age_seconds} seconds")
-            logger.info(f"[GET-RESPONSE] Cached at: {result['cached_at']}")
-            logger.info(f"[GET-RESPONSE] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-            logger.info(f"[GET-RESPONSE] Request ID: {result['request_id']}")
         else:
             result = {
                 "success": True,
@@ -352,21 +383,13 @@ def get_cached_response(prompt_hash: str) -> str:
                 "cache_hit": False
             }
-            logger.warning(f"[GET-RESPONSE] ⚠️ Cache MISS - response not found")
-            logger.warning(f"[GET-RESPONSE] Missing hash: '{prompt_hash}'")
-            logger.warning(f"[GET-RESPONSE] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-            logger.warning(f"[GET-RESPONSE] Request ID: {result['request_id']}")
-        logger.info(f"[GET-RESPONSE] ===== GET CACHED RESPONSE REQUEST END =====")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
-        logger.error(f"[GET-RESPONSE] ❌ Get cached response failed after {processing_time:.4f}s: {e}")
-        logger.error(f"[GET-RESPONSE] Error type: {type(e).__name__}")
-        logger.error(f"[GET-RESPONSE] Error details: {str(e)}")
-        logger.error(f"[GET-RESPONSE] Hash that caused error: '{prompt_hash}'")
-        logger.error(f"[GET-RESPONSE] ===== GET CACHED RESPONSE REQUEST END (ERROR) =====")
         return json.dumps({
             "success": False,
@@ -461,90 +484,6 @@ def calculate_token_cost(username: str, duration_ms: float) -> str:
         return json.dumps({
             "success": False,
             "error": str(e),
-            "error_type": type(e).__name__,
-            "processing_time_ms": round(processing_time * 1000, 2),
-            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
-        }, indent=2)
-def get_cache_stats() -> str:
-    """Get cache statistics with extremely detailed logging"""
-    logger.info(f"[CACHE-STATS] ===== CACHE STATS REQUEST START =====")
-    logger.info(f"[CACHE-STATS] Current prompt cache size: {len(prompt_cache)} entries")
-    logger.info(f"[CACHE-STATS] Current response cache size: {len(response_cache)} entries")
-    logger.info(f"[CACHE-STATS] Current users tracked: {len(token_ledger)}")
-    logger.info(f"[CACHE-STATS] Prompt cache memory usage: {sum(len(str(v)) for v in prompt_cache.values())} characters")
-    logger.info(f"[CACHE-STATS] Response cache memory usage: {sum(len(v['response']) for v in response_cache.values())} characters")
-    logger.info(f"[CACHE-STATS] Total requests processed: {sum(u['requests'] for u in token_ledger.values())}")
-    start_time = time.time()
-    try:
-        # Calculate detailed statistics
-        total_prompt_memory = sum(len(str(v)) for v in prompt_cache.values())
-        total_response_memory = sum(len(v['response']) for v in response_cache.values())
-        total_requests = sum(u['requests'] for u in token_ledger.values())
-        total_tokens = sum(u['total_cost'] for u in token_ledger.values())
-        total_duration = sum(u['total_duration_ms'] for u in token_ledger.values())
-        # User statistics
-        active_users = len([u for u in token_ledger.values() if time.time() - u.get('last_seen', u.get('first_seen', 0)) < 3600])
-        avg_requests_per_user = total_requests / len(token_ledger) if len(token_ledger) > 0 else 0
-        avg_tokens_per_user = total_tokens / len(token_ledger) if len(token_ledger) > 0 else 0
-        processing_time = time.time() - start_time
-        result = {
-            "success": True,
-            "prompt_cache_size": len(prompt_cache),
-            "response_cache_size": len(response_cache),
-            "users_tracked": len(token_ledger),
-            "active_users_last_hour": active_users,
-            "total_requests": total_requests,
-            "total_tokens_spent": round(total_tokens, 4),
-            "total_duration_ms": round(total_duration, 2),
-            "avg_requests_per_user": round(avg_requests_per_user, 2),
-            "avg_tokens_per_user": round(avg_tokens_per_user, 4),
-            "prompt_cache_memory_bytes": total_prompt_memory,
-            "response_cache_memory_bytes": total_response_memory,
-            "total_cache_memory_bytes": total_prompt_memory + total_response_memory,
-            "processing_time_ms": round(processing_time * 1000, 2),
-            "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
-            "request_id": hashlib.md5(f"stats{time.time()}".encode()).hexdigest()[:8]
-        }
-        logger.info(f"[CACHE-STATS] ✅ Cache statistics retrieved successfully")
-        logger.info(f"[CACHE-STATS] Prompt cache: {len(prompt_cache)} entries ({total_prompt_memory} chars)")
-        logger.info(f"[CACHE-STATS] Response cache: {len(response_cache)} entries ({total_response_memory} chars)")
-        logger.info(f"[CACHE-STATS] Users tracked: {len(token_ledger)} ({active_users} active last hour)")
-        logger.info(f"[CACHE-STATS] Total requests: {total_requests}")
-        logger.info(f"[CACHE-STATS] Total tokens spent: {total_tokens}")
-        logger.info(f"[CACHE-STATS] Total duration: {total_duration}ms")
-        logger.info(f"[CACHE-STATS] Avg requests per user: {avg_requests_per_user}")
-        logger.info(f"[CACHE-STATS] Avg tokens per user: {avg_tokens_per_user}")
-        logger.info(f"[CACHE-STATS] Processing time: {processing_time:.4f}s ({processing_time*1000:.2f}ms)")
-        logger.info(f"[CACHE-STATS] Request ID: {result['request_id']}")
-        logger.info(f"[CACHE-STATS] ===== CACHE STATS REQUEST END =====")
-        return json.dumps(result, indent=2)
-    except Exception as e:
-        processing_time = time.time() - start_time
-        logger.error(f"[CACHE-STATS] ❌ Cache statistics retrieval failed after {processing_time:.4f}s: {e}")
-        logger.error(f"[CACHE-STATS] Error type: {type(e).__name__}")
-        logger.error(f"[CACHE-STATS] Error details: {str(e)}")
-        logger.error(f"[CACHE-STATS] ===== CACHE STATS REQUEST END (ERROR) =====")
-        return json.dumps({
-            "success": False,
-            "error": str(e),
-            "error_type": type(e).__name__,
-            "processing_time_ms": round(processing_time * 1000, 2),
-            "timestamp": datetime.datetime.now(pytz.UTC).isoformat()
-        }, indent=2)
-def get_backend_health() -> str:
-    """Get backend health status with extremely detailed logging"""
-    logger.info(f"[BACKEND-HEALTH] ===== BACKEND HEALTH REQUEST START =====")
     logger.info(f"[BACKEND-HEALTH] Checking backend health status...")
     logger.info(f"[BACKEND-HEALTH] Current prompt cache size: {len(prompt_cache)} entries")
     logger.info(f"[BACKEND-HEALTH] Current response cache size: {len(response_cache)} entries")

 import logging
 import datetime
 import pytz
+import psutil
+import threading
+import gc
 from typing import Dict, Optional
+from functools import lru_cache
+import concurrent.futures
+import os
 # Initialize logging for backend
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - BACKEND - %(message)s', force=True)
 warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*asyncio.*")
 # ============================================================================
+# ZEROENGINE-BACKEND: Background Processing Service - SPEED OPTIMIZED
 # ============================================================================
 # This space handles:
 # - Tokenization pre-processing
 # - Response caching
 # ============================================================================
+# SPEED OPTIMIZATIONS: Larger caches with 16GB RAM available
+# Upper bounds on the number of entries kept in each in-memory store.
+MAX_PROMPT_CACHE_SIZE = 50000  # Max prompt_cache entries (the pre-change limit was 100)
+MAX_RESPONSE_CACHE_SIZE = 10000  # Max response_cache entries (the pre-change limit was 50)
+MAX_TOKEN_LEDGER_SIZE = 10000   # Intended cap for token_ledger - NOTE(review): confirm where it is enforced
+# HARD-CODED: Hugging Face Space RAM limits (same as main app)
+# These are fixed figures, not values measured from the host at runtime.
+TOTAL_RAM_GB = 18.0  # HARD-CODED: 18GB total for container
+USABLE_RAM_GB = 16.0  # HARD-CODED: 16GB usable for backend (2GB reserved)
+# In-memory caches (plain dicts; contents are lost when the process restarts)
 prompt_cache = {}
 response_cache = {}
 token_ledger = {}
 backend_start_time = time.time()
+# Performance tracking (process-wide counters, mutated in place by request handlers)
+# NOTE(review): tokenize_text updates total_requests / cache_hits / cache_misses;
+# confirm that avg_response_time and memory_usage_mb are maintained elsewhere.
+performance_stats = {
+    "total_requests": 0,
+    "cache_hits": 0,
+    "cache_misses": 0,
+    "avg_response_time": 0.0,
+    "memory_usage_mb": 0.0
+}
+# Stop flag for the background cleanup thread: background_cleanup() keeps
+# looping while this is True and re-checks it once per pass.
+cleanup_thread_running = True
+def background_cleanup():
+    """Background thread for cache management and optimization"""
+    while cleanup_thread_running:
+        try:
+            # Clean up old entries every 5 minutes
+            time.sleep(300)
+            current_time = time.time()
+            # Clean old prompt cache entries (older than 1 hour)
+            old_prompt_keys = [
+                key for key, data in prompt_cache.items()
+                if current_time - data.get("cached_at", 0) > 3600
+            ]
+            for key in old_prompt_keys[:100]:  # Limit cleanup batch size
+                del prompt_cache[key]
+            # Clean old response cache entries (older than 2 hours)
+            old_response_keys = [
+                key for key, data in response_cache.items()
+                if current_time - data.get("cached_at", 0) > 7200
+            ]
+            for key in old_response_keys[:50]:  # Limit cleanup batch size
+                del response_cache[key]
+            # Force garbage collection
+            gc.collect()
+            logger.info(f"[CLEANUP] Removed {len(old_prompt_keys)} old prompts, {len(old_response_keys)} old responses")
+        except Exception as e:
+            logger.error(f"[CLEANUP] Background cleanup error: {e}")
+# Start background cleanup thread
+# This runs at import time. daemon=True means the thread does not keep the
+# interpreter alive at shutdown.
+cleanup_thread = threading.Thread(target=background_cleanup, daemon=True)
+cleanup_thread.start()
+logger.info("[INIT] Background cleanup thread started")
+# Log hard-coded RAM configuration
+# (the values come from the TOTAL_RAM_GB / USABLE_RAM_GB constants, not from a measurement)
+logger.info(f"[RAM] HARD-CODED: Total: {TOTAL_RAM_GB:.1f}GB, Usable: {USABLE_RAM_GB:.1f}GB (Hugging Face Space)")
+logger.info(f"[RAM] (Ignoring host system memory - using container limits)")
+@lru_cache(maxsize=10000)
+def fast_hash(text: str) -> str:
+    """Return the hexadecimal MD5 digest of ``text``.
+
+    ``text`` is encoded with ``str.encode()``'s default (UTF-8) before
+    hashing. Results are memoized for up to 10000 distinct inputs; note
+    that the memo keeps every cached input string alive as a key. In the
+    visible callers the digest is only used to derive cache keys.
+    """
+    return hashlib.md5(text.encode()).hexdigest()
+def get_memory_usage() -> float:
+    """Get current memory usage in MB"""
+    try:
+        return psutil.Process().memory_info().rss / 1024 / 1024
+    except:
+        return 0.0
 def tokenize_text(text: str) -> str:
+    """SPEED-OPTIMIZED tokenization with fast caching"""
     start_time = time.time()
+    # Update performance stats
+    performance_stats["total_requests"] += 1
     try:
+        # Check cache first for instant response
+        text_hash = fast_hash(text)[:16]
+        cached_result = prompt_cache.get(text_hash)
+        if cached_result:
+            performance_stats["cache_hits"] += 1
+            processing_time = time.time() - start_time
+            result = {
+                "success": True,
+                "estimated_tokens": cached_result["tokens"],
+                "processing_time_ms": round(processing_time * 1000, 2),
+                "text_length": len(text),
+                "word_count": len(text.split()),
+                "char_count": len(text),
+                "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
+                "request_id": hashlib.md5(f"{text}{time.time()}".encode()).hexdigest()[:8],
+                "cache_hit": True
+            }
+            logger.info(f"[TOKENIZE] ⚡ CACHE HIT: {cached_result['tokens']} tokens in {processing_time*1000:.1f}ms")
+            return json.dumps(result, indent=2)
+        # Cache miss - calculate tokens
+        performance_stats["cache_misses"] += 1
+        # OPTIMIZED: Faster token estimation algorithm
+        words = text.split()
+        word_count = len(words)
+        char_count = len(text)
+        # More accurate token estimation based on patterns
+        estimated_tokens = word_count + (char_count // 4) + (len([w for w in words if len(w) > 8]) * 2)
         processing_time = time.time() - start_time
         result = {
             "estimated_tokens": estimated_tokens,
             "processing_time_ms": round(processing_time * 1000, 2),
             "text_length": len(text),
+            "word_count": word_count,
+            "char_count": char_count,
             "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
+            "request_id": hashlib.md5(f"{text}{time.time()}".encode()).hexdigest()[:8],
+            "cache_hit": False
         }
+        # Cache the result for future requests
         prompt_cache[text_hash] = {
             "text": text[:100] + "..." if len(text) > 100 else text,
             "tokens": estimated_tokens,
             "cached_at": time.time()
         }
+        # Limit cache size with LRU eviction
+        if len(prompt_cache) > MAX_PROMPT_CACHE_SIZE:
+            oldest_key = min(prompt_cache.keys(), key=lambda k: prompt_cache[k]["cached_at"])
+            del prompt_cache[oldest_key]
+        logger.info(f"[TOKENIZE] ✅ CALCULATED: {estimated_tokens} tokens in {processing_time*1000:.1f}ms")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
+        logger.error(f"[TOKENIZE] ❌ Failed after {processing_time*1000:.1f}ms: {e}")
         return json.dumps({
             "success": False,
         }, indent=2)
 def cache_prompt(key: str, value: str) -> str:
+    """SPEED-OPTIMIZED prompt caching with larger limits"""
     start_time = time.time()
     try:
+        # Use fast hash for key
+        cache_key = fast_hash(key) if len(key) > 32 else key
+        prompt_cache[cache_key] = {
             "value": value,
+            "cached_at": time.time()
         }
+        # Limit cache size with optimized eviction
+        if len(prompt_cache) > MAX_PROMPT_CACHE_SIZE:
+            # Batch remove oldest 1000 entries for efficiency
+            oldest_keys = sorted(prompt_cache.keys(),
+                               key=lambda k: prompt_cache[k]["cached_at"])[:1000]
+            for old_key in oldest_keys:
+                del prompt_cache[old_key]
+        processing_time = time.time() - start_time
         result = {
             "success": True,
+            "key": cache_key,
             "value_length": len(value),
             "cache_size": len(prompt_cache),
             "processing_time_ms": round(processing_time * 1000, 2),
             "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
+            "request_id": hashlib.md5(f"{cache_key}{time.time()}".encode()).hexdigest()[:8]
         }
+        logger.info(f"[CACHE-PROMPT] ⚡ Stored: {len(value)} chars in {processing_time*1000:.1f}ms")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
+        logger.error(f"[CACHE-PROMPT] ❌ Failed after {processing_time*1000:.1f}ms: {e}")
         return json.dumps({
             "success": False,
         }, indent=2)
 def get_cached_prompt(key: str) -> str:
+    """Look up a prompt in the in-memory prompt cache and report the outcome as JSON.
+
+    The lookup is performed against the module-level ``prompt_cache``.  Keys
+    longer than 32 characters are replaced by ``fast_hash(key)`` before the
+    lookup; shorter keys are used unchanged.
+
+    Args:
+        key: Caller-supplied cache key.
+
+    Returns:
+        A JSON string (``indent=2``).  On a hit it carries ``success`` (true),
+        ``found`` (true), ``key`` (the key actually looked up), ``value``,
+        ``value_length``, ``cache_size``, ``processing_time_ms``,
+        ``timestamp`` (UTC, ISO 8601), ``request_id`` and ``cache_hit`` (true).
+        On a miss ``found`` and ``cache_hit`` are false and ``value`` is null;
+        ``value_length`` is omitted.  If an ``Exception`` is raised it is
+        logged and a payload with ``success`` set to false is returned instead
+        of propagating.
+    """
     start_time = time.time()
     try:
+        # Long keys (> 32 chars) are looked up under fast_hash(key); short keys verbatim.
+        # NOTE(review): presumably mirrors the key normalisation applied when
+        # the prompt was stored — confirm against cache_prompt.
+        cache_key = fast_hash(key) if len(key) > 32 else key
+        cached_value = prompt_cache.get(cache_key)
+        # Timing stops here, so it covers the lookup only, not building or
+        # serialising the result below.
         processing_time = time.time() - start_time
         if cached_value is not None:
             result = {
                 "success": True,
                 "found": True,
+                "key": cache_key,
+                "value": cached_value["value"],
+                "value_length": len(cached_value["value"]),
                 "cache_size": len(prompt_cache),
                 "processing_time_ms": round(processing_time * 1000, 2),
                 "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
+                "request_id": hashlib.md5(f"{cache_key}{time.time()}".encode()).hexdigest()[:8],
                 "cache_hit": True
             }
+            logger.info(f"[GET-PROMPT] ⚡ HIT: {len(cached_value['value'])} chars in {processing_time*1000:.1f}ms")
         else:
             result = {
                 "success": True,
                 "found": False,
+                "key": cache_key,
                 "value": None,
                 "cache_size": len(prompt_cache),
                 "processing_time_ms": round(processing_time * 1000, 2),
                 "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
+                "request_id": hashlib.md5(f"{cache_key}{time.time()}".encode()).hexdigest()[:8],
                 "cache_hit": False
             }
+            logger.info(f"[GET-PROMPT] ⚠️ MISS: {cache_key} in {processing_time*1000:.1f}ms")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
+        logger.error(f"[GET-PROMPT] ❌ Failed after {processing_time*1000:.1f}ms: {e}")
         return json.dumps({
             "success": False,
         }, indent=2)
 def cache_response(prompt_hash: str, response: str) -> str:
+    """Store a response in the in-memory response cache and report the outcome as JSON.
+
+    The entry is written to the module-level ``response_cache`` under
+    ``prompt_hash`` together with the current ``time.time()`` value as
+    ``cached_at``; an existing entry for the same hash is replaced.  If the
+    cache then holds more than ``MAX_RESPONSE_CACHE_SIZE`` entries, up to 500
+    entries with the smallest ``cached_at`` are deleted.
+
+    Args:
+        prompt_hash: Cache key, used as given (no hashing is applied here).
+        response: Response text to store.
+
+    Returns:
+        A JSON string (``indent=2``).  On success it carries ``success``
+        (true), ``cache_size`` (entries remaining after any eviction),
+        ``processing_time_ms``, ``timestamp`` (UTC, ISO 8601) and
+        ``request_id`` (first 8 hex characters of an MD5 over the hash plus
+        the current time).  If an ``Exception`` is raised it is logged and a
+        payload with ``success`` set to false is returned instead of
+        propagating.
+    """
     start_time = time.time()
     try:
         response_cache[prompt_hash] = {
             "response": response,
+            "cached_at": time.time()
         }
+        # Keep the cache bounded: evict only once the size limit is exceeded.
+        if len(response_cache) > MAX_RESPONSE_CACHE_SIZE:
+            # Sort all keys by insertion timestamp and drop the oldest (up to
+            # 500) in one batch, so the sort is not repeated on every store.
+            oldest_keys = sorted(response_cache.keys(),
+                               key=lambda k: response_cache[k]["cached_at"])[:500]
+            for old_key in oldest_keys:
+                del response_cache[old_key]
+        processing_time = time.time() - start_time
         result = {
             "success": True,
             "cache_size": len(response_cache),
             "processing_time_ms": round(processing_time * 1000, 2),
             "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
+            "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8]
         }
+        logger.info(f"[CACHE-RESPONSE] ⚡ Stored: {len(response)} chars in {processing_time*1000:.1f}ms")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
+        logger.error(f"[CACHE-RESPONSE] ❌ Failed after {processing_time*1000:.1f}ms: {e}")
         return json.dumps({
             "success": False,
         }, indent=2)
 def get_cached_response(prompt_hash: str) -> str:
+    """SPEED-OPTIMIZED response retrieval"""
     start_time = time.time()
     try:
         if cached_data is not None:
             response = cached_data["response"]
+            age_seconds = round(time.time() - cached_data["cached_at"], 2)
             result = {
                 "success": True,
                 "timestamp": datetime.datetime.now(pytz.UTC).isoformat(),
                 "request_id": hashlib.md5(f"{prompt_hash}{time.time()}".encode()).hexdigest()[:8],
                 "cache_hit": True,
+                "cached_at": datetime.datetime.fromtimestamp(cached_data["cached_at"], pytz.UTC).isoformat()
             }
+            logger.info(f"[GET-RESPONSE] ⚡ HIT: {len(response)} chars in {processing_time*1000:.1f}ms")
         else:
             result = {
                 "success": True,
                 "cache_hit": False
             }
+            logger.info(f"[GET-RESPONSE] ⚠️ MISS: {prompt_hash} in {processing_time*1000:.1f}ms")
         return json.dumps(result, indent=2)
     except Exception as e:
         processing_time = time.time() - start_time
+        logger.error(f"[GET-RESPONSE] ❌ Failed after {processing_time*1000:.1f}ms: {e}")
         return json.dumps({
             "success": False,
         return json.dumps({
             "success": False,
             "error": str(e),
     logger.info(f"[BACKEND-HEALTH] Checking backend health status...")
     logger.info(f"[BACKEND-HEALTH] Current prompt cache size: {len(prompt_cache)} entries")
     logger.info(f"[BACKEND-HEALTH] Current response cache size: {len(response_cache)} entries")