Spaces:

turtle170
/

ZeroEngine

Running

App Files Community

turtle170 committed on Feb 1

Commit

91ebf27

verified ·

1 Parent(s): a2ffed4

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -36

app.py CHANGED Viewed

@@ -1234,58 +1234,44 @@ class ZeroEngine:
                         logger.warning(f"[BOOT] Cleanup warning: {e}")
                 # Calculate optimal parameters with token purchases
-                # FIX: Use container RAM limits, not host system memory
                 import psutil
                 ram = psutil.virtual_memory()
-                total_ram_gb = ram.total / (1024**3)
-                available_ram_gb = ram.available / (1024**3)
-                # CRITICAL FIX: Force realistic container limits for Hugging Face Spaces
-                # The host shows 123.8GB but container only has 16GB total
-                if total_ram_gb > 50.0:  # Host system memory detected
-                    logger.warning(f"[RAM] Host system memory detected ({total_ram_gb:.1f}GB), forcing container limits")
-                    total_ram_gb = 16.0  # Container limit
-                    available_ram_gb = min(available_ram_gb, 11.0)  # Conservative available RAM
-                    logger.info(f"[RAM] FORCED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
-                else:
-                    logger.info(f"[RAM] Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
-                # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
-                # Base calculation: use more RAM for batching on CPU
-                base_batch = int(512 * available_ram_gb / 8)  # More aggressive base
-                optimal_batch = base_batch
                 # Apply user's batch size from token purchases
                 if session_id:
                     user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
-                    optimal_batch = user_batch_size
-                    logger.info(f"[TOKEN] User batch size: {user_batch_size}")
-                # CPU can handle larger batches with quantized models
-                # REDUCED BATCH SIZE FOR LOW RAM ENVIRONMENTS
-                optimal_batch = max(128, min(512, optimal_batch))  # 128-512 range for low RAM
-                # Context size - REDUCED FOR LOW RAM
-                optimal_ctx = min(1024, quant_config["ctx_size"])  # Max 1024 for low RAM
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
-                    optimal_ctx = min(512, optimal_ctx)  # Gemma works better with much lower ctx
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
-                # FURTHER REDUCE CONTEXT FOR LOW RAM SYSTEMS
-                if available_ram_gb < 4.0:  # Less than 4GB available
-                    optimal_ctx = min(512, optimal_ctx)
-                    optimal_batch = min(256, optimal_batch)
-                    logger.info(f"[RAM-LOW] Low RAM detected: ctx={optimal_ctx}, batch={optimal_batch}")
-                # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
-                optimal_threads = OPTIMAL_THREADS  # Exactly 2 threads for 2 vCPU
                 # Apply CPU optimizations before model loading
-                if LOW_LATENCY_MODE:
-                    optimize_cpu_performance()
-                    boost_cpu_frequency()
                 logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")

                         logger.warning(f"[BOOT] Cleanup warning: {e}")
                 # Calculate optimal parameters with token purchases
+                # HARD-CODE: Force 16GB total RAM for Hugging Face Spaces reliability
                 import psutil
                 ram = psutil.virtual_memory()
+                total_ram_gb = 16.0  # HARD-CODED: 16GB total for container
+                available_ram_gb = 6.0  # HARD-CODED: 6GB available for model
+                logger.info(f"[RAM] HARD-CODED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
+                logger.info(f"[RAM] (Ignoring host system {ram.total/(1024**3):.1f}GB)")
+                # Define missing variables
+                session_id = session_id if session_id else None
+                token_manager = TokenManager()  # Assuming TokenManager is defined elsewhere
+                user_batch_size = None
+                optimal_batch = None
+                optimal_ctx = None
+                optimal_threads = None
                 # Apply user's batch size from token purchases
                 if session_id:
                     user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
+                    # IGNORE user batch size - use conservative settings for reliability
+                    logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
+                # ULTRA-CONSERVATIVE SETTINGS FOR 6GB AVAILABLE RAM
+                optimal_batch = 128  # FIXED: Very conservative batch size
+                optimal_ctx = 256    # FIXED: Very conservative context size
+                optimal_threads = 2   # FIXED: 2 threads for 2 vCPU
+                logger.info(f"[RAM] ULTRA-CONSERVATIVE: batch={optimal_batch}, ctx={optimal_ctx}")
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
+                    optimal_ctx = 128  # EXTREMELY conservative for Gemma
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
                 # Apply CPU optimizations before model loading
+                optimize_cpu_performance()
+                boost_cpu_frequency()
                 logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")