Spaces:

turtle170
/

ZeroEngine

Running

App Files Community

turtle170 committed 17 days ago

Commit

0c27b45

verified ·

1 Parent(s): 8806e23

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -6

app.py CHANGED Viewed

@@ -1234,8 +1234,12 @@ class ZeroEngine:
                         logger.warning(f"[BOOT] Cleanup warning: {e}")
                 # Calculate optimal parameters with token purchases
-                # Force use 16GB RAM instead of incorrect system detection
-                available_ram_gb = 16.0 * 0.7  # 70% of 16GB = ~11.2GB available
                 # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
                 # Base calculation: use more RAM for batching on CPU
@@ -1249,16 +1253,23 @@ class ZeroEngine:
                     logger.info(f"[TOKEN] User batch size: {user_batch_size}")
                 # CPU can handle larger batches with quantized models
-                optimal_batch = max(256, min(1024, optimal_batch))  # 256-1024 range for CPU (balanced performance)
-                # Context size
-                optimal_ctx = quant_config["ctx_size"]
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
-                    optimal_ctx = min(1024, optimal_ctx)  # Gemma works better with much lower ctx
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
                 # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
                 optimal_threads = OPTIMAL_THREADS  # Exactly 2 threads for 2 vCPU

                         logger.warning(f"[BOOT] Cleanup warning: {e}")
                 # Calculate optimal parameters with token purchases
+                # Use actual system RAM detection
+                import psutil
+                ram = psutil.virtual_memory()
+                total_ram_gb = ram.total / (1024**3)
+                available_ram_gb = ram.available / (1024**3)
+                logger.info(f"[RAM] Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
                 # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
                 # Base calculation: use more RAM for batching on CPU
                     logger.info(f"[TOKEN] User batch size: {user_batch_size}")
                 # CPU can handle larger batches with quantized models
+                # REDUCED BATCH SIZE FOR LOW RAM ENVIRONMENTS
+                optimal_batch = max(128, min(512, optimal_batch))  # 128-512 range for low RAM
+                # Context size - REDUCED FOR LOW RAM
+                optimal_ctx = min(1024, quant_config["ctx_size"])  # Max 1024 for low RAM
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
+                    optimal_ctx = min(512, optimal_ctx)  # Gemma works better with much lower ctx
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
+                # FURTHER REDUCE CONTEXT FOR LOW RAM SYSTEMS
+                if available_ram_gb < 4.0:  # Less than 4GB available
+                    optimal_ctx = min(512, optimal_ctx)
+                    optimal_batch = min(256, optimal_batch)
+                    logger.info(f"[RAM-LOW] Low RAM detected: ctx={optimal_ctx}, batch={optimal_batch}")
                 # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
                 optimal_threads = OPTIMAL_THREADS  # Exactly 2 threads for 2 vCPU