Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1234,8 +1234,12 @@ class ZeroEngine:
|
|
| 1234 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 1235 |
|
| 1236 |
# Calculate optimal parameters with token purchases
|
| 1237 |
-
#
|
| 1238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1239 |
|
| 1240 |
# CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
|
| 1241 |
# Base calculation: use more RAM for batching on CPU
|
|
@@ -1249,16 +1253,23 @@ class ZeroEngine:
|
|
| 1249 |
logger.info(f"[TOKEN] User batch size: {user_batch_size}")
|
| 1250 |
|
| 1251 |
# CPU can handle larger batches with quantized models
|
| 1252 |
-
|
|
|
|
| 1253 |
|
| 1254 |
-
# Context size
|
| 1255 |
-
optimal_ctx = quant_config["ctx_size"]
|
| 1256 |
|
| 1257 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1258 |
if model_format == "gemma":
|
| 1259 |
-
optimal_ctx = min(
|
| 1260 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1262 |
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|
| 1263 |
optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
|
| 1264 |
|
|
|
|
| 1234 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 1235 |
|
| 1236 |
# Calculate optimal parameters with token purchases
|
| 1237 |
+
# Use actual system RAM detection
|
| 1238 |
+
import psutil
|
| 1239 |
+
ram = psutil.virtual_memory()
|
| 1240 |
+
total_ram_gb = ram.total / (1024**3)
|
| 1241 |
+
available_ram_gb = ram.available / (1024**3)
|
| 1242 |
+
logger.info(f"[RAM] Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
|
| 1243 |
|
| 1244 |
# CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
|
| 1245 |
# Base calculation: use more RAM for batching on CPU
|
|
|
|
| 1253 |
logger.info(f"[TOKEN] User batch size: {user_batch_size}")
|
| 1254 |
|
| 1255 |
# CPU can handle larger batches with quantized models
|
| 1256 |
+
# REDUCED BATCH SIZE FOR LOW RAM ENVIRONMENTS
|
| 1257 |
+
optimal_batch = max(128, min(512, optimal_batch)) # 128-512 range for low RAM
|
| 1258 |
|
| 1259 |
+
# Context size - REDUCED FOR LOW RAM
|
| 1260 |
+
optimal_ctx = min(1024, quant_config["ctx_size"]) # Max 1024 for low RAM
|
| 1261 |
|
| 1262 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1263 |
if model_format == "gemma":
|
| 1264 |
+
optimal_ctx = min(512, optimal_ctx) # Gemma works better with much lower ctx
|
| 1265 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1266 |
|
| 1267 |
+
# FURTHER REDUCE CONTEXT FOR LOW RAM SYSTEMS
|
| 1268 |
+
if available_ram_gb < 4.0: # Less than 4GB available
|
| 1269 |
+
optimal_ctx = min(512, optimal_ctx)
|
| 1270 |
+
optimal_batch = min(256, optimal_batch)
|
| 1271 |
+
logger.info(f"[RAM-LOW] Low RAM detected: ctx={optimal_ctx}, batch={optimal_batch}")
|
| 1272 |
+
|
| 1273 |
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|
| 1274 |
optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
|
| 1275 |
|