turtle170 committed on
Commit
0c27b45
·
verified ·
1 Parent(s): 8806e23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -6
app.py CHANGED
@@ -1234,8 +1234,12 @@ class ZeroEngine:
1234
  logger.warning(f"[BOOT] Cleanup warning: {e}")
1235
 
1236
  # Calculate optimal parameters with token purchases
1237
- # Force use 16GB RAM instead of incorrect system detection
1238
- available_ram_gb = 16.0 * 0.7 # 70% of 16GB = ~11.2GB available
 
 
 
 
1239
 
1240
  # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
1241
  # Base calculation: use more RAM for batching on CPU
@@ -1249,16 +1253,23 @@ class ZeroEngine:
1249
  logger.info(f"[TOKEN] User batch size: {user_batch_size}")
1250
 
1251
  # CPU can handle larger batches with quantized models
1252
- optimal_batch = max(256, min(1024, optimal_batch)) # 256-1024 range for CPU (balanced performance)
 
1253
 
1254
- # Context size
1255
- optimal_ctx = quant_config["ctx_size"]
1256
 
1257
  # Reduce context for Gemma models (they have 131K n_ctx_train)
1258
  if model_format == "gemma":
1259
- optimal_ctx = min(1024, optimal_ctx) # Gemma works better with much lower ctx
1260
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
1261
 
 
 
 
 
 
 
1262
  # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
1263
  optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
1264
 
 
1234
  logger.warning(f"[BOOT] Cleanup warning: {e}")
1235
 
1236
  # Calculate optimal parameters with token purchases
1237
+ # Use actual system RAM detection
1238
+ import psutil
1239
+ ram = psutil.virtual_memory()
1240
+ total_ram_gb = ram.total / (1024**3)
1241
+ available_ram_gb = ram.available / (1024**3)
1242
+ logger.info(f"[RAM] Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
1243
 
1244
  # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
1245
  # Base calculation: use more RAM for batching on CPU
 
1253
  logger.info(f"[TOKEN] User batch size: {user_batch_size}")
1254
 
1255
  # CPU can handle larger batches with quantized models
1256
+ # REDUCED BATCH SIZE FOR LOW RAM ENVIRONMENTS
1257
+ optimal_batch = max(128, min(512, optimal_batch)) # 128-512 range for low RAM
1258
 
1259
+ # Context size - REDUCED FOR LOW RAM
1260
+ optimal_ctx = min(1024, quant_config["ctx_size"]) # Max 1024 for low RAM
1261
 
1262
  # Reduce context for Gemma models (they have 131K n_ctx_train)
1263
  if model_format == "gemma":
1264
+ optimal_ctx = min(512, optimal_ctx) # Gemma works better with much lower ctx
1265
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
1266
 
1267
+ # FURTHER REDUCE CONTEXT FOR LOW RAM SYSTEMS
1268
+ if available_ram_gb < 4.0: # Less than 4GB available
1269
+ optimal_ctx = min(512, optimal_ctx)
1270
+ optimal_batch = min(256, optimal_batch)
1271
+ logger.info(f"[RAM-LOW] Low RAM detected: ctx={optimal_ctx}, batch={optimal_batch}")
1272
+
1273
  # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
1274
  optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
1275