turtle170 committed on
Commit
91ebf27
·
verified ·
1 Parent(s): a2ffed4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -36
app.py CHANGED
@@ -1234,58 +1234,44 @@ class ZeroEngine:
1234
  logger.warning(f"[BOOT] Cleanup warning: {e}")
1235
 
1236
  # Calculate optimal parameters with token purchases
1237
- # FIX: Use container RAM limits, not host system memory
1238
  import psutil
1239
  ram = psutil.virtual_memory()
1240
- total_ram_gb = ram.total / (1024**3)
1241
- available_ram_gb = ram.available / (1024**3)
1242
 
1243
- # CRITICAL FIX: Force realistic container limits for Hugging Face Spaces
1244
- # The host shows 123.8GB but container only has 16GB total
1245
- if total_ram_gb > 50.0: # Host system memory detected
1246
- logger.warning(f"[RAM] Host system memory detected ({total_ram_gb:.1f}GB), forcing container limits")
1247
- total_ram_gb = 16.0 # Container limit
1248
- available_ram_gb = min(available_ram_gb, 11.0) # Conservative available RAM
1249
- logger.info(f"[RAM] FORCED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
1250
- else:
1251
- logger.info(f"[RAM] Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
1252
 
1253
- # CPU-OPTIMIZED BATCH CALCULATION - Very aggressive for 16GB RAM
1254
- # Base calculation: use more RAM for batching on CPU
1255
- base_batch = int(512 * available_ram_gb / 8) # More aggressive base
1256
- optimal_batch = base_batch
 
 
 
1257
 
1258
  # Apply user's batch size from token purchases
1259
  if session_id:
1260
  user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
1261
- optimal_batch = user_batch_size
1262
- logger.info(f"[TOKEN] User batch size: {user_batch_size}")
1263
 
1264
- # CPU can handle larger batches with quantized models
1265
- # REDUCED BATCH SIZE FOR LOW RAM ENVIRONMENTS
1266
- optimal_batch = max(128, min(512, optimal_batch)) # 128-512 range for low RAM
 
1267
 
1268
- # Context size - REDUCED FOR LOW RAM
1269
- optimal_ctx = min(1024, quant_config["ctx_size"]) # Max 1024 for low RAM
1270
 
1271
  # Reduce context for Gemma models (they have 131K n_ctx_train)
1272
  if model_format == "gemma":
1273
- optimal_ctx = min(512, optimal_ctx) # Gemma works better with much lower ctx
1274
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
1275
 
1276
- # FURTHER REDUCE CONTEXT FOR LOW RAM SYSTEMS
1277
- if available_ram_gb < 4.0: # Less than 4GB available
1278
- optimal_ctx = min(512, optimal_ctx)
1279
- optimal_batch = min(256, optimal_batch)
1280
- logger.info(f"[RAM-LOW] Low RAM detected: ctx={optimal_ctx}, batch={optimal_batch}")
1281
-
1282
- # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
1283
- optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
1284
-
1285
  # Apply CPU optimizations before model loading
1286
- if LOW_LATENCY_MODE:
1287
- optimize_cpu_performance()
1288
- boost_cpu_frequency()
1289
 
1290
  logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
1291
 
 
1234
  logger.warning(f"[BOOT] Cleanup warning: {e}")
1235
 
1236
  # Calculate optimal parameters with token purchases
1237
+ # HARD-CODE: Force 16GB total RAM for Hugging Face Spaces reliability
1238
  import psutil
1239
  ram = psutil.virtual_memory()
1240
+ total_ram_gb = 16.0 # HARD-CODED: 16GB total for container
1241
+ available_ram_gb = 6.0 # HARD-CODED: 6GB available for model
1242
 
1243
+ logger.info(f"[RAM] HARD-CODED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
1244
+ logger.info(f"[RAM] (Ignoring host system {ram.total/(1024**3):.1f}GB)")
 
 
 
 
 
 
 
1245
 
1246
+ # Define missing variables
1247
+ session_id = session_id if session_id else None
1248
+ token_manager = TokenManager() # Assuming TokenManager is defined elsewhere
1249
+ user_batch_size = None
1250
+ optimal_batch = None
1251
+ optimal_ctx = None
1252
+ optimal_threads = None
1253
 
1254
  # Apply user's batch size from token purchases
1255
  if session_id:
1256
  user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
1257
+ # IGNORE user batch size - use conservative settings for reliability
1258
+ logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
1259
 
1260
+ # ULTRA-CONSERVATIVE SETTINGS FOR 6GB AVAILABLE RAM
1261
+ optimal_batch = 128 # FIXED: Very conservative batch size
1262
+ optimal_ctx = 256 # FIXED: Very conservative context size
1263
+ optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
1264
 
1265
+ logger.info(f"[RAM] ULTRA-CONSERVATIVE: batch={optimal_batch}, ctx={optimal_ctx}")
 
1266
 
1267
  # Reduce context for Gemma models (they have 131K n_ctx_train)
1268
  if model_format == "gemma":
1269
+ optimal_ctx = 128 # EXTREMELY conservative for Gemma
1270
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
1271
 
 
 
 
 
 
 
 
 
 
1272
  # Apply CPU optimizations before model loading
1273
+ optimize_cpu_performance()
1274
+ boost_cpu_frequency()
 
1275
 
1276
  logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
1277