Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1234,58 +1234,44 @@ class ZeroEngine:
|
|
| 1234 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 1235 |
|
| 1236 |
# Calculate optimal parameters with token purchases
|
| 1237 |
-
#
|
| 1238 |
import psutil
|
| 1239 |
ram = psutil.virtual_memory()
|
| 1240 |
-
total_ram_gb =
|
| 1241 |
-
available_ram_gb =
|
| 1242 |
|
| 1243 |
-
|
| 1244 |
-
|
| 1245 |
-
if total_ram_gb > 50.0: # Host system memory detected
|
| 1246 |
-
logger.warning(f"[RAM] Host system memory detected ({total_ram_gb:.1f}GB), forcing container limits")
|
| 1247 |
-
total_ram_gb = 16.0 # Container limit
|
| 1248 |
-
available_ram_gb = min(available_ram_gb, 11.0) # Conservative available RAM
|
| 1249 |
-
logger.info(f"[RAM] FORCED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
|
| 1250 |
-
else:
|
| 1251 |
-
logger.info(f"[RAM] Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
|
| 1252 |
|
| 1253 |
-
#
|
| 1254 |
-
|
| 1255 |
-
|
| 1256 |
-
|
|
|
|
|
|
|
|
|
|
| 1257 |
|
| 1258 |
# Apply user's batch size from token purchases
|
| 1259 |
if session_id:
|
| 1260 |
user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
|
| 1261 |
-
|
| 1262 |
-
logger.info(f"[TOKEN] User batch size: {user_batch_size}")
|
| 1263 |
|
| 1264 |
-
#
|
| 1265 |
-
|
| 1266 |
-
|
|
|
|
| 1267 |
|
| 1268 |
-
|
| 1269 |
-
optimal_ctx = min(1024, quant_config["ctx_size"]) # Max 1024 for low RAM
|
| 1270 |
|
| 1271 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1272 |
if model_format == "gemma":
|
| 1273 |
-
optimal_ctx =
|
| 1274 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1275 |
|
| 1276 |
-
# FURTHER REDUCE CONTEXT FOR LOW RAM SYSTEMS
|
| 1277 |
-
if available_ram_gb < 4.0: # Less than 4GB available
|
| 1278 |
-
optimal_ctx = min(512, optimal_ctx)
|
| 1279 |
-
optimal_batch = min(256, optimal_batch)
|
| 1280 |
-
logger.info(f"[RAM-LOW] Low RAM detected: ctx={optimal_ctx}, batch={optimal_batch}")
|
| 1281 |
-
|
| 1282 |
-
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|
| 1283 |
-
optimal_threads = OPTIMAL_THREADS # Exactly 2 threads for 2 vCPU
|
| 1284 |
-
|
| 1285 |
# Apply CPU optimizations before model loading
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
-
boost_cpu_frequency()
|
| 1289 |
|
| 1290 |
logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
|
| 1291 |
|
|
|
|
| 1234 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 1235 |
|
| 1236 |
# Calculate optimal parameters with token purchases
|
| 1237 |
+
# HARD-CODE: Force 16GB total RAM for Hugging Face Spaces reliability
|
| 1238 |
import psutil
|
| 1239 |
ram = psutil.virtual_memory()
|
| 1240 |
+
total_ram_gb = 16.0 # HARD-CODED: 16GB total for container
|
| 1241 |
+
available_ram_gb = 6.0 # HARD-CODED: 6GB available for model
|
| 1242 |
|
| 1243 |
+
logger.info(f"[RAM] HARD-CODED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
|
| 1244 |
+
logger.info(f"[RAM] (Ignoring host system {ram.total/(1024**3):.1f}GB)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1245 |
|
| 1246 |
+
# Define missing variables
|
| 1247 |
+
session_id = session_id if session_id else None
|
| 1248 |
+
token_manager = TokenManager() # Assuming TokenManager is defined elsewhere
|
| 1249 |
+
user_batch_size = None
|
| 1250 |
+
optimal_batch = None
|
| 1251 |
+
optimal_ctx = None
|
| 1252 |
+
optimal_threads = None
|
| 1253 |
|
| 1254 |
# Apply user's batch size from token purchases
|
| 1255 |
if session_id:
|
| 1256 |
user_batch_size = token_manager.get_purchases(session_id)["batch_size"]
|
| 1257 |
+
# IGNORE user batch size - use conservative settings for reliability
|
| 1258 |
+
logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
|
| 1259 |
|
| 1260 |
+
# ULTRA-CONSERVATIVE SETTINGS FOR 6GB AVAILABLE RAM
|
| 1261 |
+
optimal_batch = 128 # FIXED: Very conservative batch size
|
| 1262 |
+
optimal_ctx = 256 # FIXED: Very conservative context size
|
| 1263 |
+
optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
|
| 1264 |
|
| 1265 |
+
logger.info(f"[RAM] ULTRA-CONSERVATIVE: batch={optimal_batch}, ctx={optimal_ctx}")
|
|
|
|
| 1266 |
|
| 1267 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1268 |
if model_format == "gemma":
|
| 1269 |
+
optimal_ctx = 128 # EXTREMELY conservative for Gemma
|
| 1270 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1272 |
# Apply CPU optimizations before model loading
|
| 1273 |
+
optimize_cpu_performance()
|
| 1274 |
+
boost_cpu_frequency()
|
|
|
|
| 1275 |
|
| 1276 |
logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 16GB RAM")
|
| 1277 |
|