Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1234,11 +1234,11 @@ class ZeroEngine:
|
|
| 1234 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 1235 |
|
| 1236 |
# Calculate optimal parameters with token purchases
|
| 1237 |
-
# HARD-CODE: Force
|
| 1238 |
import psutil
|
| 1239 |
ram = psutil.virtual_memory()
|
| 1240 |
-
total_ram_gb =
|
| 1241 |
-
available_ram_gb =
|
| 1242 |
|
| 1243 |
logger.info(f"[RAM] HARD-CODED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
|
| 1244 |
logger.info(f"[RAM] (Ignoring host system {ram.total/(1024**3):.1f}GB)")
|
|
@@ -1257,23 +1257,23 @@ class ZeroEngine:
|
|
| 1257 |
# IGNORE user batch size - use conservative settings for reliability
|
| 1258 |
logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
|
| 1259 |
|
| 1260 |
-
#
|
| 1261 |
-
optimal_batch =
|
| 1262 |
-
optimal_ctx =
|
| 1263 |
optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
|
| 1264 |
|
| 1265 |
-
logger.info(f"[RAM]
|
| 1266 |
|
| 1267 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1268 |
if model_format == "gemma":
|
| 1269 |
-
optimal_ctx =
|
| 1270 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1271 |
|
| 1272 |
# Apply CPU optimizations before model loading
|
| 1273 |
optimize_cpu_performance()
|
| 1274 |
boost_cpu_frequency()
|
| 1275 |
|
| 1276 |
-
logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU +
|
| 1277 |
|
| 1278 |
try:
|
| 1279 |
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|
|
|
|
| 1234 |
logger.warning(f"[BOOT] Cleanup warning: {e}")
|
| 1235 |
|
| 1236 |
# Calculate optimal parameters with token purchases
|
| 1237 |
+
# HARD-CODE: Force 18GB total RAM for Hugging Face Spaces reliability
|
| 1238 |
import psutil
|
| 1239 |
ram = psutil.virtual_memory()
|
| 1240 |
+
total_ram_gb = 18.0 # HARD-CODED: 18GB total for container
|
| 1241 |
+
available_ram_gb = 16.0 # HARD-CODED: 16GB usable for model (2GB reserved)
|
| 1242 |
|
| 1243 |
logger.info(f"[RAM] HARD-CODED: Total: {total_ram_gb:.1f}GB, Available: {available_ram_gb:.1f}GB")
|
| 1244 |
logger.info(f"[RAM] (Ignoring host system {ram.total/(1024**3):.1f}GB)")
|
|
|
|
| 1257 |
# IGNORE user batch size - use conservative settings for reliability
|
| 1258 |
logger.info(f"[TOKEN] User batch size: {user_batch_size} (OVERRIDDEN for reliability)")
|
| 1259 |
|
| 1260 |
+
# OPTIMIZED SETTINGS FOR 16GB USABLE RAM
|
| 1261 |
+
optimal_batch = 512 # OPTIMIZED: Good batch size
|
| 1262 |
+
optimal_ctx = 1024 # OPTIMIZED: Good context size
|
| 1263 |
optimal_threads = 2 # FIXED: 2 threads for 2 vCPU
|
| 1264 |
|
| 1265 |
+
logger.info(f"[RAM] OPTIMIZED: batch={optimal_batch}, ctx={optimal_ctx}")
|
| 1266 |
|
| 1267 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 1268 |
if model_format == "gemma":
|
| 1269 |
+
optimal_ctx = 512 # OPTIMIZED for Gemma
|
| 1270 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 1271 |
|
| 1272 |
# Apply CPU optimizations before model loading
|
| 1273 |
optimize_cpu_performance()
|
| 1274 |
boost_cpu_frequency()
|
| 1275 |
|
| 1276 |
+
logger.info(f"[CPU] ULTRA-OPTIMIZED: {optimal_threads} threads for 2 vCPU + 18GB RAM")
|
| 1277 |
|
| 1278 |
try:
|
| 1279 |
logger.info(f"[BOOT] Initializing {model_format.upper()} {quant_config['type']}: threads={optimal_threads}, batch={optimal_batch}, ctx={optimal_ctx}")
|