Spaces:

turtle170
/

ZeroEngine

Running

turtle170 committed on Jan 31

Commit

814783f

verified ·

1 Parent(s): 7e42957

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -154,17 +154,17 @@ boost_cpu_frequency()
 optimize_memory_layout()
 QUANT_OPTIMIZATIONS = {
-    "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
-    "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
-    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
-    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
-    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
-    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
-    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
-    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
-    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
-    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
-    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
 }
 # Model format/architecture detection patterns
@@ -895,7 +895,7 @@ class ZeroEngine:
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
-                    optimal_ctx = min(8192, optimal_ctx)  # Gemma works better with lower ctx
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
                 # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU

 optimize_memory_layout()
 QUANT_OPTIMIZATIONS = {
+    "BF16": {"batch_multiplier": 0.4, "ctx_size": 2048, "threads_boost": 1.0},
+    "F16": {"batch_multiplier": 0.5, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 2048, "threads_boost": 1.0},
+    "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 2048, "threads_boost": 1.0},
 }
 # Model format/architecture detection patterns
                 # Reduce context for Gemma models (they have 131K n_ctx_train)
                 if model_format == "gemma":
+                    optimal_ctx = min(1024, optimal_ctx)  # Gemma works better with much lower ctx
                     logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
                 # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU