Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -154,17 +154,17 @@ boost_cpu_frequency()
|
|
| 154 |
optimize_memory_layout()
|
| 155 |
|
| 156 |
QUANT_OPTIMIZATIONS = {
|
| 157 |
-
"BF16": {"batch_multiplier": 0.4, "ctx_size":
|
| 158 |
-
"F16": {"batch_multiplier": 0.5, "ctx_size":
|
| 159 |
-
"Q8_0": {"batch_multiplier": 1.0, "ctx_size":
|
| 160 |
-
"Q6_K": {"batch_multiplier": 1.2, "ctx_size":
|
| 161 |
-
"Q5_K_M": {"batch_multiplier": 1.5, "ctx_size":
|
| 162 |
-
"Q5_K_S": {"batch_multiplier": 1.5, "ctx_size":
|
| 163 |
-
"Q4_K_M": {"batch_multiplier": 2.0, "ctx_size":
|
| 164 |
-
"Q4_K_S": {"batch_multiplier": 2.0, "ctx_size":
|
| 165 |
-
"Q4_0": {"batch_multiplier": 2.2, "ctx_size":
|
| 166 |
-
"Q3_K_M": {"batch_multiplier": 2.5, "ctx_size":
|
| 167 |
-
"Q2_K": {"batch_multiplier": 3.0, "ctx_size":
|
| 168 |
}
|
| 169 |
|
| 170 |
# Model format/architecture detection patterns
|
|
@@ -895,7 +895,7 @@ class ZeroEngine:
|
|
| 895 |
|
| 896 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 897 |
if model_format == "gemma":
|
| 898 |
-
optimal_ctx = min(
|
| 899 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 900 |
|
| 901 |
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|
|
|
|
| 154 |
optimize_memory_layout()
|
| 155 |
|
| 156 |
QUANT_OPTIMIZATIONS = {
|
| 157 |
+
"BF16": {"batch_multiplier": 0.4, "ctx_size": 2048, "threads_boost": 1.0},
|
| 158 |
+
"F16": {"batch_multiplier": 0.5, "ctx_size": 2048, "threads_boost": 1.0},
|
| 159 |
+
"Q8_0": {"batch_multiplier": 1.0, "ctx_size": 2048, "threads_boost": 1.0},
|
| 160 |
+
"Q6_K": {"batch_multiplier": 1.2, "ctx_size": 2048, "threads_boost": 1.0},
|
| 161 |
+
"Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 2048, "threads_boost": 1.0},
|
| 162 |
+
"Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 2048, "threads_boost": 1.0},
|
| 163 |
+
"Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 2048, "threads_boost": 1.0},
|
| 164 |
+
"Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 2048, "threads_boost": 1.0},
|
| 165 |
+
"Q4_0": {"batch_multiplier": 2.2, "ctx_size": 2048, "threads_boost": 1.0},
|
| 166 |
+
"Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 2048, "threads_boost": 1.0},
|
| 167 |
+
"Q2_K": {"batch_multiplier": 3.0, "ctx_size": 2048, "threads_boost": 1.0},
|
| 168 |
}
|
| 169 |
|
| 170 |
# Model format/architecture detection patterns
|
|
|
|
| 895 |
|
| 896 |
# Reduce context for Gemma models (they have 131K n_ctx_train)
|
| 897 |
if model_format == "gemma":
|
| 898 |
+
optimal_ctx = min(1024, optimal_ctx) # Gemma works better with much lower ctx
|
| 899 |
logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
|
| 900 |
|
| 901 |
# Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
|