turtle170 committed on
Commit
814783f
·
verified ·
1 Parent(s): 7e42957

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -12
app.py CHANGED
@@ -154,17 +154,17 @@ boost_cpu_frequency()
154
  optimize_memory_layout()
155
 
156
  QUANT_OPTIMIZATIONS = {
157
- "BF16": {"batch_multiplier": 0.4, "ctx_size": 4096, "threads_boost": 1.0},
158
- "F16": {"batch_multiplier": 0.5, "ctx_size": 4096, "threads_boost": 1.0},
159
- "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 8192, "threads_boost": 1.0},
160
- "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 8192, "threads_boost": 1.0},
161
- "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
162
- "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 12288, "threads_boost": 1.0},
163
- "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
164
- "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 16384, "threads_boost": 1.0},
165
- "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 16384, "threads_boost": 1.0},
166
- "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 20480, "threads_boost": 1.0},
167
- "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 24576, "threads_boost": 1.0},
168
  }
169
 
170
  # Model format/architecture detection patterns
@@ -895,7 +895,7 @@ class ZeroEngine:
895
 
896
  # Reduce context for Gemma models (they have 131K n_ctx_train)
897
  if model_format == "gemma":
898
- optimal_ctx = min(8192, optimal_ctx) # Gemma works better with lower ctx
899
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
900
 
901
  # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU
 
154
  optimize_memory_layout()
155
 
156
  QUANT_OPTIMIZATIONS = {
157
+ "BF16": {"batch_multiplier": 0.4, "ctx_size": 2048, "threads_boost": 1.0},
158
+ "F16": {"batch_multiplier": 0.5, "ctx_size": 2048, "threads_boost": 1.0},
159
+ "Q8_0": {"batch_multiplier": 1.0, "ctx_size": 2048, "threads_boost": 1.0},
160
+ "Q6_K": {"batch_multiplier": 1.2, "ctx_size": 2048, "threads_boost": 1.0},
161
+ "Q5_K_M": {"batch_multiplier": 1.5, "ctx_size": 2048, "threads_boost": 1.0},
162
+ "Q5_K_S": {"batch_multiplier": 1.5, "ctx_size": 2048, "threads_boost": 1.0},
163
+ "Q4_K_M": {"batch_multiplier": 2.0, "ctx_size": 2048, "threads_boost": 1.0},
164
+ "Q4_K_S": {"batch_multiplier": 2.0, "ctx_size": 2048, "threads_boost": 1.0},
165
+ "Q4_0": {"batch_multiplier": 2.2, "ctx_size": 2048, "threads_boost": 1.0},
166
+ "Q3_K_M": {"batch_multiplier": 2.5, "ctx_size": 2048, "threads_boost": 1.0},
167
+ "Q2_K": {"batch_multiplier": 3.0, "ctx_size": 2048, "threads_boost": 1.0},
168
  }
169
 
170
  # Model format/architecture detection patterns
 
895
 
896
  # Reduce context for Gemma models (they have 131K n_ctx_train)
897
  if model_format == "gemma":
898
+ optimal_ctx = min(1024, optimal_ctx) # Gemma works better with much lower ctx
899
  logger.info(f"[FORMAT] Gemma detected: reducing context to {optimal_ctx}")
900
 
901
  # Thread optimization - ULTRA AGGRESSIVE FOR 2 vCPU