Luigi committed on
Commit
510e0aa
·
1 Parent(s): b39005f

feat: Add thinking headroom for max_tokens and Qwen3-30B model

Browse files

- Added calculate_effective_max_tokens() function to extend max_tokens by 50%
for thinking models when reasoning is enabled
- Adjusted max_tokens before n_ctx calculation to ensure proper context sizing
- Added Qwen3-30B-A3B-Thinking model with TQ1_0 quantization
- Updated info display to show when max_tokens has been adjusted
- Models affected: all Qwen3 (toggle/thinking), ERNIE 21B, GLM-4.7-Flash, Qwen3-30B

Files changed (1) hide show
  1. app.py +62 -1
app.py CHANGED
@@ -190,6 +190,20 @@ AVAILABLE_MODELS = {
190
  "repeat_penalty": 1.0,
191
  },
192
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  "granite4_tiny_q3": {
194
  "name": "Granite 4.0 Tiny 7B (128K Context)",
195
  "repo_id": "unsloth/granite-4.0-h-tiny-GGUF",
@@ -407,6 +421,44 @@ def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int, enable_rea
407
  return n_ctx, warning
408
 
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
411
  """Get model information and inference settings for UI display.
412
 
@@ -520,6 +572,12 @@ def summarize_streaming(
520
  model = AVAILABLE_MODELS[model_key]
521
  usable_max = min(model["max_context"], MAX_USABLE_CTX)
522
 
 
 
 
 
 
 
523
  # Validate max_tokens fits in context
524
  if max_tokens > usable_max - 512:
525
  max_tokens = usable_max - 512
@@ -574,11 +632,14 @@ def summarize_streaming(
574
 
575
  # Build info text
576
  input_tokens = estimate_tokens(transcript)
 
 
 
577
  info = (
578
  f"**Model:** {model['name']}\n\n"
579
  f"**Context:** {n_ctx:,} tokens | "
580
  f"**Input:** ~{input_tokens:,} tokens | "
581
- f"**Max output:** {max_tokens:,} tokens"
582
  )
583
  if warning:
584
  info += f"\n\n{warning}"
 
190
  "repeat_penalty": 1.0,
191
  },
192
  },
193
+ "qwen3_30b_thinking_q1": {
194
+ "name": "Qwen3 30B Thinking (256K Context)",
195
+ "repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
196
+ "filename": "*TQ1_0.gguf",
197
+ "max_context": 262144,
198
+ "default_temperature": 0.6,
199
+ "supports_toggle": False, # Thinking-only mode
200
+ "inference_settings": {
201
+ "temperature": 0.6,
202
+ "top_p": 0.95,
203
+ "top_k": 20,
204
+ "repeat_penalty": 1.0,
205
+ },
206
+ },
207
  "granite4_tiny_q3": {
208
  "name": "Granite 4.0 Tiny 7B (128K Context)",
209
  "repo_id": "unsloth/granite-4.0-h-tiny-GGUF",
 
421
  return n_ctx, warning
422
 
423
 
424
+ def calculate_effective_max_tokens(model_key: str, max_tokens: int, enable_reasoning: bool) -> int:
425
+ """
426
+ Calculate effective max_tokens with thinking headroom for reasoning models.
427
+
428
+ When reasoning is enabled for thinking-capable models, adds 50% headroom
429
+ to accommodate both thinking process and final output.
430
+
431
+ Args:
432
+ model_key: Model identifier from AVAILABLE_MODELS
433
+ max_tokens: User-specified maximum tokens
434
+ enable_reasoning: Whether reasoning mode is enabled
435
+
436
+ Returns:
437
+ Adjusted max_tokens value (1.5x for reasoning models, unchanged otherwise)
438
+ """
439
+ if not enable_reasoning:
440
+ return max_tokens
441
+
442
+ model_config = AVAILABLE_MODELS.get(model_key)
443
+ if not model_config:
444
+ return max_tokens
445
+
446
+ # Check if model supports reasoning/thinking
447
+ is_thinking_model = (
448
+ model_config.get("supports_toggle", False) or
449
+ "thinking" in model_key.lower()
450
+ )
451
+
452
+ if is_thinking_model:
453
+ # Add 50% headroom for thinking process
454
+ thinking_headroom = int(max_tokens * 0.5)
455
+ effective_max = max_tokens + thinking_headroom
456
+ logger.info(f"Reasoning enabled for {model_key}: extending max_tokens from {max_tokens} to {effective_max}")
457
+ return effective_max
458
+
459
+ return max_tokens
460
+
461
+
462
  def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
463
  """Get model information and inference settings for UI display.
464
 
 
572
  model = AVAILABLE_MODELS[model_key]
573
  usable_max = min(model["max_context"], MAX_USABLE_CTX)
574
 
575
+ # Adjust max_tokens for thinking models when reasoning is enabled
576
+ original_max_tokens = max_tokens
577
+ max_tokens = calculate_effective_max_tokens(model_key, max_tokens, enable_reasoning)
578
+ if max_tokens != original_max_tokens:
579
+ logger.info(f"Adjusted max_tokens from {original_max_tokens} to {max_tokens} for reasoning mode")
580
+
581
  # Validate max_tokens fits in context
582
  if max_tokens > usable_max - 512:
583
  max_tokens = usable_max - 512
 
632
 
633
  # Build info text
634
  input_tokens = estimate_tokens(transcript)
635
+ max_output_text = f"{max_tokens:,} tokens"
636
+ if max_tokens != original_max_tokens:
637
+ max_output_text += f" (adjusted from {original_max_tokens:,} for thinking mode)"
638
  info = (
639
  f"**Model:** {model['name']}\n\n"
640
  f"**Context:** {n_ctx:,} tokens | "
641
  f"**Input:** ~{input_tokens:,} tokens | "
642
+ f"**Max output:** {max_output_text}"
643
  )
644
  if warning:
645
  info += f"\n\n{warning}"