Spaces:
Running
Running
feat: Add thinking headroom for max_tokens and Qwen3-30B model
- Added calculate_effective_max_tokens() function to extend max_tokens by 50%
for thinking models when reasoning is enabled
- Adjusted max_tokens before n_ctx calculation to ensure proper context sizing
- Added Qwen3-30B-A3B-Thinking model with TQ1_0 quantization
- Updated info display to show when max_tokens has been adjusted
- Models affected: all Qwen3 (toggle/thinking), ERNIE 21B, GLM-4.7-Flash, Qwen3-30B
app.py
CHANGED
|
@@ -190,6 +190,20 @@ AVAILABLE_MODELS = {
|
|
| 190 |
"repeat_penalty": 1.0,
|
| 191 |
},
|
| 192 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
"granite4_tiny_q3": {
|
| 194 |
"name": "Granite 4.0 Tiny 7B (128K Context)",
|
| 195 |
"repo_id": "unsloth/granite-4.0-h-tiny-GGUF",
|
|
@@ -407,6 +421,44 @@ def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int, enable_rea
|
|
| 407 |
return n_ctx, warning
|
| 408 |
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
|
| 411 |
"""Get model information and inference settings for UI display.
|
| 412 |
|
|
@@ -520,6 +572,12 @@ def summarize_streaming(
|
|
| 520 |
model = AVAILABLE_MODELS[model_key]
|
| 521 |
usable_max = min(model["max_context"], MAX_USABLE_CTX)
|
| 522 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
# Validate max_tokens fits in context
|
| 524 |
if max_tokens > usable_max - 512:
|
| 525 |
max_tokens = usable_max - 512
|
|
@@ -574,11 +632,14 @@ def summarize_streaming(
|
|
| 574 |
|
| 575 |
# Build info text
|
| 576 |
input_tokens = estimate_tokens(transcript)
|
|
|
|
|
|
|
|
|
|
| 577 |
info = (
|
| 578 |
f"**Model:** {model['name']}\n\n"
|
| 579 |
f"**Context:** {n_ctx:,} tokens | "
|
| 580 |
f"**Input:** ~{input_tokens:,} tokens | "
|
| 581 |
-
f"**Max output:** {
|
| 582 |
)
|
| 583 |
if warning:
|
| 584 |
info += f"\n\n{warning}"
|
|
|
|
| 190 |
"repeat_penalty": 1.0,
|
| 191 |
},
|
| 192 |
},
|
| 193 |
+
"qwen3_30b_thinking_q1": {
|
| 194 |
+
"name": "Qwen3 30B Thinking (256K Context)",
|
| 195 |
+
"repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
|
| 196 |
+
"filename": "*TQ1_0.gguf",
|
| 197 |
+
"max_context": 262144,
|
| 198 |
+
"default_temperature": 0.6,
|
| 199 |
+
"supports_toggle": False, # Thinking-only mode
|
| 200 |
+
"inference_settings": {
|
| 201 |
+
"temperature": 0.6,
|
| 202 |
+
"top_p": 0.95,
|
| 203 |
+
"top_k": 20,
|
| 204 |
+
"repeat_penalty": 1.0,
|
| 205 |
+
},
|
| 206 |
+
},
|
| 207 |
"granite4_tiny_q3": {
|
| 208 |
"name": "Granite 4.0 Tiny 7B (128K Context)",
|
| 209 |
"repo_id": "unsloth/granite-4.0-h-tiny-GGUF",
|
|
|
|
| 421 |
return n_ctx, warning
|
| 422 |
|
| 423 |
|
| 424 |
+
def calculate_effective_max_tokens(model_key: str, max_tokens: int, enable_reasoning: bool) -> int:
    """Return max_tokens, extended with headroom for thinking models.

    Reasoning-capable models emit a thinking trace before the final answer,
    so their output budget is grown by 50% while reasoning is enabled; in
    every other case the user's value passes through untouched.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS.
        max_tokens: User-specified maximum tokens.
        enable_reasoning: Whether reasoning mode is enabled.

    Returns:
        1.5x max_tokens for reasoning-enabled thinking models, unchanged otherwise.
    """
    # Guard: nothing to do unless reasoning is actually turned on.
    if not enable_reasoning:
        return max_tokens

    config = AVAILABLE_MODELS.get(model_key)
    if not config:
        # Unknown (or empty) model entry — leave the budget alone.
        return max_tokens

    # A model counts as "thinking" if it exposes the reasoning toggle or
    # is a thinking-only variant (identified by its key).
    supports_thinking = config.get("supports_toggle", False) or "thinking" in model_key.lower()
    if not supports_thinking:
        return max_tokens

    # Grant 50% extra budget so the thinking trace and the answer both fit.
    effective_max = max_tokens + int(max_tokens * 0.5)
    logger.info(f"Reasoning enabled for {model_key}: extending max_tokens from {max_tokens} to {effective_max}")
    return effective_max
|
| 460 |
+
|
| 461 |
+
|
| 462 |
def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
|
| 463 |
"""Get model information and inference settings for UI display.
|
| 464 |
|
|
|
|
| 572 |
model = AVAILABLE_MODELS[model_key]
|
| 573 |
usable_max = min(model["max_context"], MAX_USABLE_CTX)
|
| 574 |
|
| 575 |
+
# Adjust max_tokens for thinking models when reasoning is enabled
|
| 576 |
+
original_max_tokens = max_tokens
|
| 577 |
+
max_tokens = calculate_effective_max_tokens(model_key, max_tokens, enable_reasoning)
|
| 578 |
+
if max_tokens != original_max_tokens:
|
| 579 |
+
logger.info(f"Adjusted max_tokens from {original_max_tokens} to {max_tokens} for reasoning mode")
|
| 580 |
+
|
| 581 |
# Validate max_tokens fits in context
|
| 582 |
if max_tokens > usable_max - 512:
|
| 583 |
max_tokens = usable_max - 512
|
|
|
|
| 632 |
|
| 633 |
# Build info text
|
| 634 |
input_tokens = estimate_tokens(transcript)
|
| 635 |
+
max_output_text = f"{max_tokens:,} tokens"
|
| 636 |
+
if max_tokens != original_max_tokens:
|
| 637 |
+
max_output_text += f" (adjusted from {original_max_tokens:,} for thinking mode)"
|
| 638 |
info = (
|
| 639 |
f"**Model:** {model['name']}\n\n"
|
| 640 |
f"**Context:** {n_ctx:,} tokens | "
|
| 641 |
f"**Input:** ~{input_tokens:,} tokens | "
|
| 642 |
+
f"**Max output:** {max_output_text}"
|
| 643 |
)
|
| 644 |
if warning:
|
| 645 |
info += f"\n\n{warning}"
|