Spaces:
Running
Running
feat: Add thinking headroom for max_tokens and Qwen3-30B model
- Added calculate_effective_max_tokens() function to extend max_tokens by 50%
for thinking models when reasoning is enabled
- Adjusted max_tokens before n_ctx calculation to ensure proper context sizing
- Added Qwen3-30B-A3B-Thinking model with TQ1_0 quantization
- Updated info display to show when max_tokens has been adjusted
- Models affected: all Qwen3 (toggle/thinking), ERNIE 21B, GLM-4.7-Flash, Qwen3-30B
app.py
CHANGED
|
@@ -190,6 +190,20 @@ AVAILABLE_MODELS = {
|
|
| 190 |
"repeat_penalty": 1.0,
|
| 191 |
},
|
| 192 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
"granite4_tiny_q3": {
|
| 194 |
"name": "Granite 4.0 Tiny 7B (128K Context)",
|
| 195 |
"repo_id": "unsloth/granite-4.0-h-tiny-GGUF",
|
|
@@ -407,6 +421,44 @@ def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int, enable_rea
|
|
| 407 |
return n_ctx, warning
|
| 408 |
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
|
| 411 |
"""Get model information and inference settings for UI display.
|
| 412 |
|
|
@@ -520,6 +572,12 @@ def summarize_streaming(
|
|
| 520 |
model = AVAILABLE_MODELS[model_key]
|
| 521 |
usable_max = min(model["max_context"], MAX_USABLE_CTX)
|
| 522 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
# Validate max_tokens fits in context
|
| 524 |
if max_tokens > usable_max - 512:
|
| 525 |
max_tokens = usable_max - 512
|
|
@@ -574,11 +632,14 @@ def summarize_streaming(
|
|
| 574 |
|
| 575 |
# Build info text
|
| 576 |
input_tokens = estimate_tokens(transcript)
|
|
|
|
|
|
|
|
|
|
| 577 |
info = (
|
| 578 |
f"**Model:** {model['name']}\n\n"
|
| 579 |
f"**Context:** {n_ctx:,} tokens | "
|
| 580 |
f"**Input:** ~{input_tokens:,} tokens | "
|
| 581 |
-
f"**Max output:** {
|
| 582 |
)
|
| 583 |
if warning:
|
| 584 |
info += f"\n\n{warning}"
|
|
|
|
| 190 |
"repeat_penalty": 1.0,
|
| 191 |
},
|
| 192 |
},
|
| 193 |
+
"qwen3_30b_thinking_q1": {
|
| 194 |
+
"name": "Qwen3 30B Thinking (256K Context)",
|
| 195 |
+
"repo_id": "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
|
| 196 |
+
"filename": "*TQ1_0.gguf",
|
| 197 |
+
"max_context": 262144,
|
| 198 |
+
"default_temperature": 0.6,
|
| 199 |
+
"supports_toggle": False, # Thinking-only mode
|
| 200 |
+
"inference_settings": {
|
| 201 |
+
"temperature": 0.6,
|
| 202 |
+
"top_p": 0.95,
|
| 203 |
+
"top_k": 20,
|
| 204 |
+
"repeat_penalty": 1.0,
|
| 205 |
+
},
|
| 206 |
+
},
|
| 207 |
"granite4_tiny_q3": {
|
| 208 |
"name": "Granite 4.0 Tiny 7B (128K Context)",
|
| 209 |
"repo_id": "unsloth/granite-4.0-h-tiny-GGUF",
|
|
|
|
| 421 |
return n_ctx, warning
|
| 422 |
|
| 423 |
|
| 424 |
+
def calculate_effective_max_tokens(model_key: str, max_tokens: int, enable_reasoning: bool) -> int:
    """Return max_tokens, extended with headroom for thinking models.

    Reasoning-capable models emit a thinking trace before the final answer,
    so their output budget is grown by 50% while reasoning is enabled; in
    every other case the user's value passes through untouched.

    Args:
        model_key: Model identifier from AVAILABLE_MODELS.
        max_tokens: User-specified maximum tokens.
        enable_reasoning: Whether reasoning mode is enabled.

    Returns:
        1.5x max_tokens for reasoning-enabled thinking models, unchanged otherwise.
    """
    # Guard: nothing to do unless reasoning is actually turned on.
    if not enable_reasoning:
        return max_tokens

    config = AVAILABLE_MODELS.get(model_key)
    if not config:
        # Unknown (or empty) model entry — leave the budget alone.
        return max_tokens

    # A model counts as "thinking" if it exposes the reasoning toggle or
    # is a thinking-only variant (identified by its key).
    supports_thinking = config.get("supports_toggle", False) or "thinking" in model_key.lower()
    if not supports_thinking:
        return max_tokens

    # Grant 50% extra budget so the thinking trace and the answer both fit.
    effective_max = max_tokens + int(max_tokens * 0.5)
    logger.info(f"Reasoning enabled for {model_key}: extending max_tokens from {max_tokens} to {effective_max}")
    return effective_max
|
| 460 |
+
|
| 461 |
+
|
| 462 |
def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
|
| 463 |
"""Get model information and inference settings for UI display.
|
| 464 |
|
|
|
|
| 572 |
model = AVAILABLE_MODELS[model_key]
|
| 573 |
usable_max = min(model["max_context"], MAX_USABLE_CTX)
|
| 574 |
|
| 575 |
+
# Adjust max_tokens for thinking models when reasoning is enabled
|
| 576 |
+
original_max_tokens = max_tokens
|
| 577 |
+
max_tokens = calculate_effective_max_tokens(model_key, max_tokens, enable_reasoning)
|
| 578 |
+
if max_tokens != original_max_tokens:
|
| 579 |
+
logger.info(f"Adjusted max_tokens from {original_max_tokens} to {max_tokens} for reasoning mode")
|
| 580 |
+
|
| 581 |
# Validate max_tokens fits in context
|
| 582 |
if max_tokens > usable_max - 512:
|
| 583 |
max_tokens = usable_max - 512
|
|
|
|
| 632 |
|
| 633 |
# Build info text
|
| 634 |
input_tokens = estimate_tokens(transcript)
|
| 635 |
+
max_output_text = f"{max_tokens:,} tokens"
|
| 636 |
+
if max_tokens != original_max_tokens:
|
| 637 |
+
max_output_text += f" (adjusted from {original_max_tokens:,} for thinking mode)"
|
| 638 |
info = (
|
| 639 |
f"**Model:** {model['name']}\n\n"
|
| 640 |
f"**Context:** {n_ctx:,} tokens | "
|
| 641 |
f"**Input:** ~{input_tokens:,} tokens | "
|
| 642 |
+
f"**Max output:** {max_output_text}"
|
| 643 |
)
|
| 644 |
if warning:
|
| 645 |
info += f"\n\n{warning}"
|