Luigi committed on
Commit
8937994
·
1 Parent(s): 2fa67ca

Add model-specific inference settings and dynamic UI controls

Browse files

- Add inference_settings dict to all 10 models with official parameters:
* Falcon-H1 (100M/1.5B): temp=0.1, top_p=0.9, top_k=40, repeat=1.05 (TII official)
* Gemma-3 270M: temp=1.0, top_p=0.95, top_k=64, repeat=1.0 (Gemma official)
* Granite-4.0 350M: temp=0.0, top_p=1.0, top_k=0 (IBM official)
* LFM2 350M: temp=0.1, top_p=0.1, top_k=50, repeat=1.05 (LiquidAI official)
* Qwen3 (0.6B/1.7B): temp=0.6, top_p=0.95, top_k=20, repeat=1.05 (Qwen official)
* ERNIE/BitCPM4/Hunyuan: estimated conservative settings

- Replace temperature slider with locked display showing model's value
- Add top_p slider (0.0-1.0, step 0.05) with model defaults
- Add top_k slider (0-100, step 5) with model defaults
- Update summarize_streaming() to use model-specific settings
- Dynamic UI updates when model selection changes
- Temperature locked to official values, top_p/top_k user-adjustable

Files changed (1) hide show
  1. app.py +113 -21
app.py CHANGED
@@ -35,42 +35,84 @@ AVAILABLE_MODELS = {
35
  "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
36
  "filename": "*Q8_0.gguf",
37
  "max_context": 32768,
 
 
 
 
 
 
38
  },
39
  "gemma3_270m": {
40
  "name": "Gemma-3 270M",
41
  "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
42
  "filename": "*Q8_0.gguf",
43
  "max_context": 32768,
 
 
 
 
 
 
44
  },
45
  "ernie_300m": {
46
  "name": "ERNIE-4.5 0.3B (131K Context)",
47
  "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
48
  "filename": "*Q8_0.gguf",
49
  "max_context": 131072,
 
 
 
 
 
 
50
  },
51
  "granite_350m": {
52
  "name": "Granite-4.0 350M",
53
  "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
54
  "filename": "*Q8_0.gguf",
55
  "max_context": 32768,
 
 
 
 
 
 
56
  },
57
  "lfm2_350m": {
58
  "name": "LFM2 350M",
59
  "repo_id": "LiquidAI/LFM2-350M-GGUF",
60
  "filename": "*Q8_0.gguf",
61
  "max_context": 32768,
 
 
 
 
 
 
62
  },
63
  "bitcpm4_500m": {
64
  "name": "BitCPM4 0.5B (128K Context)",
65
  "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
66
  "filename": "*q4_0.gguf",
67
  "max_context": 131072,
 
 
 
 
 
 
68
  },
69
  "hunyuan_500m": {
70
  "name": "Hunyuan 0.5B (256K Context)",
71
  "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
72
  "filename": "*Q8_0.gguf",
73
  "max_context": 262144,
 
 
 
 
 
 
74
  },
75
  "qwen3_600m_q4": {
76
  "name": "Qwen3 0.6B Q4 (Default)",
@@ -78,12 +120,24 @@ AVAILABLE_MODELS = {
78
  "filename": "*Q4_K_M.gguf",
79
  "max_context": 32768,
80
  "supports_toggle": True,
 
 
 
 
 
 
81
  },
82
  "falcon_h1_1.5b_q4": {
83
  "name": "Falcon-H1 1.5B Q4",
84
  "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
85
  "filename": "*Q4_K_M.gguf",
86
  "max_context": 32768,
 
 
 
 
 
 
87
  },
88
  "qwen3_1.7b_q4": {
89
  "name": "Qwen3 1.7B Q4",
@@ -91,6 +145,12 @@ AVAILABLE_MODELS = {
91
  "filename": "*Q4_K_M.gguf",
92
  "max_context": 32768,
93
  "supports_toggle": True,
 
 
 
 
 
 
94
  },
95
  }
96
 
@@ -197,17 +257,27 @@ def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int) -> Tuple[i
197
  return n_ctx, warning
198
 
199
 
200
- def get_model_info_text(model_key: str) -> str:
201
- """Format model information for UI display."""
 
 
 
 
202
  m = AVAILABLE_MODELS[model_key]
203
  usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
204
- return (
 
 
205
  f"**{m['name']}**\n\n"
206
  f"- Max context: {m['max_context']:,} tokens "
207
  f"(capped at {usable_ctx:,} for performance)\n"
208
  f"- Repo: `{m['repo_id']}`\n"
209
- f"- Quant: `{m['filename']}`"
 
 
210
  )
 
 
211
 
212
 
213
  def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
@@ -256,7 +326,8 @@ def summarize_streaming(
256
  model_key: str,
257
  enable_reasoning: bool = True,
258
  max_tokens: int = 2048,
259
- temperature: float = 0.6,
 
260
  ) -> Generator[Tuple[str, str, str], None, None]:
261
  """
262
  Stream summary generation from uploaded file.
@@ -266,7 +337,8 @@ def summarize_streaming(
266
  model_key: Model identifier from AVAILABLE_MODELS
267
  enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
268
  max_tokens: Maximum tokens to generate
269
- temperature: Sampling temperature
 
270
 
271
  Yields:
272
  Tuple of (thinking_text, summary_text, info_text)
@@ -336,22 +408,28 @@ def summarize_streaming(
336
  {"role": "user", "content": f"請總結以下內容:\n\n{transcript}"},
337
  ]
338
 
 
 
 
 
 
 
 
339
  # Stream - NO stop= parameter, let GGUF metadata handle it
340
  full_response = ""
341
  current_thinking = ""
342
  current_summary = ""
343
 
344
  try:
345
- # Apply repeat penalty for all models to prevent repetitive loops
346
- # Conservative value (1.05) to avoid hurting coherence
347
  stream = llm.create_chat_completion(
348
  messages=messages,
349
  max_tokens=max_tokens,
350
  temperature=temperature,
351
  min_p=0.0,
352
- top_p=0.95,
353
- top_k=20,
354
- repeat_penalty=1.05,
355
  stream=True,
356
  )
357
 
@@ -608,6 +686,12 @@ def create_interface():
608
  info="Qwen3 only: uses /think for deeper analysis (slower) or /no_think for direct output (faster). Enabled by default.",
609
  interactive=True,
610
  )
 
 
 
 
 
 
611
  max_tokens = gr.Slider(
612
  minimum=256,
613
  maximum=4096,
@@ -616,13 +700,21 @@ def create_interface():
616
  label="Max Output Tokens",
617
  info="Higher = more detailed summary"
618
  )
619
- temperature = gr.Slider(
620
- minimum=0.1,
621
  maximum=1.0,
622
- value=0.6,
623
- step=0.1,
624
- label="Temperature",
625
- info="Lower = more focused, Higher = more creative"
 
 
 
 
 
 
 
 
626
  )
627
 
628
  submit_btn = gr.Button(
@@ -635,7 +727,7 @@ def create_interface():
635
  with gr.Group():
636
  gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
637
  info_output = gr.Markdown(
638
- value=get_model_info_text(DEFAULT_MODEL_KEY),
639
  elem_classes=["stats-grid"]
640
  )
641
 
@@ -664,15 +756,15 @@ def create_interface():
664
  # Event handlers
665
  submit_btn.click(
666
  fn=summarize_streaming,
667
- inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature],
668
  outputs=[thinking_output, summary_output, info_output],
669
  show_progress="full"
670
  )
671
 
672
  model_dropdown.change(
673
- fn=get_model_info_text,
674
  inputs=[model_dropdown],
675
- outputs=[info_output],
676
  )
677
 
678
  # Footer
 
35
  "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
36
  "filename": "*Q8_0.gguf",
37
  "max_context": 32768,
38
+ "inference_settings": {
39
+ "temperature": 0.1,
40
+ "top_p": 0.9,
41
+ "top_k": 40,
42
+ "repeat_penalty": 1.05,
43
+ },
44
  },
45
  "gemma3_270m": {
46
  "name": "Gemma-3 270M",
47
  "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
48
  "filename": "*Q8_0.gguf",
49
  "max_context": 32768,
50
+ "inference_settings": {
51
+ "temperature": 1.0,
52
+ "top_p": 0.95,
53
+ "top_k": 64,
54
+ "repeat_penalty": 1.0,
55
+ },
56
  },
57
  "ernie_300m": {
58
  "name": "ERNIE-4.5 0.3B (131K Context)",
59
  "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
60
  "filename": "*Q8_0.gguf",
61
  "max_context": 131072,
62
+ "inference_settings": {
63
+ "temperature": 0.3,
64
+ "top_p": 0.95,
65
+ "top_k": 30,
66
+ "repeat_penalty": 1.05,
67
+ },
68
  },
69
  "granite_350m": {
70
  "name": "Granite-4.0 350M",
71
  "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
72
  "filename": "*Q8_0.gguf",
73
  "max_context": 32768,
74
+ "inference_settings": {
75
+ "temperature": 0.0,
76
+ "top_p": 1.0,
77
+ "top_k": 0,
78
+ "repeat_penalty": 1.05,
79
+ },
80
  },
81
  "lfm2_350m": {
82
  "name": "LFM2 350M",
83
  "repo_id": "LiquidAI/LFM2-350M-GGUF",
84
  "filename": "*Q8_0.gguf",
85
  "max_context": 32768,
86
+ "inference_settings": {
87
+ "temperature": 0.1,
88
+ "top_p": 0.1,
89
+ "top_k": 50,
90
+ "repeat_penalty": 1.05,
91
+ },
92
  },
93
  "bitcpm4_500m": {
94
  "name": "BitCPM4 0.5B (128K Context)",
95
  "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
96
  "filename": "*q4_0.gguf",
97
  "max_context": 131072,
98
+ "inference_settings": {
99
+ "temperature": 0.3,
100
+ "top_p": 0.95,
101
+ "top_k": 30,
102
+ "repeat_penalty": 1.05,
103
+ },
104
  },
105
  "hunyuan_500m": {
106
  "name": "Hunyuan 0.5B (256K Context)",
107
  "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
108
  "filename": "*Q8_0.gguf",
109
  "max_context": 262144,
110
+ "inference_settings": {
111
+ "temperature": 0.3,
112
+ "top_p": 0.95,
113
+ "top_k": 30,
114
+ "repeat_penalty": 1.05,
115
+ },
116
  },
117
  "qwen3_600m_q4": {
118
  "name": "Qwen3 0.6B Q4 (Default)",
 
120
  "filename": "*Q4_K_M.gguf",
121
  "max_context": 32768,
122
  "supports_toggle": True,
123
+ "inference_settings": {
124
+ "temperature": 0.6,
125
+ "top_p": 0.95,
126
+ "top_k": 20,
127
+ "repeat_penalty": 1.05,
128
+ },
129
  },
130
  "falcon_h1_1.5b_q4": {
131
  "name": "Falcon-H1 1.5B Q4",
132
  "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
133
  "filename": "*Q4_K_M.gguf",
134
  "max_context": 32768,
135
+ "inference_settings": {
136
+ "temperature": 0.1,
137
+ "top_p": 0.9,
138
+ "top_k": 40,
139
+ "repeat_penalty": 1.05,
140
+ },
141
  },
142
  "qwen3_1.7b_q4": {
143
  "name": "Qwen3 1.7B Q4",
 
145
  "filename": "*Q4_K_M.gguf",
146
  "max_context": 32768,
147
  "supports_toggle": True,
148
+ "inference_settings": {
149
+ "temperature": 0.6,
150
+ "top_p": 0.95,
151
+ "top_k": 20,
152
+ "repeat_penalty": 1.05,
153
+ },
154
  },
155
  }
156
 
 
257
  return n_ctx, warning
258
 
259
 
260
+ def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
261
+ """Get model information and inference settings for UI display.
262
+
263
+ Returns:
264
+ Tuple of (info_text, temperature, top_p, top_k)
265
+ """
266
  m = AVAILABLE_MODELS[model_key]
267
  usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
268
+ settings = m["inference_settings"]
269
+
270
+ info_text = (
271
  f"**{m['name']}**\n\n"
272
  f"- Max context: {m['max_context']:,} tokens "
273
  f"(capped at {usable_ctx:,} for performance)\n"
274
  f"- Repo: `{m['repo_id']}`\n"
275
+ f"- Quant: `{m['filename']}`\n"
276
+ f"- Temperature: {settings['temperature']} (locked)\n"
277
+ f"- Top P: {settings['top_p']}, Top K: {settings['top_k']}"
278
  )
279
+
280
+ return info_text, str(settings["temperature"]), settings["top_p"], settings["top_k"]
281
 
282
 
283
  def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
 
326
  model_key: str,
327
  enable_reasoning: bool = True,
328
  max_tokens: int = 2048,
329
+ top_p: float = None,
330
+ top_k: int = None,
331
  ) -> Generator[Tuple[str, str, str], None, None]:
332
  """
333
  Stream summary generation from uploaded file.
 
337
  model_key: Model identifier from AVAILABLE_MODELS
338
  enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
339
  max_tokens: Maximum tokens to generate
340
+ top_p: Nucleus sampling parameter (uses model default if None)
341
+ top_k: Top-k sampling parameter (uses model default if None)
342
 
343
  Yields:
344
  Tuple of (thinking_text, summary_text, info_text)
 
408
  {"role": "user", "content": f"請總結以下內容:\n\n{transcript}"},
409
  ]
410
 
411
+ # Get model-specific inference settings
412
+ inference_settings = model["inference_settings"]
413
+ temperature = inference_settings["temperature"]
414
+ final_top_p = top_p if top_p is not None else inference_settings["top_p"]
415
+ final_top_k = top_k if top_k is not None else inference_settings["top_k"]
416
+ repeat_penalty = inference_settings["repeat_penalty"]
417
+
418
  # Stream - NO stop= parameter, let GGUF metadata handle it
419
  full_response = ""
420
  current_thinking = ""
421
  current_summary = ""
422
 
423
  try:
424
+ # Apply model-specific inference settings
 
425
  stream = llm.create_chat_completion(
426
  messages=messages,
427
  max_tokens=max_tokens,
428
  temperature=temperature,
429
  min_p=0.0,
430
+ top_p=final_top_p,
431
+ top_k=final_top_k,
432
+ repeat_penalty=repeat_penalty,
433
  stream=True,
434
  )
435
 
 
686
  info="Qwen3 only: uses /think for deeper analysis (slower) or /no_think for direct output (faster). Enabled by default.",
687
  interactive=True,
688
  )
689
+ temperature_display = gr.Textbox(
690
+ label="Temperature (Locked)",
691
+ value="0.6",
692
+ interactive=False,
693
+ info="Set by model's recommended settings. Cannot be changed."
694
+ )
695
  max_tokens = gr.Slider(
696
  minimum=256,
697
  maximum=4096,
 
700
  label="Max Output Tokens",
701
  info="Higher = more detailed summary"
702
  )
703
+ top_p = gr.Slider(
704
+ minimum=0.0,
705
  maximum=1.0,
706
+ value=0.95,
707
+ step=0.05,
708
+ label="Top P (Nucleus Sampling)",
709
+ info="Lower = more focused, Higher = more diverse"
710
+ )
711
+ top_k = gr.Slider(
712
+ minimum=0,
713
+ maximum=100,
714
+ value=20,
715
+ step=5,
716
+ label="Top K",
717
+ info="Limits token selection to top K tokens (0 = disabled)"
718
  )
719
 
720
  submit_btn = gr.Button(
 
727
  with gr.Group():
728
  gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
729
  info_output = gr.Markdown(
730
+ value=get_model_info(DEFAULT_MODEL_KEY)[0],
731
  elem_classes=["stats-grid"]
732
  )
733
 
 
756
  # Event handlers
757
  submit_btn.click(
758
  fn=summarize_streaming,
759
+ inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, top_p, top_k],
760
  outputs=[thinking_output, summary_output, info_output],
761
  show_progress="full"
762
  )
763
 
764
  model_dropdown.change(
765
+ fn=get_model_info,
766
  inputs=[model_dropdown],
767
+ outputs=[info_output, temperature_display, top_p, top_k],
768
  )
769
 
770
  # Footer