Spaces:
Running
Add model-specific inference settings and dynamic UI controls
Browse files
- Add inference_settings dict to all 10 models (official parameters where published, estimates otherwise):
* Falcon-H1 (100M/1.5B): temp=0.1, top_p=0.9, top_k=40, repeat=1.05 (TII official)
* Gemma-3 270M: temp=1.0, top_p=0.95, top_k=64, repeat=1.0 (Gemma official)
* Granite-4.0 350M: temp=0.0, top_p=1.0, top_k=0 (IBM official); repeat=1.05
* LFM2 350M: temp=0.1, top_p=0.1, top_k=50, repeat=1.05 (LiquidAI official)
* Qwen3 (0.6B/1.7B): temp=0.6, top_p=0.95, top_k=20, repeat=1.05 (Qwen official)
* ERNIE/BitCPM4/Hunyuan: estimated conservative settings (temp=0.3, top_p=0.95, top_k=30, repeat=1.05)
- Replace temperature slider with locked display showing model's value
- Add top_p slider (0.0-1.0, step 0.05) with model defaults
- Add top_k slider (0-100, step 5) with model defaults
- Update summarize_streaming() to use model-specific settings
- Dynamic UI updates when model selection changes
- Temperature is locked to each model's configured value (official where available, estimated otherwise); top_p/top_k remain user-adjustable
|
@@ -35,42 +35,84 @@ AVAILABLE_MODELS = {
|
|
| 35 |
"repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
|
| 36 |
"filename": "*Q8_0.gguf",
|
| 37 |
"max_context": 32768,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
},
|
| 39 |
"gemma3_270m": {
|
| 40 |
"name": "Gemma-3 270M",
|
| 41 |
"repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
|
| 42 |
"filename": "*Q8_0.gguf",
|
| 43 |
"max_context": 32768,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
},
|
| 45 |
"ernie_300m": {
|
| 46 |
"name": "ERNIE-4.5 0.3B (131K Context)",
|
| 47 |
"repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
|
| 48 |
"filename": "*Q8_0.gguf",
|
| 49 |
"max_context": 131072,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
},
|
| 51 |
"granite_350m": {
|
| 52 |
"name": "Granite-4.0 350M",
|
| 53 |
"repo_id": "unsloth/granite-4.0-h-350m-GGUF",
|
| 54 |
"filename": "*Q8_0.gguf",
|
| 55 |
"max_context": 32768,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
},
|
| 57 |
"lfm2_350m": {
|
| 58 |
"name": "LFM2 350M",
|
| 59 |
"repo_id": "LiquidAI/LFM2-350M-GGUF",
|
| 60 |
"filename": "*Q8_0.gguf",
|
| 61 |
"max_context": 32768,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
},
|
| 63 |
"bitcpm4_500m": {
|
| 64 |
"name": "BitCPM4 0.5B (128K Context)",
|
| 65 |
"repo_id": "openbmb/BitCPM4-0.5B-GGUF",
|
| 66 |
"filename": "*q4_0.gguf",
|
| 67 |
"max_context": 131072,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
},
|
| 69 |
"hunyuan_500m": {
|
| 70 |
"name": "Hunyuan 0.5B (256K Context)",
|
| 71 |
"repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
|
| 72 |
"filename": "*Q8_0.gguf",
|
| 73 |
"max_context": 262144,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
},
|
| 75 |
"qwen3_600m_q4": {
|
| 76 |
"name": "Qwen3 0.6B Q4 (Default)",
|
|
@@ -78,12 +120,24 @@ AVAILABLE_MODELS = {
|
|
| 78 |
"filename": "*Q4_K_M.gguf",
|
| 79 |
"max_context": 32768,
|
| 80 |
"supports_toggle": True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
},
|
| 82 |
"falcon_h1_1.5b_q4": {
|
| 83 |
"name": "Falcon-H1 1.5B Q4",
|
| 84 |
"repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
|
| 85 |
"filename": "*Q4_K_M.gguf",
|
| 86 |
"max_context": 32768,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
},
|
| 88 |
"qwen3_1.7b_q4": {
|
| 89 |
"name": "Qwen3 1.7B Q4",
|
|
@@ -91,6 +145,12 @@ AVAILABLE_MODELS = {
|
|
| 91 |
"filename": "*Q4_K_M.gguf",
|
| 92 |
"max_context": 32768,
|
| 93 |
"supports_toggle": True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
},
|
| 95 |
}
|
| 96 |
|
|
@@ -197,17 +257,27 @@ def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int) -> Tuple[i
|
|
| 197 |
return n_ctx, warning
|
| 198 |
|
| 199 |
|
| 200 |
-
def
|
| 201 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
m = AVAILABLE_MODELS[model_key]
|
| 203 |
usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
|
| 204 |
-
|
|
|
|
|
|
|
| 205 |
f"**{m['name']}**\n\n"
|
| 206 |
f"- Max context: {m['max_context']:,} tokens "
|
| 207 |
f"(capped at {usable_ctx:,} for performance)\n"
|
| 208 |
f"- Repo: `{m['repo_id']}`\n"
|
| 209 |
-
f"- Quant: `{m['filename']}`"
|
|
|
|
|
|
|
| 210 |
)
|
|
|
|
|
|
|
| 211 |
|
| 212 |
|
| 213 |
def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
|
|
@@ -256,7 +326,8 @@ def summarize_streaming(
|
|
| 256 |
model_key: str,
|
| 257 |
enable_reasoning: bool = True,
|
| 258 |
max_tokens: int = 2048,
|
| 259 |
-
|
|
|
|
| 260 |
) -> Generator[Tuple[str, str, str], None, None]:
|
| 261 |
"""
|
| 262 |
Stream summary generation from uploaded file.
|
|
@@ -266,7 +337,8 @@ def summarize_streaming(
|
|
| 266 |
model_key: Model identifier from AVAILABLE_MODELS
|
| 267 |
enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
|
| 268 |
max_tokens: Maximum tokens to generate
|
| 269 |
-
|
|
|
|
| 270 |
|
| 271 |
Yields:
|
| 272 |
Tuple of (thinking_text, summary_text, info_text)
|
|
@@ -336,22 +408,28 @@ def summarize_streaming(
|
|
| 336 |
{"role": "user", "content": f"請總結以下內容:\n\n{transcript}"},
|
| 337 |
]
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
# Stream - NO stop= parameter, let GGUF metadata handle it
|
| 340 |
full_response = ""
|
| 341 |
current_thinking = ""
|
| 342 |
current_summary = ""
|
| 343 |
|
| 344 |
try:
|
| 345 |
-
# Apply
|
| 346 |
-
# Conservative value (1.05) to avoid hurting coherence
|
| 347 |
stream = llm.create_chat_completion(
|
| 348 |
messages=messages,
|
| 349 |
max_tokens=max_tokens,
|
| 350 |
temperature=temperature,
|
| 351 |
min_p=0.0,
|
| 352 |
-
top_p=
|
| 353 |
-
top_k=
|
| 354 |
-
repeat_penalty=
|
| 355 |
stream=True,
|
| 356 |
)
|
| 357 |
|
|
@@ -608,6 +686,12 @@ def create_interface():
|
|
| 608 |
info="Qwen3 only: uses /think for deeper analysis (slower) or /no_think for direct output (faster). Enabled by default.",
|
| 609 |
interactive=True,
|
| 610 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
max_tokens = gr.Slider(
|
| 612 |
minimum=256,
|
| 613 |
maximum=4096,
|
|
@@ -616,13 +700,21 @@ def create_interface():
|
|
| 616 |
label="Max Output Tokens",
|
| 617 |
info="Higher = more detailed summary"
|
| 618 |
)
|
| 619 |
-
|
| 620 |
-
minimum=0.
|
| 621 |
maximum=1.0,
|
| 622 |
-
value=0.
|
| 623 |
-
step=0.
|
| 624 |
-
label="
|
| 625 |
-
info="Lower = more focused, Higher = more
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
)
|
| 627 |
|
| 628 |
submit_btn = gr.Button(
|
|
@@ -635,7 +727,7 @@ def create_interface():
|
|
| 635 |
with gr.Group():
|
| 636 |
gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
|
| 637 |
info_output = gr.Markdown(
|
| 638 |
-
value=
|
| 639 |
elem_classes=["stats-grid"]
|
| 640 |
)
|
| 641 |
|
|
@@ -664,15 +756,15 @@ def create_interface():
|
|
| 664 |
# Event handlers
|
| 665 |
submit_btn.click(
|
| 666 |
fn=summarize_streaming,
|
| 667 |
-
inputs=[file_input, model_dropdown, enable_reasoning, max_tokens,
|
| 668 |
outputs=[thinking_output, summary_output, info_output],
|
| 669 |
show_progress="full"
|
| 670 |
)
|
| 671 |
|
| 672 |
model_dropdown.change(
|
| 673 |
-
fn=
|
| 674 |
inputs=[model_dropdown],
|
| 675 |
-
outputs=[info_output],
|
| 676 |
)
|
| 677 |
|
| 678 |
# Footer
|
|
|
|
| 35 |
"repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
|
| 36 |
"filename": "*Q8_0.gguf",
|
| 37 |
"max_context": 32768,
|
| 38 |
+
"inference_settings": {
|
| 39 |
+
"temperature": 0.1,
|
| 40 |
+
"top_p": 0.9,
|
| 41 |
+
"top_k": 40,
|
| 42 |
+
"repeat_penalty": 1.05,
|
| 43 |
+
},
|
| 44 |
},
|
| 45 |
"gemma3_270m": {
|
| 46 |
"name": "Gemma-3 270M",
|
| 47 |
"repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
|
| 48 |
"filename": "*Q8_0.gguf",
|
| 49 |
"max_context": 32768,
|
| 50 |
+
"inference_settings": {
|
| 51 |
+
"temperature": 1.0,
|
| 52 |
+
"top_p": 0.95,
|
| 53 |
+
"top_k": 64,
|
| 54 |
+
"repeat_penalty": 1.0,
|
| 55 |
+
},
|
| 56 |
},
|
| 57 |
"ernie_300m": {
|
| 58 |
"name": "ERNIE-4.5 0.3B (131K Context)",
|
| 59 |
"repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
|
| 60 |
"filename": "*Q8_0.gguf",
|
| 61 |
"max_context": 131072,
|
| 62 |
+
"inference_settings": {
|
| 63 |
+
"temperature": 0.3,
|
| 64 |
+
"top_p": 0.95,
|
| 65 |
+
"top_k": 30,
|
| 66 |
+
"repeat_penalty": 1.05,
|
| 67 |
+
},
|
| 68 |
},
|
| 69 |
"granite_350m": {
|
| 70 |
"name": "Granite-4.0 350M",
|
| 71 |
"repo_id": "unsloth/granite-4.0-h-350m-GGUF",
|
| 72 |
"filename": "*Q8_0.gguf",
|
| 73 |
"max_context": 32768,
|
| 74 |
+
"inference_settings": {
|
| 75 |
+
"temperature": 0.0,
|
| 76 |
+
"top_p": 1.0,
|
| 77 |
+
"top_k": 0,
|
| 78 |
+
"repeat_penalty": 1.05,
|
| 79 |
+
},
|
| 80 |
},
|
| 81 |
"lfm2_350m": {
|
| 82 |
"name": "LFM2 350M",
|
| 83 |
"repo_id": "LiquidAI/LFM2-350M-GGUF",
|
| 84 |
"filename": "*Q8_0.gguf",
|
| 85 |
"max_context": 32768,
|
| 86 |
+
"inference_settings": {
|
| 87 |
+
"temperature": 0.1,
|
| 88 |
+
"top_p": 0.1,
|
| 89 |
+
"top_k": 50,
|
| 90 |
+
"repeat_penalty": 1.05,
|
| 91 |
+
},
|
| 92 |
},
|
| 93 |
"bitcpm4_500m": {
|
| 94 |
"name": "BitCPM4 0.5B (128K Context)",
|
| 95 |
"repo_id": "openbmb/BitCPM4-0.5B-GGUF",
|
| 96 |
"filename": "*q4_0.gguf",
|
| 97 |
"max_context": 131072,
|
| 98 |
+
"inference_settings": {
|
| 99 |
+
"temperature": 0.3,
|
| 100 |
+
"top_p": 0.95,
|
| 101 |
+
"top_k": 30,
|
| 102 |
+
"repeat_penalty": 1.05,
|
| 103 |
+
},
|
| 104 |
},
|
| 105 |
"hunyuan_500m": {
|
| 106 |
"name": "Hunyuan 0.5B (256K Context)",
|
| 107 |
"repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
|
| 108 |
"filename": "*Q8_0.gguf",
|
| 109 |
"max_context": 262144,
|
| 110 |
+
"inference_settings": {
|
| 111 |
+
"temperature": 0.3,
|
| 112 |
+
"top_p": 0.95,
|
| 113 |
+
"top_k": 30,
|
| 114 |
+
"repeat_penalty": 1.05,
|
| 115 |
+
},
|
| 116 |
},
|
| 117 |
"qwen3_600m_q4": {
|
| 118 |
"name": "Qwen3 0.6B Q4 (Default)",
|
|
|
|
| 120 |
"filename": "*Q4_K_M.gguf",
|
| 121 |
"max_context": 32768,
|
| 122 |
"supports_toggle": True,
|
| 123 |
+
"inference_settings": {
|
| 124 |
+
"temperature": 0.6,
|
| 125 |
+
"top_p": 0.95,
|
| 126 |
+
"top_k": 20,
|
| 127 |
+
"repeat_penalty": 1.05,
|
| 128 |
+
},
|
| 129 |
},
|
| 130 |
"falcon_h1_1.5b_q4": {
|
| 131 |
"name": "Falcon-H1 1.5B Q4",
|
| 132 |
"repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
|
| 133 |
"filename": "*Q4_K_M.gguf",
|
| 134 |
"max_context": 32768,
|
| 135 |
+
"inference_settings": {
|
| 136 |
+
"temperature": 0.1,
|
| 137 |
+
"top_p": 0.9,
|
| 138 |
+
"top_k": 40,
|
| 139 |
+
"repeat_penalty": 1.05,
|
| 140 |
+
},
|
| 141 |
},
|
| 142 |
"qwen3_1.7b_q4": {
|
| 143 |
"name": "Qwen3 1.7B Q4",
|
|
|
|
| 145 |
"filename": "*Q4_K_M.gguf",
|
| 146 |
"max_context": 32768,
|
| 147 |
"supports_toggle": True,
|
| 148 |
+
"inference_settings": {
|
| 149 |
+
"temperature": 0.6,
|
| 150 |
+
"top_p": 0.95,
|
| 151 |
+
"top_k": 20,
|
| 152 |
+
"repeat_penalty": 1.05,
|
| 153 |
+
},
|
| 154 |
},
|
| 155 |
}
|
| 156 |
|
|
|
|
| 257 |
return n_ctx, warning
|
| 258 |
|
| 259 |
|
| 260 |
+
def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
    """Assemble the model description and sampling defaults shown in the UI.

    Args:
        model_key: Key of the model entry to look up in AVAILABLE_MODELS.

    Returns:
        Tuple of (info_text, temperature, top_p, top_k). ``info_text`` is a
        Markdown summary of the model; ``temperature`` is the stringified
        value from the model's "inference_settings"; ``top_p`` and ``top_k``
        are returned unchanged from the same settings dict.

    Raises:
        KeyError: If ``model_key`` is not in AVAILABLE_MODELS or the entry
            lacks one of the fields read below.
    """
    spec = AVAILABLE_MODELS[model_key]
    # The context size reported to the user is capped by MAX_USABLE_CTX.
    capped_ctx = min(spec["max_context"], MAX_USABLE_CTX)
    sampling = spec["inference_settings"]

    summary = "".join(
        (
            f"**{spec['name']}**\n\n",
            f"- Max context: {spec['max_context']:,} tokens ",
            f"(capped at {capped_ctx:,} for performance)\n",
            f"- Repo: `{spec['repo_id']}`\n",
            f"- Quant: `{spec['filename']}`\n",
            f"- Temperature: {sampling['temperature']} (locked)\n",
            f"- Top P: {sampling['top_p']}, Top K: {sampling['top_k']}",
        )
    )

    # Temperature is returned as a string (the return annotation's second
    # element is str); top_p and top_k keep their stored numeric types.
    locked_temperature = str(sampling["temperature"])
    return summary, locked_temperature, sampling["top_p"], sampling["top_k"]
|
| 281 |
|
| 282 |
|
| 283 |
def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
|
|
|
|
| 326 |
model_key: str,
|
| 327 |
enable_reasoning: bool = True,
|
| 328 |
max_tokens: int = 2048,
|
| 329 |
+
top_p: float = None,
|
| 330 |
+
top_k: int = None,
|
| 331 |
) -> Generator[Tuple[str, str, str], None, None]:
|
| 332 |
"""
|
| 333 |
Stream summary generation from uploaded file.
|
|
|
|
| 337 |
model_key: Model identifier from AVAILABLE_MODELS
|
| 338 |
enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
|
| 339 |
max_tokens: Maximum tokens to generate
|
| 340 |
+
top_p: Nucleus sampling parameter (uses model default if None)
|
| 341 |
+
top_k: Top-k sampling parameter (uses model default if None)
|
| 342 |
|
| 343 |
Yields:
|
| 344 |
Tuple of (thinking_text, summary_text, info_text)
|
|
|
|
| 408 |
{"role": "user", "content": f"請總結以下內容:\n\n{transcript}"},
|
| 409 |
]
|
| 410 |
|
| 411 |
+
# Get model-specific inference settings
|
| 412 |
+
inference_settings = model["inference_settings"]
|
| 413 |
+
temperature = inference_settings["temperature"]
|
| 414 |
+
final_top_p = top_p if top_p is not None else inference_settings["top_p"]
|
| 415 |
+
final_top_k = top_k if top_k is not None else inference_settings["top_k"]
|
| 416 |
+
repeat_penalty = inference_settings["repeat_penalty"]
|
| 417 |
+
|
| 418 |
# Stream - NO stop= parameter, let GGUF metadata handle it
|
| 419 |
full_response = ""
|
| 420 |
current_thinking = ""
|
| 421 |
current_summary = ""
|
| 422 |
|
| 423 |
try:
|
| 424 |
+
# Apply model-specific inference settings
|
|
|
|
| 425 |
stream = llm.create_chat_completion(
|
| 426 |
messages=messages,
|
| 427 |
max_tokens=max_tokens,
|
| 428 |
temperature=temperature,
|
| 429 |
min_p=0.0,
|
| 430 |
+
top_p=final_top_p,
|
| 431 |
+
top_k=final_top_k,
|
| 432 |
+
repeat_penalty=repeat_penalty,
|
| 433 |
stream=True,
|
| 434 |
)
|
| 435 |
|
|
|
|
| 686 |
info="Qwen3 only: uses /think for deeper analysis (slower) or /no_think for direct output (faster). Enabled by default.",
|
| 687 |
interactive=True,
|
| 688 |
)
|
| 689 |
+
temperature_display = gr.Textbox(
|
| 690 |
+
label="Temperature (Locked)",
|
| 691 |
+
value="0.6",
|
| 692 |
+
interactive=False,
|
| 693 |
+
info="Set by model's recommended settings. Cannot be changed."
|
| 694 |
+
)
|
| 695 |
max_tokens = gr.Slider(
|
| 696 |
minimum=256,
|
| 697 |
maximum=4096,
|
|
|
|
| 700 |
label="Max Output Tokens",
|
| 701 |
info="Higher = more detailed summary"
|
| 702 |
)
|
| 703 |
+
top_p = gr.Slider(
|
| 704 |
+
minimum=0.0,
|
| 705 |
maximum=1.0,
|
| 706 |
+
value=0.95,
|
| 707 |
+
step=0.05,
|
| 708 |
+
label="Top P (Nucleus Sampling)",
|
| 709 |
+
info="Lower = more focused, Higher = more diverse"
|
| 710 |
+
)
|
| 711 |
+
top_k = gr.Slider(
|
| 712 |
+
minimum=0,
|
| 713 |
+
maximum=100,
|
| 714 |
+
value=20,
|
| 715 |
+
step=5,
|
| 716 |
+
label="Top K",
|
| 717 |
+
info="Limits token selection to top K tokens (0 = disabled)"
|
| 718 |
)
|
| 719 |
|
| 720 |
submit_btn = gr.Button(
|
|
|
|
| 727 |
with gr.Group():
|
| 728 |
gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
|
| 729 |
info_output = gr.Markdown(
|
| 730 |
+
value=get_model_info(DEFAULT_MODEL_KEY)[0],
|
| 731 |
elem_classes=["stats-grid"]
|
| 732 |
)
|
| 733 |
|
|
|
|
| 756 |
# Event handlers
|
| 757 |
submit_btn.click(
|
| 758 |
fn=summarize_streaming,
|
| 759 |
+
inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, top_p, top_k],
|
| 760 |
outputs=[thinking_output, summary_output, info_output],
|
| 761 |
show_progress="full"
|
| 762 |
)
|
| 763 |
|
| 764 |
model_dropdown.change(
|
| 765 |
+
fn=get_model_info,
|
| 766 |
inputs=[model_dropdown],
|
| 767 |
+
outputs=[info_output, temperature_display, top_p, top_k],
|
| 768 |
)
|
| 769 |
|
| 770 |
# Footer
|