Luigi committed on
Commit
8937994
·
1 Parent(s): 2fa67ca

Add model-specific inference settings and dynamic UI controls

Browse files

- Add inference_settings dict to all 10 models with official parameters:
* Falcon-H1 (100M/1.5B): temp=0.1, top_p=0.9, top_k=40, repeat=1.05 (TII official)
* Gemma-3 270M: temp=1.0, top_p=0.95, top_k=64, repeat=1.0 (Gemma official)
* Granite-4.0 350M: temp=0.0, top_p=1.0, top_k=0 (IBM official)
* LFM2 350M: temp=0.1, top_p=0.1, top_k=50, repeat=1.05 (LiquidAI official)
* Qwen3 (0.6B/1.7B): temp=0.6, top_p=0.95, top_k=20, repeat=1.05 (Qwen official)
* ERNIE/BitCPM4/Hunyuan: estimated conservative settings

- Replace temperature slider with locked display showing model's value
- Add top_p slider (0.0-1.0, step 0.05) with model defaults
- Add top_k slider (0-100, step 5) with model defaults
- Update summarize_streaming() to use model-specific settings
- Dynamic UI updates when model selection changes
- Temperature locked to official values, top_p/top_k user-adjustable

Files changed (1) hide show
  1. app.py +113 -21
app.py CHANGED
@@ -35,42 +35,84 @@ AVAILABLE_MODELS = {
35
  "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
36
  "filename": "*Q8_0.gguf",
37
  "max_context": 32768,
 
 
 
 
 
 
38
  },
39
  "gemma3_270m": {
40
  "name": "Gemma-3 270M",
41
  "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
42
  "filename": "*Q8_0.gguf",
43
  "max_context": 32768,
 
 
 
 
 
 
44
  },
45
  "ernie_300m": {
46
  "name": "ERNIE-4.5 0.3B (131K Context)",
47
  "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
48
  "filename": "*Q8_0.gguf",
49
  "max_context": 131072,
 
 
 
 
 
 
50
  },
51
  "granite_350m": {
52
  "name": "Granite-4.0 350M",
53
  "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
54
  "filename": "*Q8_0.gguf",
55
  "max_context": 32768,
 
 
 
 
 
 
56
  },
57
  "lfm2_350m": {
58
  "name": "LFM2 350M",
59
  "repo_id": "LiquidAI/LFM2-350M-GGUF",
60
  "filename": "*Q8_0.gguf",
61
  "max_context": 32768,
 
 
 
 
 
 
62
  },
63
  "bitcpm4_500m": {
64
  "name": "BitCPM4 0.5B (128K Context)",
65
  "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
66
  "filename": "*q4_0.gguf",
67
  "max_context": 131072,
 
 
 
 
 
 
68
  },
69
  "hunyuan_500m": {
70
  "name": "Hunyuan 0.5B (256K Context)",
71
  "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
72
  "filename": "*Q8_0.gguf",
73
  "max_context": 262144,
 
 
 
 
 
 
74
  },
75
  "qwen3_600m_q4": {
76
  "name": "Qwen3 0.6B Q4 (Default)",
@@ -78,12 +120,24 @@ AVAILABLE_MODELS = {
78
  "filename": "*Q4_K_M.gguf",
79
  "max_context": 32768,
80
  "supports_toggle": True,
 
 
 
 
 
 
81
  },
82
  "falcon_h1_1.5b_q4": {
83
  "name": "Falcon-H1 1.5B Q4",
84
  "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
85
  "filename": "*Q4_K_M.gguf",
86
  "max_context": 32768,
 
 
 
 
 
 
87
  },
88
  "qwen3_1.7b_q4": {
89
  "name": "Qwen3 1.7B Q4",
@@ -91,6 +145,12 @@ AVAILABLE_MODELS = {
91
  "filename": "*Q4_K_M.gguf",
92
  "max_context": 32768,
93
  "supports_toggle": True,
 
 
 
 
 
 
94
  },
95
  }
96
 
@@ -197,17 +257,27 @@ def calculate_n_ctx(model_key: str, transcript: str, max_tokens: int) -> Tuple[i
197
  return n_ctx, warning
198
 
199
 
200
- def get_model_info_text(model_key: str) -> str:
201
- """Format model information for UI display."""
 
 
 
 
202
  m = AVAILABLE_MODELS[model_key]
203
  usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
204
- return (
 
 
205
  f"**{m['name']}**\n\n"
206
  f"- Max context: {m['max_context']:,} tokens "
207
  f"(capped at {usable_ctx:,} for performance)\n"
208
  f"- Repo: `{m['repo_id']}`\n"
209
- f"- Quant: `{m['filename']}`"
 
 
210
  )
 
 
211
 
212
 
213
  def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
@@ -256,7 +326,8 @@ def summarize_streaming(
256
  model_key: str,
257
  enable_reasoning: bool = True,
258
  max_tokens: int = 2048,
259
- temperature: float = 0.6,
 
260
  ) -> Generator[Tuple[str, str, str], None, None]:
261
  """
262
  Stream summary generation from uploaded file.
@@ -266,7 +337,8 @@ def summarize_streaming(
266
  model_key: Model identifier from AVAILABLE_MODELS
267
  enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
268
  max_tokens: Maximum tokens to generate
269
- temperature: Sampling temperature
 
270
 
271
  Yields:
272
  Tuple of (thinking_text, summary_text, info_text)
@@ -336,22 +408,28 @@ def summarize_streaming(
336
  {"role": "user", "content": f"請總結以下內容:\n\n{transcript}"},
337
  ]
338
 
 
 
 
 
 
 
 
339
  # Stream - NO stop= parameter, let GGUF metadata handle it
340
  full_response = ""
341
  current_thinking = ""
342
  current_summary = ""
343
 
344
  try:
345
- # Apply repeat penalty for all models to prevent repetitive loops
346
- # Conservative value (1.05) to avoid hurting coherence
347
  stream = llm.create_chat_completion(
348
  messages=messages,
349
  max_tokens=max_tokens,
350
  temperature=temperature,
351
  min_p=0.0,
352
- top_p=0.95,
353
- top_k=20,
354
- repeat_penalty=1.05,
355
  stream=True,
356
  )
357
 
@@ -608,6 +686,12 @@ def create_interface():
608
  info="Qwen3 only: uses /think for deeper analysis (slower) or /no_think for direct output (faster). Enabled by default.",
609
  interactive=True,
610
  )
 
 
 
 
 
 
611
  max_tokens = gr.Slider(
612
  minimum=256,
613
  maximum=4096,
@@ -616,13 +700,21 @@ def create_interface():
616
  label="Max Output Tokens",
617
  info="Higher = more detailed summary"
618
  )
619
- temperature = gr.Slider(
620
- minimum=0.1,
621
  maximum=1.0,
622
- value=0.6,
623
- step=0.1,
624
- label="Temperature",
625
- info="Lower = more focused, Higher = more creative"
 
 
 
 
 
 
 
 
626
  )
627
 
628
  submit_btn = gr.Button(
@@ -635,7 +727,7 @@ def create_interface():
635
  with gr.Group():
636
  gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
637
  info_output = gr.Markdown(
638
- value=get_model_info_text(DEFAULT_MODEL_KEY),
639
  elem_classes=["stats-grid"]
640
  )
641
 
@@ -664,15 +756,15 @@ def create_interface():
664
  # Event handlers
665
  submit_btn.click(
666
  fn=summarize_streaming,
667
- inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature],
668
  outputs=[thinking_output, summary_output, info_output],
669
  show_progress="full"
670
  )
671
 
672
  model_dropdown.change(
673
- fn=get_model_info_text,
674
  inputs=[model_dropdown],
675
- outputs=[info_output],
676
  )
677
 
678
  # Footer
 
35
  "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
36
  "filename": "*Q8_0.gguf",
37
  "max_context": 32768,
38
+ "inference_settings": {
39
+ "temperature": 0.1,
40
+ "top_p": 0.9,
41
+ "top_k": 40,
42
+ "repeat_penalty": 1.05,
43
+ },
44
  },
45
  "gemma3_270m": {
46
  "name": "Gemma-3 270M",
47
  "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
48
  "filename": "*Q8_0.gguf",
49
  "max_context": 32768,
50
+ "inference_settings": {
51
+ "temperature": 1.0,
52
+ "top_p": 0.95,
53
+ "top_k": 64,
54
+ "repeat_penalty": 1.0,
55
+ },
56
  },
57
  "ernie_300m": {
58
  "name": "ERNIE-4.5 0.3B (131K Context)",
59
  "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
60
  "filename": "*Q8_0.gguf",
61
  "max_context": 131072,
62
+ "inference_settings": {
63
+ "temperature": 0.3,
64
+ "top_p": 0.95,
65
+ "top_k": 30,
66
+ "repeat_penalty": 1.05,
67
+ },
68
  },
69
  "granite_350m": {
70
  "name": "Granite-4.0 350M",
71
  "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
72
  "filename": "*Q8_0.gguf",
73
  "max_context": 32768,
74
+ "inference_settings": {
75
+ "temperature": 0.0,
76
+ "top_p": 1.0,
77
+ "top_k": 0,
78
+ "repeat_penalty": 1.05,
79
+ },
80
  },
81
  "lfm2_350m": {
82
  "name": "LFM2 350M",
83
  "repo_id": "LiquidAI/LFM2-350M-GGUF",
84
  "filename": "*Q8_0.gguf",
85
  "max_context": 32768,
86
+ "inference_settings": {
87
+ "temperature": 0.1,
88
+ "top_p": 0.1,
89
+ "top_k": 50,
90
+ "repeat_penalty": 1.05,
91
+ },
92
  },
93
  "bitcpm4_500m": {
94
  "name": "BitCPM4 0.5B (128K Context)",
95
  "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
96
  "filename": "*q4_0.gguf",
97
  "max_context": 131072,
98
+ "inference_settings": {
99
+ "temperature": 0.3,
100
+ "top_p": 0.95,
101
+ "top_k": 30,
102
+ "repeat_penalty": 1.05,
103
+ },
104
  },
105
  "hunyuan_500m": {
106
  "name": "Hunyuan 0.5B (256K Context)",
107
  "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
108
  "filename": "*Q8_0.gguf",
109
  "max_context": 262144,
110
+ "inference_settings": {
111
+ "temperature": 0.3,
112
+ "top_p": 0.95,
113
+ "top_k": 30,
114
+ "repeat_penalty": 1.05,
115
+ },
116
  },
117
  "qwen3_600m_q4": {
118
  "name": "Qwen3 0.6B Q4 (Default)",
 
120
  "filename": "*Q4_K_M.gguf",
121
  "max_context": 32768,
122
  "supports_toggle": True,
123
+ "inference_settings": {
124
+ "temperature": 0.6,
125
+ "top_p": 0.95,
126
+ "top_k": 20,
127
+ "repeat_penalty": 1.05,
128
+ },
129
  },
130
  "falcon_h1_1.5b_q4": {
131
  "name": "Falcon-H1 1.5B Q4",
132
  "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
133
  "filename": "*Q4_K_M.gguf",
134
  "max_context": 32768,
135
+ "inference_settings": {
136
+ "temperature": 0.1,
137
+ "top_p": 0.9,
138
+ "top_k": 40,
139
+ "repeat_penalty": 1.05,
140
+ },
141
  },
142
  "qwen3_1.7b_q4": {
143
  "name": "Qwen3 1.7B Q4",
 
145
  "filename": "*Q4_K_M.gguf",
146
  "max_context": 32768,
147
  "supports_toggle": True,
148
+ "inference_settings": {
149
+ "temperature": 0.6,
150
+ "top_p": 0.95,
151
+ "top_k": 20,
152
+ "repeat_penalty": 1.05,
153
+ },
154
  },
155
  }
156
 
 
257
  return n_ctx, warning
258
 
259
 
260
+ def get_model_info(model_key: str) -> Tuple[str, str, float, int]:
261
+ """Get model information and inference settings for UI display.
262
+
263
+ Returns:
264
+ Tuple of (info_text, temperature, top_p, top_k)
265
+ """
266
  m = AVAILABLE_MODELS[model_key]
267
  usable_ctx = min(m["max_context"], MAX_USABLE_CTX)
268
+ settings = m["inference_settings"]
269
+
270
+ info_text = (
271
  f"**{m['name']}**\n\n"
272
  f"- Max context: {m['max_context']:,} tokens "
273
  f"(capped at {usable_ctx:,} for performance)\n"
274
  f"- Repo: `{m['repo_id']}`\n"
275
+ f"- Quant: `{m['filename']}`\n"
276
+ f"- Temperature: {settings['temperature']} (locked)\n"
277
+ f"- Top P: {settings['top_p']}, Top K: {settings['top_k']}"
278
  )
279
+
280
+ return info_text, str(settings["temperature"]), settings["top_p"], settings["top_k"]
281
 
282
 
283
  def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
 
326
  model_key: str,
327
  enable_reasoning: bool = True,
328
  max_tokens: int = 2048,
329
+ top_p: float = None,
330
+ top_k: int = None,
331
  ) -> Generator[Tuple[str, str, str], None, None]:
332
  """
333
  Stream summary generation from uploaded file.
 
337
  model_key: Model identifier from AVAILABLE_MODELS
338
  enable_reasoning: Whether to use reasoning mode (/think) for Qwen3 models
339
  max_tokens: Maximum tokens to generate
340
+ top_p: Nucleus sampling parameter (uses model default if None)
341
+ top_k: Top-k sampling parameter (uses model default if None)
342
 
343
  Yields:
344
  Tuple of (thinking_text, summary_text, info_text)
 
408
  {"role": "user", "content": f"請總結以下內容:\n\n{transcript}"},
409
  ]
410
 
411
+ # Get model-specific inference settings
412
+ inference_settings = model["inference_settings"]
413
+ temperature = inference_settings["temperature"]
414
+ final_top_p = top_p if top_p is not None else inference_settings["top_p"]
415
+ final_top_k = top_k if top_k is not None else inference_settings["top_k"]
416
+ repeat_penalty = inference_settings["repeat_penalty"]
417
+
418
  # Stream - NO stop= parameter, let GGUF metadata handle it
419
  full_response = ""
420
  current_thinking = ""
421
  current_summary = ""
422
 
423
  try:
424
+ # Apply model-specific inference settings
 
425
  stream = llm.create_chat_completion(
426
  messages=messages,
427
  max_tokens=max_tokens,
428
  temperature=temperature,
429
  min_p=0.0,
430
+ top_p=final_top_p,
431
+ top_k=final_top_k,
432
+ repeat_penalty=repeat_penalty,
433
  stream=True,
434
  )
435
 
 
686
  info="Qwen3 only: uses /think for deeper analysis (slower) or /no_think for direct output (faster). Enabled by default.",
687
  interactive=True,
688
  )
689
+ temperature_display = gr.Textbox(
690
+ label="Temperature (Locked)",
691
+ value="0.6",
692
+ interactive=False,
693
+ info="Set by model's recommended settings. Cannot be changed."
694
+ )
695
  max_tokens = gr.Slider(
696
  minimum=256,
697
  maximum=4096,
 
700
  label="Max Output Tokens",
701
  info="Higher = more detailed summary"
702
  )
703
+ top_p = gr.Slider(
704
+ minimum=0.0,
705
  maximum=1.0,
706
+ value=0.95,
707
+ step=0.05,
708
+ label="Top P (Nucleus Sampling)",
709
+ info="Lower = more focused, Higher = more diverse"
710
+ )
711
+ top_k = gr.Slider(
712
+ minimum=0,
713
+ maximum=100,
714
+ value=20,
715
+ step=5,
716
+ label="Top K",
717
+ info="Limits token selection to top K tokens (0 = disabled)"
718
  )
719
 
720
  submit_btn = gr.Button(
 
727
  with gr.Group():
728
  gr.HTML('<div class="section-header"><span class="section-icon">📊</span> Model Information</div>')
729
  info_output = gr.Markdown(
730
+ value=get_model_info(DEFAULT_MODEL_KEY)[0],
731
  elem_classes=["stats-grid"]
732
  )
733
 
 
756
  # Event handlers
757
  submit_btn.click(
758
  fn=summarize_streaming,
759
+ inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, top_p, top_k],
760
  outputs=[thinking_output, summary_output, info_output],
761
  show_progress="full"
762
  )
763
 
764
  model_dropdown.change(
765
+ fn=get_model_info,
766
  inputs=[model_dropdown],
767
+ outputs=[info_output, temperature_display, top_p, top_k],
768
  )
769
 
770
  # Footer