rifatSDAS committed on
Commit
00fa073
·
1 Parent(s): 689e0fd

Increase max tokens to 16384 for reasoning models

Browse files

- Increased MAX_NEW_TOKENS from 2048 to 16384
- Added max tokens slider UI (1024-32768) for user control
- Updated generate_response to accept max_tokens parameter
- Fixes truncated answers from reasoning models like DeepSeek/Kimi

Files changed (2) hide show
  1. app.py +16 -4
  2. config.py +3 -1
app.py CHANGED
@@ -21,6 +21,7 @@ from config import (
21
  AVAILABLE_MODELS,
22
  DEFAULT_MODEL,
23
  MAX_NEW_TOKENS,
 
24
  TEMPERATURE,
25
  TOP_P,
26
  REPETITION_PENALTY,
@@ -73,6 +74,7 @@ def generate_response(
73
  message: str,
74
  history: list,
75
  model_name: str = DEFAULT_MODEL,
 
76
  ) -> Generator[str, None, None]:
77
  """Generate streaming response from the model."""
78
 
@@ -106,7 +108,7 @@ def generate_response(
106
  stream = client.chat_completion(
107
  model=model_id,
108
  messages=messages,
109
- max_tokens=MAX_NEW_TOKENS,
110
  temperature=TEMPERATURE,
111
  top_p=TOP_P,
112
  stream=True,
@@ -261,6 +263,15 @@ def create_app():
261
  info="💡 Larger models (DeepSeek, Kimi, MiniMax) take longer than smaller models (Llama, Mistral, GLM) which are faster.",
262
  scale=2,
263
  )
 
 
 
 
 
 
 
 
 
264
 
265
  # Main Chat Interface
266
  chatbot = gr.Chatbot(
@@ -318,7 +329,7 @@ def create_app():
318
  return "", history
319
  return "", history + [{"role": "user", "content": message}]
320
 
321
- def bot_response(history, selected_model):
322
  """Generate bot response with streaming."""
323
  if not history:
324
  return history, "", ""
@@ -329,6 +340,7 @@ def create_app():
329
  user_msg,
330
  history[:-1],
331
  selected_model,
 
332
  ):
333
  yield history + [{"role": "assistant", "content": response}], response, user_msg
334
 
@@ -360,7 +372,7 @@ def create_app():
360
  queue=False,
361
  ).then(
362
  bot_response,
363
- [chatbot, model_selector],
364
  [chatbot, last_response, last_query],
365
  )
366
 
@@ -371,7 +383,7 @@ def create_app():
371
  queue=False,
372
  ).then(
373
  bot_response,
374
- [chatbot, model_selector],
375
  [chatbot, last_response, last_query],
376
  )
377
 
 
21
  AVAILABLE_MODELS,
22
  DEFAULT_MODEL,
23
  MAX_NEW_TOKENS,
24
+ MAX_TOKENS_UI_LIMIT,
25
  TEMPERATURE,
26
  TOP_P,
27
  REPETITION_PENALTY,
 
74
  message: str,
75
  history: list,
76
  model_name: str = DEFAULT_MODEL,
77
+ max_tokens: int = MAX_NEW_TOKENS,
78
  ) -> Generator[str, None, None]:
79
  """Generate streaming response from the model."""
80
 
 
108
  stream = client.chat_completion(
109
  model=model_id,
110
  messages=messages,
111
+ max_tokens=max_tokens,
112
  temperature=TEMPERATURE,
113
  top_p=TOP_P,
114
  stream=True,
 
263
  info="💡 Larger models (DeepSeek, Kimi, MiniMax) take longer than smaller models (Llama, Mistral, GLM) which are faster.",
264
  scale=2,
265
  )
266
+ max_tokens_slider = gr.Slider(
267
+ minimum=1024,
268
+ maximum=MAX_TOKENS_UI_LIMIT,
269
+ value=MAX_NEW_TOKENS,
270
+ step=1024,
271
+ label="📝 Max Tokens",
272
+ info="Increase for longer responses from reasoning models",
273
+ scale=1,
274
+ )
275
 
276
  # Main Chat Interface
277
  chatbot = gr.Chatbot(
 
329
  return "", history
330
  return "", history + [{"role": "user", "content": message}]
331
 
332
+ def bot_response(history, selected_model, max_tokens):
333
  """Generate bot response with streaming."""
334
  if not history:
335
  return history, "", ""
 
340
  user_msg,
341
  history[:-1],
342
  selected_model,
343
+ int(max_tokens),
344
  ):
345
  yield history + [{"role": "assistant", "content": response}], response, user_msg
346
 
 
372
  queue=False,
373
  ).then(
374
  bot_response,
375
+ [chatbot, model_selector, max_tokens_slider],
376
  [chatbot, last_response, last_query],
377
  )
378
 
 
383
  queue=False,
384
  ).then(
385
  bot_response,
386
+ [chatbot, model_selector, max_tokens_slider],
387
  [chatbot, last_response, last_query],
388
  )
389
 
config.py CHANGED
@@ -38,7 +38,9 @@ AVAILABLE_MODELS = {
38
  DEFAULT_MODEL = "DeepSeek V3.2"
39
 
40
  # Generation Parameters
41
- MAX_NEW_TOKENS = 2048
 
 
42
  TEMPERATURE = 0.3
43
  TOP_P = 0.9
44
  REPETITION_PENALTY = 1.1
 
38
  DEFAULT_MODEL = "DeepSeek V3.2"
39
 
40
  # Generation Parameters
41
+ # Increased for reasoning models that need more tokens for thinking + answer
42
+ MAX_NEW_TOKENS = 16384
43
+ MAX_TOKENS_UI_LIMIT = 32768 # Maximum allowed via UI slider
44
  TEMPERATURE = 0.3
45
  TOP_P = 0.9
46
  REPETITION_PENALTY = 1.1