Spaces: Running

Commit: Increase max tokens to 16384 for reasoning models
Browse files

- Increased MAX_NEW_TOKENS from 2048 to 16384
- Added max tokens slider UI (1024-32768) for user control
- Updated generate_response to accept max_tokens parameter
- Fixes truncated answers from reasoning models like DeepSeek/Kimi

app.py (CHANGED)
@@ -21,6 +21,7 @@ from config import (
     AVAILABLE_MODELS,
     DEFAULT_MODEL,
     MAX_NEW_TOKENS,
+    MAX_TOKENS_UI_LIMIT,
     TEMPERATURE,
     TOP_P,
     REPETITION_PENALTY,
@@ -73,6 +74,7 @@ def generate_response(
     message: str,
     history: list,
     model_name: str = DEFAULT_MODEL,
+    max_tokens: int = MAX_NEW_TOKENS,
 ) -> Generator[str, None, None]:
     """Generate streaming response from the model."""

@@ -106,7 +108,7 @@ def generate_response(
     stream = client.chat_completion(
         model=model_id,
         messages=messages,
-        max_tokens=MAX_NEW_TOKENS,
+        max_tokens=max_tokens,
         temperature=TEMPERATURE,
         top_p=TOP_P,
         stream=True,
@@ -261,6 +263,15 @@ def create_app():
         info="💡 Larger models (DeepSeek, Kimi, MiniMax) take longer than smaller models (Llama, Mistral, GLM) which are faster.",
         scale=2,
     )
+    max_tokens_slider = gr.Slider(
+        minimum=1024,
+        maximum=MAX_TOKENS_UI_LIMIT,
+        value=MAX_NEW_TOKENS,
+        step=1024,
+        label="📝 Max Tokens",
+        info="Increase for longer responses from reasoning models",
+        scale=1,
+    )

     # Main Chat Interface
     chatbot = gr.Chatbot(
@@ -318,7 +329,7 @@ def create_app():
             return "", history
         return "", history + [{"role": "user", "content": message}]

-    def bot_response(history, selected_model):
+    def bot_response(history, selected_model, max_tokens):
         """Generate bot response with streaming."""
         if not history:
             return history, "", ""
@@ -329,6 +340,7 @@ def create_app():
             user_msg,
             history[:-1],
             selected_model,
+            int(max_tokens),
         ):
             yield history + [{"role": "assistant", "content": response}], response, user_msg

@@ -360,7 +372,7 @@ def create_app():
         queue=False,
     ).then(
         bot_response,
-        [chatbot, model_selector],
+        [chatbot, model_selector, max_tokens_slider],
         [chatbot, last_response, last_query],
     )

@@ -371,7 +383,7 @@ def create_app():
         queue=False,
     ).then(
         bot_response,
-        [chatbot, model_selector],
+        [chatbot, model_selector, max_tokens_slider],
         [chatbot, last_response, last_query],
    )
config.py (CHANGED)

@@ -38,7 +38,9 @@ AVAILABLE_MODELS = {
 DEFAULT_MODEL = "DeepSeek V3.2"

 # Generation Parameters
-MAX_NEW_TOKENS = 2048
+# Increased for reasoning models that need more tokens for thinking + answer
+MAX_NEW_TOKENS = 16384
+MAX_TOKENS_UI_LIMIT = 32768  # Maximum allowed via UI slider
 TEMPERATURE = 0.3
 TOP_P = 0.9
 REPETITION_PENALTY = 1.1