Spaces: Running

Commit: Increase max tokens to 16384 for reasoning models
Browse files

- Increased MAX_NEW_TOKENS from 2048 to 16384
- Added max tokens slider UI (1024-32768) for user control
- Updated generate_response to accept max_tokens parameter
- Fixes truncated answers from reasoning models like DeepSeek/Kimi

app.py (CHANGED)
@@ -21,6 +21,7 @@ from config import (
     AVAILABLE_MODELS,
     DEFAULT_MODEL,
     MAX_NEW_TOKENS,
+    MAX_TOKENS_UI_LIMIT,
     TEMPERATURE,
     TOP_P,
     REPETITION_PENALTY,
@@ -73,6 +74,7 @@ def generate_response(
     message: str,
     history: list,
     model_name: str = DEFAULT_MODEL,
+    max_tokens: int = MAX_NEW_TOKENS,
 ) -> Generator[str, None, None]:
     """Generate streaming response from the model."""

@@ -106,7 +108,7 @@ def generate_response(
     stream = client.chat_completion(
         model=model_id,
         messages=messages,
-        max_tokens=MAX_NEW_TOKENS,
+        max_tokens=max_tokens,
         temperature=TEMPERATURE,
         top_p=TOP_P,
         stream=True,
@@ -261,6 +263,15 @@ def create_app():
         info="💡 Larger models (DeepSeek, Kimi, MiniMax) take longer than smaller models (Llama, Mistral, GLM) which are faster.",
         scale=2,
     )
+    max_tokens_slider = gr.Slider(
+        minimum=1024,
+        maximum=MAX_TOKENS_UI_LIMIT,
+        value=MAX_NEW_TOKENS,
+        step=1024,
+        label="📝 Max Tokens",
+        info="Increase for longer responses from reasoning models",
+        scale=1,
+    )

     # Main Chat Interface
     chatbot = gr.Chatbot(
@@ -318,7 +329,7 @@ def create_app():
             return "", history
         return "", history + [{"role": "user", "content": message}]

-    def bot_response(history, selected_model):
+    def bot_response(history, selected_model, max_tokens):
         """Generate bot response with streaming."""
         if not history:
             return history, "", ""
@@ -329,6 +340,7 @@ def create_app():
             user_msg,
             history[:-1],
             selected_model,
+            int(max_tokens),
         ):
             yield history + [{"role": "assistant", "content": response}], response, user_msg

@@ -360,7 +372,7 @@ def create_app():
         queue=False,
     ).then(
         bot_response,
-        [chatbot, model_selector],
+        [chatbot, model_selector, max_tokens_slider],
         [chatbot, last_response, last_query],
     )

@@ -371,7 +383,7 @@ def create_app():
         queue=False,
     ).then(
         bot_response,
-        [chatbot, model_selector],
+        [chatbot, model_selector, max_tokens_slider],
         [chatbot, last_response, last_query],
    )
config.py (CHANGED)

@@ -38,7 +38,9 @@ AVAILABLE_MODELS = {
 DEFAULT_MODEL = "DeepSeek V3.2"

 # Generation Parameters
-MAX_NEW_TOKENS = 2048
+# Increased for reasoning models that need more tokens for thinking + answer
+MAX_NEW_TOKENS = 16384
+MAX_TOKENS_UI_LIMIT = 32768  # Maximum allowed via UI slider
 TEMPERATURE = 0.3
 TOP_P = 0.9
 REPETITION_PENALTY = 1.1