Spaces:
Sleeping
Sleeping
Fix response truncation: disable early stopping, increase token limits to 4096, add debugging logs
Browse files- gradio_app.py +10 -3
gradio_app.py
CHANGED
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -100,7 +100,7 @@ def chat_with_model(message, history, temperature):
     """

     # Generate response using the model directly
-    inputs = model_manager.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=[old value lost in page extraction])
+    inputs = model_manager.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)

     # Force all inputs to the same device as the model
     if model_manager.device == "cuda:0":
@@ -110,13 +110,15 @@ def chat_with_model(message, history, temperature):
     with torch.no_grad():
         outputs = model_manager.model.generate(
             **inputs,
-            max_new_tokens=[old value lost in page extraction],
+            max_new_tokens=4096,
             temperature=temperature,
             top_p=0.95,
             do_sample=True,
             num_beams=1,
             pad_token_id=model_manager.tokenizer.eos_token_id,
-            [removed line — content lost in page extraction]
+            eos_token_id=model_manager.tokenizer.eos_token_id,
+            early_stopping=False,  # Disable early stopping to prevent premature truncation
+            repetition_penalty=1.1  # Add slight repetition penalty to improve quality
         )

     # Decode the generated text and remove the input prompt
@@ -128,6 +130,7 @@ def chat_with_model(message, history, temperature):
         # Find the position after the assistant header
         response_start = full_text.find(assistant_start) + len(assistant_start)
         response = full_text[response_start:].strip()
+        logger.info(f"Extracted response length: {len(response)}")
     else:
         # Fallback: try to remove the original prompt
         try:
@@ -135,6 +138,10 @@ def chat_with_model(message, history, temperature):
         except:
             response = full_text.strip()

+    # Check if response ends abruptly (might indicate truncation)
+    if response and not response.endswith(('.', '!', '?', ':', ';')):
+        logger.warning(f"Response may be truncated - ends with: '{response[-20:]}'")
+
    if not response:
        response = "I couldn't generate a response. Please try a different prompt."

NOTE (reconstruction): the original page showed this change as a split (side-by-side) diff whose text extraction scattered line numbers and dropped the highlighted old-side values; the hunk above merges both sides into one unified diff. Bracketed placeholders mark old-side content that cannot be recovered from the extraction — confirm against the repository history. Indentation inside the hunks is inferred and may differ from the actual file.