david167 committed on
Commit 1ba70a2 · 1 Parent(s): 342694d

Increase max_new_tokens to 8192 for longer, complete responses

Files changed (1): gradio_app.py (+15/-15)
gradio_app.py CHANGED
@@ -222,21 +222,21 @@ def chat_with_model(message, history, temperature, json_mode=False, json_templat
     inputs = {k: v.to(model_device) for k, v in inputs.items()}
 
     with torch.no_grad():
-        outputs = model_manager.model.generate(
-            **inputs,
-            max_new_tokens=2048,  # Reduced but sufficient for JSON responses
-            temperature=temperature,
-            top_p=0.95,
-            do_sample=True,
-            num_beams=1,
-            pad_token_id=model_manager.tokenizer.eos_token_id,
-            eos_token_id=model_manager.tokenizer.eos_token_id,
-            early_stopping=False,  # Disable early stopping
-            repetition_penalty=1.05,  # Lighter repetition penalty
-            no_repeat_ngram_size=0,  # Disable n-gram repetition blocking
-            length_penalty=1.0,  # Neutral length penalty
-            min_new_tokens=50  # Ensure minimum response length
-        )
+        outputs = model_manager.model.generate(
+            **inputs,
+            max_new_tokens=8192,  # Much higher limit for complete responses
+            temperature=temperature,
+            top_p=0.95,
+            do_sample=True,
+            num_beams=1,
+            pad_token_id=model_manager.tokenizer.eos_token_id,
+            eos_token_id=model_manager.tokenizer.eos_token_id,
+            early_stopping=False,  # Disable early stopping
+            repetition_penalty=1.05,  # Lighter repetition penalty
+            no_repeat_ngram_size=0,  # Disable n-gram repetition blocking
+            length_penalty=1.0,  # Neutral length penalty
+            min_new_tokens=50  # Ensure minimum response length
+        )
 
     # Decode response
     generated_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
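
A note beyond the commit itself: 8192 is still a hard cap, so a very long generation can stop mid-sentence without any visible error. Below is a minimal sketch, written against the names used in gradio_app.py (model_manager, inputs, outputs), of how such truncation could be flagged after the generate() call shown above. The helper warn_if_truncated and the constant MAX_NEW_TOKENS are hypothetical, not part of the commit.

import torch

MAX_NEW_TOKENS = 8192  # mirrors the value set in this commit

def warn_if_truncated(inputs: dict, outputs: torch.Tensor) -> bool:
    # For decoder-only models, generate() returns prompt + completion,
    # so the number of new tokens is the growth in sequence length.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs.shape[1] - prompt_len
    if new_tokens >= MAX_NEW_TOKENS:
        # The whole budget was spent without reaching EOS, so the
        # decoded text is likely cut off mid-response.
        print(f"Warning: used all {MAX_NEW_TOKENS} new tokens; response may be truncated.")
        return True
    return False

Calling warn_if_truncated(inputs, outputs) right after generation would surface silent truncation in the server logs instead of only as an oddly ending chat reply.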