Ilke Ileri committed on
Commit
b9e9889
·
1 Parent(s): 88b53d1

Optimize for speed: max_tokens=50, greedy decoding for real-time voice

Browse files
Files changed (1) hide show
  1. app.py +3 -6
app.py CHANGED
@@ -157,12 +157,9 @@ def chat_completions():
157
  print(f"Generating response...")
158
  outputs = model.generate(
159
  **inputs,
160
- max_new_tokens=100, # Further reduced for faster response (was 150)
161
- temperature=0.7,
162
- do_sample=True,
163
- top_p=0.9,
164
- num_return_sequences=1,
165
- repetition_penalty=1.15,
166
  pad_token_id=tokenizer.pad_token_id,
167
  eos_token_id=tokenizer.eos_token_id
168
  )
 
157
  print(f"Generating response...")
158
  outputs = model.generate(
159
  **inputs,
160
+ max_new_tokens=50, # Drastically reduced for real-time voice (was 100)
161
+ temperature=0.3, # Lower for faster, more focused responses
162
+ do_sample=False, # Greedy decoding for speed
 
 
 
163
  pad_token_id=tokenizer.pad_token_id,
164
  eos_token_id=tokenizer.eos_token_id
165
  )