Spaces:
Sleeping
Sleeping
Ilke Ileri
committed on
Commit
·
b9e9889
1
Parent(s):
88b53d1
Optimize for speed: max_tokens=50, greedy decoding for real-time voice
Browse files
app.py
CHANGED
|
@@ -157,12 +157,9 @@ def chat_completions():
|
|
| 157 |
print(f"Generating response...")
|
| 158 |
outputs = model.generate(
|
| 159 |
**inputs,
|
| 160 |
-
max_new_tokens=
|
| 161 |
-
temperature=0.
|
| 162 |
-
do_sample=
|
| 163 |
-
top_p=0.9,
|
| 164 |
-
num_return_sequences=1,
|
| 165 |
-
repetition_penalty=1.15,
|
| 166 |
pad_token_id=tokenizer.pad_token_id,
|
| 167 |
eos_token_id=tokenizer.eos_token_id
|
| 168 |
)
|
|
|
|
| 157 |
print(f"Generating response...")
|
| 158 |
outputs = model.generate(
|
| 159 |
**inputs,
|
| 160 |
+
max_new_tokens=50, # Drastically reduced for real-time voice (was 100)
|
| 161 |
+
temperature=0.3, # Lower for faster, more focused responses
|
| 162 |
+
do_sample=False, # Greedy decoding for speed
|
|
|
|
|
|
|
|
|
|
| 163 |
pad_token_id=tokenizer.pad_token_id,
|
| 164 |
eos_token_id=tokenizer.eos_token_id
|
| 165 |
)
|