Spaces:
Sleeping
Sleeping
Ilke Ileri
committed on
Commit
·
190133f
1
Parent(s):
39bb917
Optimize for speed: reduce to 80 tokens, lower sampling params, add timing logs
Browse files
app.py
CHANGED
|
@@ -126,17 +126,21 @@ def chat_completions():
|
|
| 126 |
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
| 127 |
|
| 128 |
print(f"Generating response...")
|
|
|
|
|
|
|
|
|
|
| 129 |
outputs = model.generate(
|
| 130 |
**inputs,
|
| 131 |
-
max_new_tokens=150,
|
| 132 |
-
temperature=0.
|
| 133 |
-
do_sample=True,
|
| 134 |
-
top_p=0.
|
| 135 |
-
repetition_penalty=1.1,
|
| 136 |
pad_token_id=tokenizer.pad_token_id,
|
| 137 |
eos_token_id=tokenizer.eos_token_id
|
| 138 |
)
|
| 139 |
-
|
|
|
|
|
|
|
| 140 |
|
| 141 |
full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
|
| 142 |
|
|
|
|
| 126 |
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
| 127 |
|
| 128 |
print(f"Generating response...")
|
| 129 |
+
import time
|
| 130 |
+
start_time = time.time()
|
| 131 |
+
|
| 132 |
outputs = model.generate(
|
| 133 |
**inputs,
|
| 134 |
+
max_new_tokens=80, # Balanced: fast but complete (reduced from 150)
|
| 135 |
+
temperature=0.6, # Slightly lower for faster convergence
|
| 136 |
+
do_sample=True,
|
| 137 |
+
top_p=0.85, # Reduced for faster sampling
|
|
|
|
| 138 |
pad_token_id=tokenizer.pad_token_id,
|
| 139 |
eos_token_id=tokenizer.eos_token_id
|
| 140 |
)
|
| 141 |
+
|
| 142 |
+
elapsed = time.time() - start_time
|
| 143 |
+
print(f"Response generated in {elapsed:.2f}s")
|
| 144 |
|
| 145 |
full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
|
| 146 |
|