Spaces:
Sleeping
Sleeping
Commit
·
d2505af
1
Parent(s):
c45eea2
Reduce tokens for faster CPU inference
Browse files
- app/ora_server.py +1 -1
app/ora_server.py
CHANGED
|
@@ -98,7 +98,7 @@ async def chat_endpoint(req: ChatRequest):
|
|
| 98 |
|
| 99 |
outputs = model.generate(
|
| 100 |
input_ids,
|
| 101 |
-
max_new_tokens=
|
| 102 |
eos_token_id=terminators,
|
| 103 |
do_sample=True,
|
| 104 |
temperature=0.7,
|
|
|
|
| 98 |
|
| 99 |
outputs = model.generate(
|
| 100 |
input_ids,
|
| 101 |
+
max_new_tokens=128, # Reduced for faster CPU inference
|
| 102 |
eos_token_id=terminators,
|
| 103 |
do_sample=True,
|
| 104 |
temperature=0.7,
|