Abdalkaderdev committed on
Commit
d2505af
·
1 Parent(s): c45eea2

Reduce tokens for faster CPU inference

Browse files
Files changed (1) hide show
  1. app/ora_server.py +1 -1
app/ora_server.py CHANGED
@@ -98,7 +98,7 @@ async def chat_endpoint(req: ChatRequest):
98
 
99
  outputs = model.generate(
100
  input_ids,
101
- max_new_tokens=256,
102
  eos_token_id=terminators,
103
  do_sample=True,
104
  temperature=0.7,
 
98
 
99
  outputs = model.generate(
100
  input_ids,
101
+ max_new_tokens=128, # Reduced for faster CPU inference
102
  eos_token_id=terminators,
103
  do_sample=True,
104
  temperature=0.7,