Ilke Ileri committed on
Commit
190133f
·
1 Parent(s): 39bb917

Optimize for speed: reduce to 80 tokens, lower sampling params, add timing logs

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -126,17 +126,21 @@ def chat_completions():
126
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
127
 
128
  print(f"Generating response...")
 
 
 
129
  outputs = model.generate(
130
  **inputs,
131
- max_new_tokens=150, # Increased for complete appointment booking responses
132
- temperature=0.7, # Higher for more natural conversation
133
- do_sample=True, # Enable sampling for variety
134
- top_p=0.9,
135
- repetition_penalty=1.1,
136
  pad_token_id=tokenizer.pad_token_id,
137
  eos_token_id=tokenizer.eos_token_id
138
  )
139
- print(f"Response generated!")
 
 
140
 
141
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
142
 
 
126
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
127
 
128
  print(f"Generating response...")
129
+ import time
130
+ start_time = time.time()
131
+
132
  outputs = model.generate(
133
  **inputs,
134
+ max_new_tokens=80, # Balanced: fast but complete (reduced from 150)
135
+ temperature=0.6, # Slightly lower for faster convergence
136
+ do_sample=True,
137
+ top_p=0.85, # Reduced for faster sampling
 
138
  pad_token_id=tokenizer.pad_token_id,
139
  eos_token_id=tokenizer.eos_token_id
140
  )
141
+
142
+ elapsed = time.time() - start_time
143
+ print(f"Response generated in {elapsed:.2f}s")
144
 
145
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
146