Ilke Ileri committed
Commit 0ccd1fa · 1 Parent(s): c6b30d3

Add streaming support for Vapi compatibility

Files changed (1)
  1. app.py +32 -2
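
For context, a minimal client-side sketch of the kind of streaming request this commit is meant to serve. The host, port, and route below are assumptions for illustration; only the "messages" and "stream" fields come from the diff that follows.

import json
import requests

# Assumed local endpoint; adjust to wherever the app is actually served.
url = "http://localhost:7860/v1/chat/completions"

resp = requests.post(
    url,
    json={
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,  # triggers the new SSE branch added in this commit
    },
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    # SSE frames arrive as "data: {...}" lines; the stream ends with "data: [DONE]".
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
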
app.py CHANGED
@@ -75,6 +75,11 @@ def chat_completions():
 
     try:
         data = request.get_json()
+        print(f"Full request data: {data}")
+
+        # Check if streaming is requested
+        stream = data.get("stream", False)
+
         messages = data.get("messages", [])
 
         # Extract user messages (ignore system messages from Vapi)
@@ -135,9 +140,10 @@ def chat_completions():
         if hasattr(model, 'device'):
             inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
+        print(f"Generating response...")
         outputs = model.generate(
             **inputs,
-            max_new_tokens=256,
+            max_new_tokens=150,  # Reduced from 256 for faster response
             temperature=0.7,
             do_sample=True,
             top_p=0.9,
@@ -146,6 +152,7 @@ def chat_completions():
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=tokenizer.eos_token_id
         )
+        print(f"Response generated!")
 
         full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
 
@@ -153,7 +160,30 @@ def chat_completions():
         response_text = full_response.split("<start_of_turn>model\n")[-1]
         response_text = response_text.replace("<end_of_turn>", "").strip()
 
-        # OpenAI-compatible response format (fully compatible with Vapi)
+        # If streaming requested, return SSE format
+        if stream:
+            def generate():
+                # Send the complete response as a single chunk for simplicity
+                chunk = {
+                    "id": "chatcmpl-" + str(hash(prompt))[-10:],
+                    "object": "chat.completion.chunk",
+                    "created": int(__import__('time').time()),
+                    "model": MODEL_NAME,
+                    "choices": [{
+                        "index": 0,
+                        "delta": {
+                            "role": "assistant",
+                            "content": response_text
+                        },
+                        "finish_reason": "stop"
+                    }]
+                }
+                yield f"data: {__import__('json').dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+
+            return app.response_class(generate(), mimetype='text/event-stream')
+
+        # OpenAI-compatible response format (non-streaming)
         vapi_response = {
             "id": "chatcmpl-" + str(hash(prompt))[-10:],
             "object": "chat.completion",