tonyassi committed on
Commit
e6f73aa
·
verified ·
1 Parent(s): 63936cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -25
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import time
 
3
  import tempfile
4
  from collections import deque
5
 
@@ -12,7 +13,7 @@ from google.genai import types
12
  from faster_whisper import WhisperModel
13
 
14
  from elevenlabs.client import ElevenLabs
15
- from elevenlabs import save # uses generator streaming under the hood
16
 
17
  app = Flask(__name__)
18
 
@@ -54,10 +55,24 @@ HISTORY = deque(maxlen=MAX_MESSAGES)
54
  _whisper_model = None
55
 
56
 
 
 
 
57
  def _client_ip() -> str:
58
  return request.headers.get("x-forwarded-for", request.remote_addr or "unknown")
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
61
  def _get_whisper_model() -> WhisperModel:
62
  global _whisper_model
63
  if _whisper_model is None:
@@ -136,6 +151,31 @@ def llm_chat(user_text: str) -> str:
136
  raise
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  # -------------------------
140
  # Endpoints
141
  # -------------------------
@@ -196,12 +236,64 @@ def chat_text():
196
  return jsonify({"error": "Gemini call failed"}), 500
197
 
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  @app.post("/v1/utterance")
200
  def utterance_audio_to_audio():
201
  """
202
  Accepts: multipart/form-data with field "audio" containing a .wav file
203
  Returns: audio/mpeg (mp3)
204
- Also includes timing headers:
 
205
  X-STT-MS, X-LLM-MS, X-TTS-MS, X-TOTAL-MS
206
  """
207
  t0 = time.time()
@@ -223,7 +315,6 @@ def utterance_audio_to_audio():
223
  print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
224
  return jsonify({"error": "Please upload a .wav file"}), 400
225
 
226
- # Save uploaded wav
227
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_in:
228
  wav_path = tmp_in.name
229
  f.save(wav_path)
@@ -238,7 +329,7 @@ def utterance_audio_to_audio():
238
  t_stt = time.time()
239
  model = _get_whisper_model()
240
 
241
- segments, info = model.transcribe(
242
  wav_path,
243
  language=WHISPER_LANGUAGE,
244
  vad_filter=True,
@@ -263,21 +354,22 @@ def utterance_audio_to_audio():
263
  print(f"[/v1/utterance] reply_len={len(reply_text)} llm_ms={llm_ms}")
264
  print(f"[/v1/utterance] bot_reply={reply_text!r}")
265
 
266
- # ---- TTS (ElevenLabs) ----
267
- t_tts = time.time()
268
- audio_stream = eleven.text_to_speech.convert(
269
- text=reply_text,
270
- voice_id=ELEVEN_VOICE_ID,
271
- model_id=ELEVEN_MODEL_ID,
272
- output_format=ELEVEN_OUTPUT_FORMAT,
273
- )
274
-
275
- # Save mp3 stream to temp file
276
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_out:
277
- mp3_path = tmp_out.name
278
-
279
- save(audio_stream, mp3_path)
280
- tts_ms = int((time.time() - t_tts) * 1000)
 
281
 
282
  total_ms = int((time.time() - t0) * 1000)
283
  print(f"[/v1/utterance] tts_ms={tts_ms} total_ms={total_ms}")
@@ -290,7 +382,6 @@ def utterance_audio_to_audio():
290
  download_name="andy.mp3",
291
  conditional=False,
292
  )
293
- # Timing headers (super handy for your client)
294
  resp.headers["X-STT-MS"] = str(stt_ms)
295
  resp.headers["X-LLM-MS"] = str(llm_ms)
296
  resp.headers["X-TTS-MS"] = str(tts_ms)
@@ -299,12 +390,11 @@ def utterance_audio_to_audio():
299
 
300
  except Exception as e:
301
  total_ms = int((time.time() - t0) * 1000)
302
- print("Utterance pipeline error:", repr(e))
303
- print(f"[/v1/utterance] FAIL ip={ip} total_ms={total_ms}")
304
- return jsonify({"error": "Utterance pipeline failed"}), 500
305
 
306
  finally:
307
- # cleanup
308
  try:
309
  os.remove(wav_path)
310
  except Exception:
@@ -331,5 +421,5 @@ if __name__ == "__main__":
331
  port = int(os.environ.get("PORT", "7860"))
332
  print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
333
  print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
334
- print(f"[startup] eleven_voice_id={ELEVEN_VOICE_ID} eleven_model_id={ELEVEN_MODEL_ID} out={ELEVEN_OUTPUT_FORMAT}")
335
  serve(app, host="0.0.0.0", port=port)
 
1
  import os
2
  import time
3
+ import json
4
  import tempfile
5
  from collections import deque
6
 
 
13
  from faster_whisper import WhisperModel
14
 
15
  from elevenlabs.client import ElevenLabs
16
+ from elevenlabs import save # saves generator/stream to file
17
 
18
  app = Flask(__name__)
19
 
 
55
  _whisper_model = None
56
 
57
 
58
+ # -------------------------
59
+ # Helpers
60
+ # -------------------------
61
  def _client_ip() -> str:
62
  return request.headers.get("x-forwarded-for", request.remote_addr or "unknown")
63
 
64
 
65
+ def _err_details(e: Exception) -> dict:
66
+ d = {"type": type(e).__name__, "repr": repr(e)}
67
+ for k in ["status_code", "body", "message", "response", "details"]:
68
+ if hasattr(e, k):
69
+ try:
70
+ d[k] = getattr(e, k)
71
+ except Exception:
72
+ pass
73
+ return d
74
+
75
+
76
  def _get_whisper_model() -> WhisperModel:
77
  global _whisper_model
78
  if _whisper_model is None:
 
151
  raise
152
 
153
 
154
def _tts_to_mp3_file(text: str) -> tuple[str, int]:
    """Synthesize *text* with ElevenLabs and write the audio to a temp file.

    Returns:
        (mp3_path, tts_ms): path of a temporary ``.mp3`` file and the
        elapsed synthesis time in milliseconds. The file is created with
        ``delete=False``, so the caller owns it and must remove it.

    Raises:
        RuntimeError: if the ElevenLabs client (``eleven``) is not set up.
        Exception: anything raised by the ElevenLabs call or by ``save``.
            The temporary file is removed before the error propagates.
    """
    if eleven is None:
        raise RuntimeError("Server missing ELEVEN_API_KEY")

    t0 = time.time()

    audio_stream = eleven.text_to_speech.convert(
        text=text,
        voice_id=ELEVEN_VOICE_ID,
        model_id=ELEVEN_MODEL_ID,
        output_format=ELEVEN_OUTPUT_FORMAT,
    )

    # Reserve a path only; the handle is closed so save() can reopen it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_out:
        mp3_path = tmp_out.name

    try:
        # NOTE(review): the stream appears to be consumed here, so API
        # errors may surface at this point rather than at convert() -
        # confirm against the SDK version in use.
        save(audio_stream, mp3_path)
    except BaseException:
        # The path is never handed to the caller on failure, so clean it up
        # here; otherwise every failed synthesis leaks a temp file.
        try:
            os.remove(mp3_path)
        except OSError:
            pass
        raise

    tts_ms = int((time.time() - t0) * 1000)
    return mp3_path, tts_ms
177
+
178
+
179
  # -------------------------
180
  # Endpoints
181
  # -------------------------
 
236
  return jsonify({"error": "Gemini call failed"}), 500
237
 
238
 
239
@app.post("/v1/tts")
def tts_only():
    """
    Text-to-speech only.

    JSON body: { "text": "hello" }
    Returns: audio/mpeg (mp3)
    Timing headers:
      X-TTS-MS, X-TOTAL-MS

    Errors (JSON):
      400 - body is not a JSON object, or "text" is missing / not a
            string / blank
      502 - synthesis or sending the file failed; includes "details"
            and "total_ms"
    """
    ip = _client_ip()
    t0 = time.time()

    # Validate shape before touching it: a JSON array/scalar body or a
    # non-string "text" must yield a 400, not an AttributeError.
    data = request.get_json(silent=True)
    raw_text = data.get("text") if isinstance(data, dict) else None
    text = raw_text.strip() if isinstance(raw_text, str) else ""

    print(f"[/v1/tts] START {time.strftime('%Y-%m-%d %H:%M:%S')} ip={ip} text_len={len(text)}")

    if not text:
        return jsonify({"error": "Missing 'text'"}), 400

    mp3_path = None
    try:
        mp3_path, tts_ms = _tts_to_mp3_file(text)
        total_ms = int((time.time() - t0) * 1000)

        print(f"[/v1/tts] OK tts_ms={tts_ms} total_ms={total_ms}")

        resp = send_file(
            mp3_path,
            mimetype="audio/mpeg",
            as_attachment=False,
            download_name="andy.mp3",
            conditional=False,
        )
        resp.headers["X-TTS-MS"] = str(tts_ms)
        resp.headers["X-TOTAL-MS"] = str(total_ms)
        return resp

    except Exception as e:
        total_ms = int((time.time() - t0) * 1000)
        # Serialize once with default=str and reuse it for both the log
        # line and the response: the raw details may hold objects that
        # jsonify cannot encode, which would raise inside this handler.
        details_json = json.dumps(_err_details(e), default=str)
        details = json.loads(details_json)
        print(f"[/v1/tts] FAIL total_ms={total_ms} details={details_json[:2000]}")
        return jsonify({"error": "ElevenLabs TTS failed", "details": details, "total_ms": total_ms}), 502

    finally:
        # Best-effort cleanup of the temp file.
        # NOTE(review): this runs before the response body is sent, so it
        # relies on send_file having already opened the file - confirm this
        # holds on the deployment platform.
        if mp3_path:
            try:
                os.remove(mp3_path)
            except OSError:
                pass
288
+
289
+
290
  @app.post("/v1/utterance")
291
  def utterance_audio_to_audio():
292
  """
293
  Accepts: multipart/form-data with field "audio" containing a .wav file
294
  Returns: audio/mpeg (mp3)
295
+
296
+ Timing headers:
297
  X-STT-MS, X-LLM-MS, X-TTS-MS, X-TOTAL-MS
298
  """
299
  t0 = time.time()
 
315
  print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
316
  return jsonify({"error": "Please upload a .wav file"}), 400
317
 
 
318
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_in:
319
  wav_path = tmp_in.name
320
  f.save(wav_path)
 
329
  t_stt = time.time()
330
  model = _get_whisper_model()
331
 
332
+ segments, _info = model.transcribe(
333
  wav_path,
334
  language=WHISPER_LANGUAGE,
335
  vad_filter=True,
 
354
  print(f"[/v1/utterance] reply_len={len(reply_text)} llm_ms={llm_ms}")
355
  print(f"[/v1/utterance] bot_reply={reply_text!r}")
356
 
357
+ # ---- TTS ----
358
+ try:
359
+ mp3_path, tts_ms = _tts_to_mp3_file(reply_text)
360
+ except Exception as e:
361
+ details = _err_details(e)
362
+ total_ms = int((time.time() - t0) * 1000)
363
+ print(f"[/v1/utterance] TTS FAIL total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
364
+ return jsonify({
365
+ "error": "ElevenLabs TTS failed",
366
+ "details": details,
367
+ "transcript": transcript,
368
+ "reply_text": reply_text,
369
+ "stt_ms": stt_ms,
370
+ "llm_ms": llm_ms,
371
+ "total_ms": total_ms,
372
+ }), 502
373
 
374
  total_ms = int((time.time() - t0) * 1000)
375
  print(f"[/v1/utterance] tts_ms={tts_ms} total_ms={total_ms}")
 
382
  download_name="andy.mp3",
383
  conditional=False,
384
  )
 
385
  resp.headers["X-STT-MS"] = str(stt_ms)
386
  resp.headers["X-LLM-MS"] = str(llm_ms)
387
  resp.headers["X-TTS-MS"] = str(tts_ms)
 
390
 
391
  except Exception as e:
392
  total_ms = int((time.time() - t0) * 1000)
393
+ details = _err_details(e)
394
+ print(f"[/v1/utterance] FAIL ip={ip} total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
395
+ return jsonify({"error": "Utterance pipeline failed", "details": details, "total_ms": total_ms}), 500
396
 
397
  finally:
 
398
  try:
399
  os.remove(wav_path)
400
  except Exception:
 
421
  port = int(os.environ.get("PORT", "7860"))
422
  print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
423
  print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
424
+ print(f"[startup] eleven_ok={bool(ELEVEN_API_KEY)} voice={ELEVEN_VOICE_ID} model={ELEVEN_MODEL_ID} out={ELEVEN_OUTPUT_FORMAT}")
425
  serve(app, host="0.0.0.0", port=port)