Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import time
|
|
|
|
| 3 |
import tempfile
|
| 4 |
from collections import deque
|
| 5 |
|
|
@@ -12,7 +13,7 @@ from google.genai import types
|
|
| 12 |
from faster_whisper import WhisperModel
|
| 13 |
|
| 14 |
from elevenlabs.client import ElevenLabs
|
| 15 |
-
from elevenlabs import save #
|
| 16 |
|
| 17 |
app = Flask(__name__)
|
| 18 |
|
|
@@ -54,10 +55,24 @@ HISTORY = deque(maxlen=MAX_MESSAGES)
|
|
| 54 |
_whisper_model = None
|
| 55 |
|
| 56 |
|
|
|
|
|
|
|
|
|
|
| 57 |
def _client_ip() -> str:
|
| 58 |
return request.headers.get("x-forwarded-for", request.remote_addr or "unknown")
|
| 59 |
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def _get_whisper_model() -> WhisperModel:
|
| 62 |
global _whisper_model
|
| 63 |
if _whisper_model is None:
|
|
@@ -136,6 +151,31 @@ def llm_chat(user_text: str) -> str:
|
|
| 136 |
raise
|
| 137 |
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# -------------------------
|
| 140 |
# Endpoints
|
| 141 |
# -------------------------
|
|
@@ -196,12 +236,64 @@ def chat_text():
|
|
| 196 |
return jsonify({"error": "Gemini call failed"}), 500
|
| 197 |
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
@app.post("/v1/utterance")
|
| 200 |
def utterance_audio_to_audio():
|
| 201 |
"""
|
| 202 |
Accepts: multipart/form-data with field "audio" containing a .wav file
|
| 203 |
Returns: audio/mpeg (mp3)
|
| 204 |
-
|
|
|
|
| 205 |
X-STT-MS, X-LLM-MS, X-TTS-MS, X-TOTAL-MS
|
| 206 |
"""
|
| 207 |
t0 = time.time()
|
|
@@ -223,7 +315,6 @@ def utterance_audio_to_audio():
|
|
| 223 |
print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
|
| 224 |
return jsonify({"error": "Please upload a .wav file"}), 400
|
| 225 |
|
| 226 |
-
# Save uploaded wav
|
| 227 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_in:
|
| 228 |
wav_path = tmp_in.name
|
| 229 |
f.save(wav_path)
|
|
@@ -238,7 +329,7 @@ def utterance_audio_to_audio():
|
|
| 238 |
t_stt = time.time()
|
| 239 |
model = _get_whisper_model()
|
| 240 |
|
| 241 |
-
segments,
|
| 242 |
wav_path,
|
| 243 |
language=WHISPER_LANGUAGE,
|
| 244 |
vad_filter=True,
|
|
@@ -263,21 +354,22 @@ def utterance_audio_to_audio():
|
|
| 263 |
print(f"[/v1/utterance] reply_len={len(reply_text)} llm_ms={llm_ms}")
|
| 264 |
print(f"[/v1/utterance] bot_reply={reply_text!r}")
|
| 265 |
|
| 266 |
-
# ---- TTS
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
| 281 |
|
| 282 |
total_ms = int((time.time() - t0) * 1000)
|
| 283 |
print(f"[/v1/utterance] tts_ms={tts_ms} total_ms={total_ms}")
|
|
@@ -290,7 +382,6 @@ def utterance_audio_to_audio():
|
|
| 290 |
download_name="andy.mp3",
|
| 291 |
conditional=False,
|
| 292 |
)
|
| 293 |
-
# Timing headers (super handy for your client)
|
| 294 |
resp.headers["X-STT-MS"] = str(stt_ms)
|
| 295 |
resp.headers["X-LLM-MS"] = str(llm_ms)
|
| 296 |
resp.headers["X-TTS-MS"] = str(tts_ms)
|
|
@@ -299,12 +390,11 @@ def utterance_audio_to_audio():
|
|
| 299 |
|
| 300 |
except Exception as e:
|
| 301 |
total_ms = int((time.time() - t0) * 1000)
|
| 302 |
-
|
| 303 |
-
print(f"[/v1/utterance] FAIL ip={ip} total_ms={total_ms}")
|
| 304 |
-
return jsonify({"error": "Utterance pipeline failed"}), 500
|
| 305 |
|
| 306 |
finally:
|
| 307 |
-
# cleanup
|
| 308 |
try:
|
| 309 |
os.remove(wav_path)
|
| 310 |
except Exception:
|
|
@@ -331,5 +421,5 @@ if __name__ == "__main__":
|
|
| 331 |
port = int(os.environ.get("PORT", "7860"))
|
| 332 |
print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
|
| 333 |
print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
|
| 334 |
-
print(f"[startup]
|
| 335 |
serve(app, host="0.0.0.0", port=port)
|
|
|
|
| 1 |
import os
|
| 2 |
import time
|
| 3 |
+
import json
|
| 4 |
import tempfile
|
| 5 |
from collections import deque
|
| 6 |
|
|
|
|
| 13 |
from faster_whisper import WhisperModel
|
| 14 |
|
| 15 |
from elevenlabs.client import ElevenLabs
|
| 16 |
+
from elevenlabs import save # saves generator/stream to file
|
| 17 |
|
| 18 |
app = Flask(__name__)
|
| 19 |
|
|
|
|
| 55 |
_whisper_model = None
|
| 56 |
|
| 57 |
|
| 58 |
+
# -------------------------
|
| 59 |
+
# Helpers
|
| 60 |
+
# -------------------------
|
| 61 |
def _client_ip() -> str:
    """Best-effort originating client IP, for logging only.

    Behind a proxy, ``X-Forwarded-For`` may be a comma-separated chain
    ("client, proxy1, proxy2"); take the first (leftmost) entry, which is
    the original client. Falls back to the socket peer address, then
    "unknown".
    """
    forwarded = request.headers.get("x-forwarded-for")
    if forwarded:
        # Leftmost entry is the originating client; the rest are proxies.
        return forwarded.split(",")[0].strip()
    return request.remote_addr or "unknown"
|
| 63 |
|
| 64 |
|
| 65 |
+
def _err_details(e: Exception) -> dict:
|
| 66 |
+
d = {"type": type(e).__name__, "repr": repr(e)}
|
| 67 |
+
for k in ["status_code", "body", "message", "response", "details"]:
|
| 68 |
+
if hasattr(e, k):
|
| 69 |
+
try:
|
| 70 |
+
d[k] = getattr(e, k)
|
| 71 |
+
except Exception:
|
| 72 |
+
pass
|
| 73 |
+
return d
|
| 74 |
+
|
| 75 |
+
|
| 76 |
def _get_whisper_model() -> WhisperModel:
|
| 77 |
global _whisper_model
|
| 78 |
if _whisper_model is None:
|
|
|
|
| 151 |
raise
|
| 152 |
|
| 153 |
|
| 154 |
+
def _tts_to_mp3_file(text: str) -> tuple[str, int]:
    """Synthesize *text* with ElevenLabs and write the result to a temp mp3.

    Returns:
        (mp3_path, tts_ms): path of the generated mp3 file and elapsed
        synthesis time in milliseconds. The caller owns the file and must
        delete it.

    Raises:
        RuntimeError: if the ElevenLabs client is not configured.
        Exception: whatever the ElevenLabs SDK raises on failure.
    """
    if eleven is None:
        raise RuntimeError("Server missing ELEVEN_API_KEY")

    t0 = time.time()

    audio_stream = eleven.text_to_speech.convert(
        text=text,
        voice_id=ELEVEN_VOICE_ID,
        model_id=ELEVEN_MODEL_ID,
        output_format=ELEVEN_OUTPUT_FORMAT,
    )

    # Create the temp file first (closed immediately); save() then writes
    # the streamed audio into it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_out:
        mp3_path = tmp_out.name

    try:
        save(audio_stream, mp3_path)
    except Exception:
        # On failure the caller never learns mp3_path (we raise before
        # returning it), so it cannot clean up — remove the temp file here
        # to avoid leaking one file per failed synthesis.
        try:
            os.remove(mp3_path)
        except OSError:
            pass
        raise

    tts_ms = int((time.time() - t0) * 1000)
    return mp3_path, tts_ms
|
| 177 |
+
|
| 178 |
+
|
| 179 |
# -------------------------
|
| 180 |
# Endpoints
|
| 181 |
# -------------------------
|
|
|
|
| 236 |
return jsonify({"error": "Gemini call failed"}), 500
|
| 237 |
|
| 238 |
|
| 239 |
+
@app.post("/v1/tts")
def tts_only():
    """
    Text-to-speech endpoint (no STT/LLM stages).

    JSON body: { "text": "hello" }
    Returns: audio/mpeg (mp3)
    Timing headers:
        X-TTS-MS, X-TOTAL-MS
    """
    ip = _client_ip()
    t0 = time.time()

    # silent=True: tolerate a missing/invalid JSON body instead of raising.
    data = request.get_json(silent=True) or {}
    text = (data.get("text") or "").strip()

    print(f"[/v1/tts] START {time.strftime('%Y-%m-%d %H:%M:%S')} ip={ip} text_len={len(text)}")

    if not text:
        return jsonify({"error": "Missing 'text'"}), 400

    # Set only once synthesis succeeds; checked in `finally` for cleanup.
    mp3_path = None
    try:
        mp3_path, tts_ms = _tts_to_mp3_file(text)
        total_ms = int((time.time() - t0) * 1000)

        print(f"[/v1/tts] OK tts_ms={tts_ms} total_ms={total_ms}")

        resp = send_file(
            mp3_path,
            mimetype="audio/mpeg",
            as_attachment=False,
            download_name="andy.mp3",
            conditional=False,
        )
        resp.headers["X-TTS-MS"] = str(tts_ms)
        resp.headers["X-TOTAL-MS"] = str(total_ms)
        return resp

    except Exception as e:
        # 502: the failure is upstream (ElevenLabs), not a client error.
        details = _err_details(e)
        total_ms = int((time.time() - t0) * 1000)
        print(f"[/v1/tts] FAIL total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
        return jsonify({"error": "ElevenLabs TTS failed", "details": details, "total_ms": total_ms}), 502

    finally:
        # NOTE(review): the temp mp3 is unlinked here, which runs before the
        # response body is streamed to the client. send_file opens the file
        # when the response is built, so the open handle survives the unlink
        # on POSIX; this would presumably break on Windows — confirm the
        # deployment target.
        if mp3_path:
            try:
                os.remove(mp3_path)
            except Exception:
                pass
|
| 288 |
+
|
| 289 |
+
|
| 290 |
@app.post("/v1/utterance")
|
| 291 |
def utterance_audio_to_audio():
|
| 292 |
"""
|
| 293 |
Accepts: multipart/form-data with field "audio" containing a .wav file
|
| 294 |
Returns: audio/mpeg (mp3)
|
| 295 |
+
|
| 296 |
+
Timing headers:
|
| 297 |
X-STT-MS, X-LLM-MS, X-TTS-MS, X-TOTAL-MS
|
| 298 |
"""
|
| 299 |
t0 = time.time()
|
|
|
|
| 315 |
print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
|
| 316 |
return jsonify({"error": "Please upload a .wav file"}), 400
|
| 317 |
|
|
|
|
| 318 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_in:
|
| 319 |
wav_path = tmp_in.name
|
| 320 |
f.save(wav_path)
|
|
|
|
| 329 |
t_stt = time.time()
|
| 330 |
model = _get_whisper_model()
|
| 331 |
|
| 332 |
+
segments, _info = model.transcribe(
|
| 333 |
wav_path,
|
| 334 |
language=WHISPER_LANGUAGE,
|
| 335 |
vad_filter=True,
|
|
|
|
| 354 |
print(f"[/v1/utterance] reply_len={len(reply_text)} llm_ms={llm_ms}")
|
| 355 |
print(f"[/v1/utterance] bot_reply={reply_text!r}")
|
| 356 |
|
| 357 |
+
# ---- TTS ----
|
| 358 |
+
try:
|
| 359 |
+
mp3_path, tts_ms = _tts_to_mp3_file(reply_text)
|
| 360 |
+
except Exception as e:
|
| 361 |
+
details = _err_details(e)
|
| 362 |
+
total_ms = int((time.time() - t0) * 1000)
|
| 363 |
+
print(f"[/v1/utterance] TTS FAIL total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
|
| 364 |
+
return jsonify({
|
| 365 |
+
"error": "ElevenLabs TTS failed",
|
| 366 |
+
"details": details,
|
| 367 |
+
"transcript": transcript,
|
| 368 |
+
"reply_text": reply_text,
|
| 369 |
+
"stt_ms": stt_ms,
|
| 370 |
+
"llm_ms": llm_ms,
|
| 371 |
+
"total_ms": total_ms,
|
| 372 |
+
}), 502
|
| 373 |
|
| 374 |
total_ms = int((time.time() - t0) * 1000)
|
| 375 |
print(f"[/v1/utterance] tts_ms={tts_ms} total_ms={total_ms}")
|
|
|
|
| 382 |
download_name="andy.mp3",
|
| 383 |
conditional=False,
|
| 384 |
)
|
|
|
|
| 385 |
resp.headers["X-STT-MS"] = str(stt_ms)
|
| 386 |
resp.headers["X-LLM-MS"] = str(llm_ms)
|
| 387 |
resp.headers["X-TTS-MS"] = str(tts_ms)
|
|
|
|
| 390 |
|
| 391 |
except Exception as e:
|
| 392 |
total_ms = int((time.time() - t0) * 1000)
|
| 393 |
+
details = _err_details(e)
|
| 394 |
+
print(f"[/v1/utterance] FAIL ip={ip} total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
|
| 395 |
+
return jsonify({"error": "Utterance pipeline failed", "details": details, "total_ms": total_ms}), 500
|
| 396 |
|
| 397 |
finally:
|
|
|
|
| 398 |
try:
|
| 399 |
os.remove(wav_path)
|
| 400 |
except Exception:
|
|
|
|
| 421 |
port = int(os.environ.get("PORT", "7860"))
|
| 422 |
print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
|
| 423 |
print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
|
| 424 |
+
print(f"[startup] eleven_ok={bool(ELEVEN_API_KEY)} voice={ELEVEN_VOICE_ID} model={ELEVEN_MODEL_ID} out={ELEVEN_OUTPUT_FORMAT}")
|
| 425 |
serve(app, host="0.0.0.0", port=port)
|