Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import time
|
|
|
|
| 3 |
import tempfile
|
| 4 |
from collections import deque
|
| 5 |
|
|
@@ -12,7 +13,7 @@ from google.genai import types
|
|
| 12 |
from faster_whisper import WhisperModel
|
| 13 |
|
| 14 |
from elevenlabs.client import ElevenLabs
|
| 15 |
-
from elevenlabs import save #
|
| 16 |
|
| 17 |
app = Flask(__name__)
|
| 18 |
|
|
@@ -54,10 +55,24 @@ HISTORY = deque(maxlen=MAX_MESSAGES)
|
|
| 54 |
_whisper_model = None
|
| 55 |
|
| 56 |
|
|
|
|
|
|
|
|
|
|
| 57 |
def _client_ip() -> str:
|
| 58 |
return request.headers.get("x-forwarded-for", request.remote_addr or "unknown")
|
| 59 |
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def _get_whisper_model() -> WhisperModel:
|
| 62 |
global _whisper_model
|
| 63 |
if _whisper_model is None:
|
|
@@ -136,6 +151,31 @@ def llm_chat(user_text: str) -> str:
|
|
| 136 |
raise
|
| 137 |
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# -------------------------
|
| 140 |
# Endpoints
|
| 141 |
# -------------------------
|
|
@@ -196,12 +236,64 @@ def chat_text():
|
|
| 196 |
return jsonify({"error": "Gemini call failed"}), 500
|
| 197 |
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
@app.post("/v1/utterance")
|
| 200 |
def utterance_audio_to_audio():
|
| 201 |
"""
|
| 202 |
Accepts: multipart/form-data with field "audio" containing a .wav file
|
| 203 |
Returns: audio/mpeg (mp3)
|
| 204 |
-
|
|
|
|
| 205 |
X-STT-MS, X-LLM-MS, X-TTS-MS, X-TOTAL-MS
|
| 206 |
"""
|
| 207 |
t0 = time.time()
|
|
@@ -223,7 +315,6 @@ def utterance_audio_to_audio():
|
|
| 223 |
print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
|
| 224 |
return jsonify({"error": "Please upload a .wav file"}), 400
|
| 225 |
|
| 226 |
-
# Save uploaded wav
|
| 227 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_in:
|
| 228 |
wav_path = tmp_in.name
|
| 229 |
f.save(wav_path)
|
|
@@ -238,7 +329,7 @@ def utterance_audio_to_audio():
|
|
| 238 |
t_stt = time.time()
|
| 239 |
model = _get_whisper_model()
|
| 240 |
|
| 241 |
-
segments,
|
| 242 |
wav_path,
|
| 243 |
language=WHISPER_LANGUAGE,
|
| 244 |
vad_filter=True,
|
|
@@ -263,21 +354,22 @@ def utterance_audio_to_audio():
|
|
| 263 |
print(f"[/v1/utterance] reply_len={len(reply_text)} llm_ms={llm_ms}")
|
| 264 |
print(f"[/v1/utterance] bot_reply={reply_text!r}")
|
| 265 |
|
| 266 |
-
# ---- TTS
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
| 281 |
|
| 282 |
total_ms = int((time.time() - t0) * 1000)
|
| 283 |
print(f"[/v1/utterance] tts_ms={tts_ms} total_ms={total_ms}")
|
|
@@ -290,7 +382,6 @@ def utterance_audio_to_audio():
|
|
| 290 |
download_name="andy.mp3",
|
| 291 |
conditional=False,
|
| 292 |
)
|
| 293 |
-
# Timing headers (super handy for your client)
|
| 294 |
resp.headers["X-STT-MS"] = str(stt_ms)
|
| 295 |
resp.headers["X-LLM-MS"] = str(llm_ms)
|
| 296 |
resp.headers["X-TTS-MS"] = str(tts_ms)
|
|
@@ -299,12 +390,11 @@ def utterance_audio_to_audio():
|
|
| 299 |
|
| 300 |
except Exception as e:
|
| 301 |
total_ms = int((time.time() - t0) * 1000)
|
| 302 |
-
|
| 303 |
-
print(f"[/v1/utterance] FAIL ip={ip} total_ms={total_ms}")
|
| 304 |
-
return jsonify({"error": "Utterance pipeline failed"}), 500
|
| 305 |
|
| 306 |
finally:
|
| 307 |
-
# cleanup
|
| 308 |
try:
|
| 309 |
os.remove(wav_path)
|
| 310 |
except Exception:
|
|
@@ -331,5 +421,5 @@ if __name__ == "__main__":
|
|
| 331 |
port = int(os.environ.get("PORT", "7860"))
|
| 332 |
print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
|
| 333 |
print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
|
| 334 |
-
print(f"[startup]
|
| 335 |
serve(app, host="0.0.0.0", port=port)
|
|
|
|
| 1 |
import os
|
| 2 |
import time
|
| 3 |
+
import json
|
| 4 |
import tempfile
|
| 5 |
from collections import deque
|
| 6 |
|
|
|
|
| 13 |
from faster_whisper import WhisperModel
|
| 14 |
|
| 15 |
from elevenlabs.client import ElevenLabs
|
| 16 |
+
from elevenlabs import save # saves generator/stream to file
|
| 17 |
|
| 18 |
app = Flask(__name__)
|
| 19 |
|
|
|
|
| 55 |
_whisper_model = None
|
| 56 |
|
| 57 |
|
| 58 |
+
# -------------------------
|
| 59 |
+
# Helpers
|
| 60 |
+
# -------------------------
|
| 61 |
def _client_ip() -> str:
    """Best-effort originating client IP, for logging only.

    Behind a proxy, ``X-Forwarded-For`` may be a comma-separated chain
    ("client, proxy1, proxy2"); take the first (leftmost) entry, which is
    the original client. Falls back to the socket peer address, then
    "unknown".
    """
    forwarded = request.headers.get("x-forwarded-for")
    if forwarded:
        # Leftmost entry is the originating client; the rest are proxies.
        return forwarded.split(",")[0].strip()
    return request.remote_addr or "unknown"
|
| 63 |
|
| 64 |
|
| 65 |
+
def _err_details(e: Exception) -> dict:
|
| 66 |
+
d = {"type": type(e).__name__, "repr": repr(e)}
|
| 67 |
+
for k in ["status_code", "body", "message", "response", "details"]:
|
| 68 |
+
if hasattr(e, k):
|
| 69 |
+
try:
|
| 70 |
+
d[k] = getattr(e, k)
|
| 71 |
+
except Exception:
|
| 72 |
+
pass
|
| 73 |
+
return d
|
| 74 |
+
|
| 75 |
+
|
| 76 |
def _get_whisper_model() -> WhisperModel:
|
| 77 |
global _whisper_model
|
| 78 |
if _whisper_model is None:
|
|
|
|
| 151 |
raise
|
| 152 |
|
| 153 |
|
| 154 |
+
def _tts_to_mp3_file(text: str) -> tuple[str, int]:
    """Synthesize *text* with ElevenLabs and write the result to a temp mp3.

    Returns:
        (mp3_path, tts_ms): path of the generated mp3 file and elapsed
        synthesis time in milliseconds. The caller owns the file and must
        delete it.

    Raises:
        RuntimeError: if the ElevenLabs client is not configured.
        Exception: whatever the ElevenLabs SDK raises on failure.
    """
    if eleven is None:
        raise RuntimeError("Server missing ELEVEN_API_KEY")

    t0 = time.time()

    audio_stream = eleven.text_to_speech.convert(
        text=text,
        voice_id=ELEVEN_VOICE_ID,
        model_id=ELEVEN_MODEL_ID,
        output_format=ELEVEN_OUTPUT_FORMAT,
    )

    # Create the temp file first (closed immediately); save() then writes
    # the streamed audio into it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_out:
        mp3_path = tmp_out.name

    try:
        save(audio_stream, mp3_path)
    except Exception:
        # On failure the caller never learns mp3_path (we raise before
        # returning it), so it cannot clean up — remove the temp file here
        # to avoid leaking one file per failed synthesis.
        try:
            os.remove(mp3_path)
        except OSError:
            pass
        raise

    tts_ms = int((time.time() - t0) * 1000)
    return mp3_path, tts_ms
|
| 177 |
+
|
| 178 |
+
|
| 179 |
# -------------------------
|
| 180 |
# Endpoints
|
| 181 |
# -------------------------
|
|
|
|
| 236 |
return jsonify({"error": "Gemini call failed"}), 500
|
| 237 |
|
| 238 |
|
| 239 |
+
@app.post("/v1/tts")
def tts_only():
    """
    Text-to-speech endpoint (no STT/LLM stages).

    JSON body: { "text": "hello" }
    Returns: audio/mpeg (mp3)
    Timing headers:
        X-TTS-MS, X-TOTAL-MS
    """
    ip = _client_ip()
    t0 = time.time()

    # silent=True: tolerate a missing/invalid JSON body instead of raising.
    data = request.get_json(silent=True) or {}
    text = (data.get("text") or "").strip()

    print(f"[/v1/tts] START {time.strftime('%Y-%m-%d %H:%M:%S')} ip={ip} text_len={len(text)}")

    if not text:
        return jsonify({"error": "Missing 'text'"}), 400

    # Set only once synthesis succeeds; checked in `finally` for cleanup.
    mp3_path = None
    try:
        mp3_path, tts_ms = _tts_to_mp3_file(text)
        total_ms = int((time.time() - t0) * 1000)

        print(f"[/v1/tts] OK tts_ms={tts_ms} total_ms={total_ms}")

        resp = send_file(
            mp3_path,
            mimetype="audio/mpeg",
            as_attachment=False,
            download_name="andy.mp3",
            conditional=False,
        )
        resp.headers["X-TTS-MS"] = str(tts_ms)
        resp.headers["X-TOTAL-MS"] = str(total_ms)
        return resp

    except Exception as e:
        # 502: the failure is upstream (ElevenLabs), not a client error.
        details = _err_details(e)
        total_ms = int((time.time() - t0) * 1000)
        print(f"[/v1/tts] FAIL total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
        return jsonify({"error": "ElevenLabs TTS failed", "details": details, "total_ms": total_ms}), 502

    finally:
        # NOTE(review): the temp mp3 is unlinked here, which runs before the
        # response body is streamed to the client. send_file opens the file
        # when the response is built, so the open handle survives the unlink
        # on POSIX; this would presumably break on Windows — confirm the
        # deployment target.
        if mp3_path:
            try:
                os.remove(mp3_path)
            except Exception:
                pass
|
| 288 |
+
|
| 289 |
+
|
| 290 |
@app.post("/v1/utterance")
|
| 291 |
def utterance_audio_to_audio():
|
| 292 |
"""
|
| 293 |
Accepts: multipart/form-data with field "audio" containing a .wav file
|
| 294 |
Returns: audio/mpeg (mp3)
|
| 295 |
+
|
| 296 |
+
Timing headers:
|
| 297 |
X-STT-MS, X-LLM-MS, X-TTS-MS, X-TOTAL-MS
|
| 298 |
"""
|
| 299 |
t0 = time.time()
|
|
|
|
| 315 |
print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
|
| 316 |
return jsonify({"error": "Please upload a .wav file"}), 400
|
| 317 |
|
|
|
|
| 318 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_in:
|
| 319 |
wav_path = tmp_in.name
|
| 320 |
f.save(wav_path)
|
|
|
|
| 329 |
t_stt = time.time()
|
| 330 |
model = _get_whisper_model()
|
| 331 |
|
| 332 |
+
segments, _info = model.transcribe(
|
| 333 |
wav_path,
|
| 334 |
language=WHISPER_LANGUAGE,
|
| 335 |
vad_filter=True,
|
|
|
|
| 354 |
print(f"[/v1/utterance] reply_len={len(reply_text)} llm_ms={llm_ms}")
|
| 355 |
print(f"[/v1/utterance] bot_reply={reply_text!r}")
|
| 356 |
|
| 357 |
+
# ---- TTS ----
|
| 358 |
+
try:
|
| 359 |
+
mp3_path, tts_ms = _tts_to_mp3_file(reply_text)
|
| 360 |
+
except Exception as e:
|
| 361 |
+
details = _err_details(e)
|
| 362 |
+
total_ms = int((time.time() - t0) * 1000)
|
| 363 |
+
print(f"[/v1/utterance] TTS FAIL total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
|
| 364 |
+
return jsonify({
|
| 365 |
+
"error": "ElevenLabs TTS failed",
|
| 366 |
+
"details": details,
|
| 367 |
+
"transcript": transcript,
|
| 368 |
+
"reply_text": reply_text,
|
| 369 |
+
"stt_ms": stt_ms,
|
| 370 |
+
"llm_ms": llm_ms,
|
| 371 |
+
"total_ms": total_ms,
|
| 372 |
+
}), 502
|
| 373 |
|
| 374 |
total_ms = int((time.time() - t0) * 1000)
|
| 375 |
print(f"[/v1/utterance] tts_ms={tts_ms} total_ms={total_ms}")
|
|
|
|
| 382 |
download_name="andy.mp3",
|
| 383 |
conditional=False,
|
| 384 |
)
|
|
|
|
| 385 |
resp.headers["X-STT-MS"] = str(stt_ms)
|
| 386 |
resp.headers["X-LLM-MS"] = str(llm_ms)
|
| 387 |
resp.headers["X-TTS-MS"] = str(tts_ms)
|
|
|
|
| 390 |
|
| 391 |
except Exception as e:
|
| 392 |
total_ms = int((time.time() - t0) * 1000)
|
| 393 |
+
details = _err_details(e)
|
| 394 |
+
print(f"[/v1/utterance] FAIL ip={ip} total_ms={total_ms} details={json.dumps(details, default=str)[:2000]}")
|
| 395 |
+
return jsonify({"error": "Utterance pipeline failed", "details": details, "total_ms": total_ms}), 500
|
| 396 |
|
| 397 |
finally:
|
|
|
|
| 398 |
try:
|
| 399 |
os.remove(wav_path)
|
| 400 |
except Exception:
|
|
|
|
| 421 |
port = int(os.environ.get("PORT", "7860"))
|
| 422 |
print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
|
| 423 |
print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
|
| 424 |
+
print(f"[startup] eleven_ok={bool(ELEVEN_API_KEY)} voice={ELEVEN_VOICE_ID} model={ELEVEN_MODEL_ID} out={ELEVEN_OUTPUT_FORMAT}")
|
| 425 |
serve(app, host="0.0.0.0", port=port)
|