Update app.py
app.py CHANGED
--- a/app.py
+++ b/app.py
@@ -1,5 +1,6 @@
 import os
 import time
+import tempfile
 from collections import deque
 from flask import Flask, request, jsonify
 from waitress import serve
@@ -7,6 +8,8 @@ from waitress import serve
 from google import genai
 from google.genai import types
 
+from faster_whisper import WhisperModel
+
 app = Flask(__name__)
 
 # -------------------------
@@ -20,6 +23,12 @@ SYSTEM_PROMPT = (
     "Respond in 1-3 sentences and less than 300 characters."
 )
 
+# STT config (we chose base.en)
+WHISPER_MODEL_NAME = os.environ.get("WHISPER_MODEL", "base.en")
+WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
+WHISPER_COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
+WHISPER_LANGUAGE = os.environ.get("WHISPER_LANGUAGE", "en")
+
 # Gemini client (expects GEMINI_API_KEY set as a HF Space Secret)
 client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
 
@@ -27,15 +36,31 @@ client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
 MAX_MESSAGES = 20  # user+assistant messages combined
 HISTORY = deque(maxlen=MAX_MESSAGES)  # holds types.Content objects
 
+# ---- Whisper model (lazy init) ----
+_whisper_model = None
+
 
 def _client_ip() -> str:
-    # HF may proxy requests; this is best-effort
     return request.headers.get("x-forwarded-for", request.remote_addr or "unknown")
 
 
+def _get_whisper_model() -> WhisperModel:
+    global _whisper_model
+    if _whisper_model is None:
+        print(
+            f"[whisper] loading model={WHISPER_MODEL_NAME} "
+            f"device={WHISPER_DEVICE} compute_type={WHISPER_COMPUTE_TYPE}"
+        )
+        _whisper_model = WhisperModel(
+            WHISPER_MODEL_NAME,
+            device=WHISPER_DEVICE,
+            compute_type=WHISPER_COMPUTE_TYPE,
+        )
+        print("[whisper] loaded")
+    return _whisper_model
+
+
 def _gemini_config() -> types.GenerateContentConfig:
-    # NOTE: Setting thresholds to OFF is permissive and may not be honored for all content;
-    # some protections are not adjustable.
     return types.GenerateContentConfig(
         system_instruction=[types.Part.from_text(text=SYSTEM_PROMPT)],
         thinking_config=types.ThinkingConfig(thinking_level=THINKING_LEVEL),
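The lazy loader above defers the faster-whisper download/load to the first request, so Space startup stays fast. A quick way to sanity-check the same settings outside Flask (a minimal sketch; `sample.wav` is a hypothetical local recording):

```python
# Standalone check of the faster-whisper settings used in this commit.
# "sample.wav" is a hypothetical local recording.
from faster_whisper import WhisperModel

model = WhisperModel("base.en", device="cpu", compute_type="int8")
segments, info = model.transcribe("sample.wav", language="en", vad_filter=True, beam_size=1)
print(f"language={info.language} p={info.language_probability:.2f}")
print("".join(seg.text for seg in segments).strip())
```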
@@ -61,21 +86,11 @@ def _gemini_config() -> types.GenerateContentConfig:
 
 
 def llm_chat(user_text: str) -> str:
-    """
-    Updates global HISTORY (user + model), calls Gemini, returns model reply text.
-    Rolls back the last user message if Gemini call fails.
-    """
     user_text = (user_text or "").strip()
     if not user_text:
         raise ValueError("Missing 'text'")
 
-
-    HISTORY.append(
-        types.Content(
-            role="user",
-            parts=[types.Part.from_text(text=user_text)],
-        )
-    )
+    HISTORY.append(types.Content(role="user", parts=[types.Part.from_text(text=user_text)]))
 
     try:
         resp = client.models.generate_content(
@@ -85,18 +100,10 @@ def llm_chat(user_text: str) -> str:
         )
         reply_text = (resp.text or "").strip()
 
-
-        HISTORY.append(
-            types.Content(
-                role="model",
-                parts=[types.Part.from_text(text=reply_text)],
-            )
-        )
-
+        HISTORY.append(types.Content(role="model", parts=[types.Part.from_text(text=reply_text)]))
         return reply_text
 
     except Exception:
-        # Roll back last user message on failure
         if len(HISTORY) > 0 and getattr(HISTORY[-1], "role", None) == "user":
             HISTORY.pop()
         raise
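The flattened appends rely on `deque(maxlen=MAX_MESSAGES)` evicting the oldest message automatically, and the `except` branch pops the unanswered user turn so a failed Gemini call leaves no dangling message. A small illustration of both behaviors (plain `(role, text)` tuples stand in for `types.Content`; `maxlen` is shrunk to 4 to make the eviction visible):

```python
from collections import deque

history = deque(maxlen=4)
for i in range(3):
    history.append(("user", f"q{i}"))
    history.append(("model", f"a{i}"))
# Only the last two exchanges survive the maxlen bound:
assert list(history) == [("user", "q1"), ("model", "a1"),
                         ("user", "q2"), ("model", "a2")]

# Rollback guard: a user turn appended before a failed call is removed again.
history.append(("user", "q3"))  # this append also evicted ("user", "q1")
if history and history[-1][0] == "user":
    history.pop()
assert history[-1] == ("model", "a2")  # the evicted oldest turn is gone for good
```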
@@ -114,6 +121,9 @@ def health():
         "thinking_level": THINKING_LEVEL,
         "memory_messages": len(HISTORY),
         "max_messages": MAX_MESSAGES,
+        "whisper_model": WHISPER_MODEL_NAME,
+        "whisper_device": WHISPER_DEVICE,
+        "whisper_compute_type": WHISPER_COMPUTE_TYPE,
     })
 
 
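With these fields the health endpoint now reports the STT configuration alongside the Gemini settings. A quick probe (assumptions: the route is `/health`, which this diff only shows as `def health():`, and the server is local on the default port 7860):

```python
# Assumes a local server on the default port and a "/health" route.
import requests

info = requests.get("http://localhost:7860/health", timeout=10).json()
print(info.get("whisper_model"), info.get("whisper_device"), info.get("whisper_compute_type"))
```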
@@ -158,14 +168,62 @@ def chat_text():
 
 
 @app.post("/v1/utterance")
-def
+def utterance_to_text():
     """
-
-
+    Accepts: multipart/form-data with field "audio" containing a .wav file
+    Returns: JSON { "text": "<transcript>", "total_ms": <int> }
     """
+    t0 = time.time()
     ip = _client_ip()
-
-
+
+    print(f"[/v1/utterance] START {time.strftime('%Y-%m-%d %H:%M:%S')} ip={ip}")
+
+    if "audio" not in request.files:
+        print(f"[/v1/utterance] ERROR missing file field 'audio' ip={ip}")
+        return jsonify({"error": "Missing file field 'audio'"}), 400
+
+    f = request.files["audio"]
+    filename = (f.filename or "").strip() or "audio.wav"
+
+    if not filename.lower().endswith(".wav"):
+        print(f"[/v1/utterance] ERROR non-wav filename={filename!r} ip={ip}")
+        return jsonify({"error": "Please upload a .wav file"}), 400
+
+    print(f"[/v1/utterance] received filename={filename!r} content_type={f.content_type!r}")
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        tmp_path = tmp.name
+        f.save(tmp_path)
+
+    try:
+        model = _get_whisper_model()
+
+        segments, info = model.transcribe(
+            tmp_path,
+            language=WHISPER_LANGUAGE,
+            vad_filter=True,
+            beam_size=1,  # fast
+        )
+
+        text = "".join(seg.text for seg in segments).strip()
+
+        dt_ms = int((time.time() - t0) * 1000)
+        print(f"[/v1/utterance] transcript_len={len(text)} total_ms={dt_ms}")
+        print(f"[/v1/utterance] transcript={text!r}")
+
+        return jsonify({"text": text, "total_ms": dt_ms})
+
+    except Exception as e:
+        dt_ms = int((time.time() - t0) * 1000)
+        print("Whisper error:", repr(e))
+        print(f"[/v1/utterance] FAIL ip={ip} total_ms={dt_ms}")
+        return jsonify({"error": "STT failed"}), 500
+
+    finally:
+        try:
+            os.remove(tmp_path)
+        except Exception:
+            pass
 
 
 @app.post("/v1/reset")
@@ -182,4 +240,5 @@ def reset():
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", "7860"))
     print(f"[startup] model={MODEL} thinking_level={THINKING_LEVEL} max_messages={MAX_MESSAGES} port={port}")
+    print(f"[startup] whisper_model={WHISPER_MODEL_NAME} device={WHISPER_DEVICE} compute={WHISPER_COMPUTE_TYPE}")
     serve(app, host="0.0.0.0", port=port)
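Once deployed, the new endpoint can be exercised with a plain multipart POST. A minimal client sketch (`BASE_URL` and `utterance.wav` are placeholders; the field name `audio`, the `.wav` requirement, and the response keys come from the handler above):

```python
import requests

BASE_URL = "http://localhost:7860"  # placeholder; point at your Space

with open("utterance.wav", "rb") as fh:  # placeholder input file
    resp = requests.post(
        f"{BASE_URL}/v1/utterance",
        files={"audio": ("utterance.wav", fh, "audio/wav")},
        timeout=120,
    )
resp.raise_for_status()
payload = resp.json()
print(payload["text"], f"({payload['total_ms']} ms)")
```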