SalexAI committed on
Commit
9e0fd09
·
verified ·
1 Parent(s): 9ab5c56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -53
app.py CHANGED
@@ -1,81 +1,334 @@
1
  import os
2
- import requests
3
- from fastapi import FastAPI, Request
 
 
 
 
 
4
  from fastapi.middleware.cors import CORSMiddleware
5
- from fastapi.responses import JSONResponse
 
6
 
7
# FastAPI application exposing the ephemeral-token proxy endpoints below.
app = FastAPI()

# Allow ScratchX / PenguinMod
# CORS is wide open so browser-hosted Scratch extensions can reach the proxy.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # 🔒 restrict later if desired
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

# Secret key read from the environment; its absence is reported at request
# time by _mint_ephemeral so the app can still boot without it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# OpenAI endpoint that mints short-lived Realtime session tokens.
OPENAI_REALTIME_URL = "https://api.openai.com/v1/realtime/sessions"
 
 
 
 
20
 
 
 
 
 
21
 
22
def _mint_ephemeral(model: str, voice: str):
    """Mint an ephemeral OpenAI Realtime session token.

    Calls the OpenAI /v1/realtime/sessions endpoint and returns the parsed
    JSON session object on success. On any failure (missing API key, HTTP
    error, network error) a JSONResponse with status 500 is returned instead,
    so callers can hand the result straight back to the client.

    Args:
        model: Realtime model name to request the session for.
        voice: Voice preset to bind to the session.
    """
    if not OPENAI_API_KEY:
        return JSONResponse(
            status_code=500,
            content={"error": "OPENAI_API_KEY not set in environment"},
        )

    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
        # Opt-in header required while the Realtime API is in beta.
        "OpenAI-Beta": "realtime=v1",
    }
    body = {"model": model, "voice": voice}

    try:
        # BUGFIX: requests has no default timeout — without one a stalled
        # upstream connection would hang this worker indefinitely.
        r = requests.post(OPENAI_REALTIME_URL, headers=headers, json=body, timeout=30)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        # Surface the failure as a structured 500 instead of a raw traceback.
        return JSONResponse(status_code=500, content={"error": str(e)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
 
 
 
 
 
 
 
44
 
45
- # --- Health endpoints ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
@app.get("/health")
@app.get("/health/")
@app.get("/proxy/health")
@app.get("/proxy/health/")
def health():
    """Liveness probe, reachable with and without the /proxy prefix."""
    return dict(status="ok")
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """HTTP middleware that prints every incoming request for debugging."""
    print(f"[DEBUG] Incoming: {request.method} {request.url.path}")
    return await call_next(request)
 
59
 
 
 
 
 
 
60
 
61
# --- Ephemeral endpoints ---
@app.get("/ephemeral")
@app.get("/ephemeral/")
@app.get("/proxy/ephemeral")
@app.get("/proxy/ephemeral/")
def ephemeral_get(model: str = "gpt-4o-realtime-preview", voice: str = "verse"):
    """GET variant: mint a session using query-string (or default) params."""
    return _mint_ephemeral(model=model, voice=voice)
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
@app.post("/ephemeral")
@app.post("/ephemeral/")
@app.post("/proxy/ephemeral")
@app.post("/proxy/ephemeral/")
async def ephemeral_post(request: Request):
    """POST variant: read model/voice from the JSON body, defaulting on any parse error."""
    default_model = "gpt-4o-realtime-preview"
    default_voice = "verse"
    try:
        payload = await request.json()
        model = payload.get("model", default_model)
        voice = payload.get("voice", default_voice)
    except Exception:
        # Missing/invalid JSON body: fall back to the defaults.
        model, voice = default_model, default_voice
    return _mint_ephemeral(model, voice)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import json
3
+ import asyncio
4
+ import threading
5
+ import numpy as np
6
+ from scipy.signal import resample
7
+
8
+ from fastapi import FastAPI, WebSocket, Request, Response
9
  from fastapi.middleware.cors import CORSMiddleware
10
+ from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse
11
+ from fastapi.staticfiles import StaticFiles
12
 
13
+ # Realtime STT
14
+ from RealtimeSTT import AudioToTextRecorder
15
 
16
+ # ----------------------------
17
+ # App + CORS
18
+ # ----------------------------
19
# ----------------------------
# App + CORS
# ----------------------------
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # tighten if desired
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ----------------------------
# Global recorder (singleton)
# ----------------------------
# AudioToTextRecorder instance; constructed lazily inside _recorder_thread.
recorder = None
# Set once the recorder has been constructed and can accept audio.
recorder_ready = threading.Event()
# Cleared on shutdown to terminate the recorder polling loop.
is_running = True

# Active websocket(s) to stream results to (basic single-client model)
# You can turn this into a set() if you want multi-client broadcast.
active_ws: WebSocket | None = None
main_loop = None  # asyncio loop for scheduling cross-thread sends
39
 
40
+ # ----------------------------
41
+ # RealtimeSTT callbacks
42
+ # ----------------------------
43
async def _send_to_client(payload: dict):
    """Serialize *payload* as JSON and push it to the connected client, if any."""
    global active_ws
    ws = active_ws
    if ws is None:
        return
    try:
        await ws.send_text(json.dumps(payload))
    except Exception:
        # Best effort: the client has probably disconnected already.
        pass
52
+
53
def on_realtime_text(text: str):
    """Recorder-thread callback: forward a stabilized realtime partial transcript.

    Runs on the recorder thread, so the coroutine is scheduled onto the main
    asyncio loop rather than awaited directly.
    """
    global main_loop
    loop = main_loop
    if not loop:
        return
    message = {"type": "realtime", "text": text}
    asyncio.run_coroutine_threadsafe(_send_to_client(message), loop)
61
 
62
+ # ----------------------------
63
+ # Recorder thread
64
+ # ----------------------------
65
def _recorder_thread():
    """Background thread: build the RealtimeSTT recorder and pump final sentences.

    Constructs the AudioToTextRecorder singleton, signals readiness via
    recorder_ready, then loops until shutdown, forwarding each completed
    sentence to the connected websocket client via the main asyncio loop.
    """
    import time  # local import: only this worker thread needs it

    global recorder, is_running, main_loop
    cfg = {
        "spinner": False,
        "use_microphone": False,  # we feed audio via .feed_audio
        "model": "large-v2",      # adjust if your hardware is limited
        "language": "en",
        "silero_sensitivity": 0.4,
        "webrtc_sensitivity": 2,
        "post_speech_silence_duration": 0.7,
        "min_length_of_recording": 0,
        "min_gap_between_recordings": 0,
        "enable_realtime_transcription": True,
        "realtime_processing_pause": 0,
        "realtime_model_type": "tiny.en",  # fast streaming model
        "on_realtime_transcription_stabilized": on_realtime_text,
    }

    recorder = AudioToTextRecorder(**cfg)
    recorder_ready.set()

    # Continuously poll for final sentences and forward them to the client.
    while is_running:
        try:
            full_sentence = recorder.text()
            if full_sentence and main_loop:
                asyncio.run_coroutine_threadsafe(
                    _send_to_client({"type": "fullSentence", "text": full_sentence}),
                    main_loop,
                )
        except Exception:
            # BUGFIX: the original `continue` busy-spun at 100% CPU when
            # recorder.text() failed repeatedly; back off briefly instead.
            time.sleep(0.1)
98
+
99
# Start recorder thread once on startup
@app.on_event("startup")
async def _startup():
    """Capture the running event loop and launch the recorder thread at boot."""
    global main_loop
    main_loop = asyncio.get_running_loop()
    worker = threading.Thread(target=_recorder_thread, daemon=True)
    worker.start()
    # Hold startup (up to 2 minutes) until the STT model has finished loading.
    recorder_ready.wait(timeout=120)
107
+
108
@app.on_event("shutdown")
async def _shutdown():
    """Stop the polling loop and tear down the recorder on app shutdown."""
    global is_running, recorder
    is_running = False
    if not recorder:
        return
    try:
        recorder.stop()
        recorder.shutdown()
    except Exception:
        # Teardown is best-effort; swallow recorder shutdown errors.
        pass
118
+
119
+ # ----------------------------
120
+ # Audio helpers
121
+ # ----------------------------
122
def decode_and_resample(audio_bytes: bytes, orig_sr: int, target_sr: int = 16000) -> bytes:
    """Resample a little-endian PCM16 buffer from orig_sr to target_sr.

    Returns the buffer unchanged when the rates already match, an empty
    bytes object for empty input, and — as a best-effort fallback — the
    original chunk if resampling raises for any reason.
    """
    try:
        samples = np.frombuffer(audio_bytes, dtype=np.int16)
        if orig_sr == target_sr:
            return samples.tobytes()
        if samples.size == 0:
            return b""
        out_len = int(samples.size * target_sr / orig_sr)
        return resample(samples, out_len).astype(np.int16).tobytes()
    except Exception:
        # Best-effort: hand back the raw chunk rather than dropping audio.
        return audio_bytes
137
+
138
# ----------------------------
# WebSocket: /ws
# Frame format: [4-byte little-endian length][UTF-8 JSON metadata][PCM16 payload]
# metadata: {"sampleRate": 48000}
# ----------------------------
@app.websocket("/ws")
async def ws_endpoint(ws: WebSocket):
    """Receive framed PCM16 audio chunks and feed them to the STT recorder.

    Each binary frame is: a 4-byte little-endian metadata length, JSON
    metadata (currently just the client's sampleRate), then raw PCM16
    samples. Malformed frames are skipped silently. Only one client is
    tracked at a time via the module-level active_ws.
    """
    global active_ws
    await ws.accept()
    # NOTE(review): this silently displaces any previously connected client
    # (single-client model) — confirm that is acceptable for your use case.
    active_ws = ws

    # Ensure recorder is ready (warn the client but keep the socket open)
    if not recorder_ready.is_set():
        await ws.send_text(json.dumps({"type": "error", "error": "Recorder not ready"}))

    try:
        while True:
            # Expect a single binary message per chunk
            data = await ws.receive_bytes()

            # Parse metadata length; drop frames too short to hold the header
            if len(data) < 4:
                continue
            meta_len = int.from_bytes(data[:4], byteorder="little", signed=False)
            if 4 + meta_len > len(data):
                # Declared metadata overruns the frame; skip it
                continue

            # Parse metadata JSON; fall back to 48 kHz on any parse error
            meta_json = data[4:4+meta_len].decode("utf-8", errors="ignore")
            try:
                meta = json.loads(meta_json)
                sample_rate = int(meta.get("sampleRate", 48000))
            except Exception:
                sample_rate = 48000

            # PCM16 payload follows the metadata
            chunk = data[4+meta_len:]
            if not chunk:
                continue

            # Convert to 16k mono PCM16
            resampled = decode_and_resample(chunk, sample_rate, 16000)

            # Feed into the recorder
            try:
                recorder.feed_audio(resampled)
            except Exception:
                # recorder not ready or an intermittent error; ignore this chunk
                pass

    except Exception:
        # connection closed or error
        pass
    finally:
        # mark inactive (only if we are still the registered client)
        if active_ws is ws:
            active_ws = None
195
+
196
+ # ----------------------------
197
+ # Health
198
+ # ----------------------------
199
@app.get("/health")
def health():
    """Simple liveness probe."""
    return dict(status="ok")
202
 
203
# ----------------------------
# Frontend: index.html + client JS
# ----------------------------
# Single-page demo client served at "/": captures mic audio with a
# ScriptProcessor node, converts Float32 samples to PCM16LE, prepends the
# [4-byte len][JSON meta] header, and streams frames over the /ws endpoint.
# NOTE: this is a runtime string literal — its content is what the browser
# receives; do not edit it for cosmetic reasons.
INDEX_HTML = """<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<title>Realtime STT (HF Space)</title>
<style>
body { font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif; margin: 24px; }
.row { margin: 12px 0; }
#log { white-space: pre-wrap; background: #111; color: #0f0; padding: 12px; border-radius: 8px; height: 240px; overflow:auto; }
button { padding: 8px 12px; border-radius: 8px; border: 1px solid #888; background: #222; color: #fff; cursor: pointer; }
input[type=number] { width: 100px; }
label { display: inline-block; min-width: 130px; }
</style>
</head>
<body>
<h2>Realtime STT WebSocket Demo</h2>
<div class="row">
<label>Sample Rate</label>
<input id="sr" type="number" value="48000" />
</div>
<div class="row">
<button id="start">Start</button>
<button id="stop">Stop</button>
</div>
<div class="row">
<strong>Live output:</strong>
<div id="log"></div>
</div>

<script>
let ws = null;
let audioCtx = null;
let micStream = null;
let processor = null;
let source = null;

function log(s) {
const el = document.getElementById('log');
el.textContent += s + "\\n";
el.scrollTop = el.scrollHeight;
}

async function start() {
const targetSR = parseInt(document.getElementById('sr').value, 10) || 48000;

// Setup WS
const wsProto = location.protocol === 'https:' ? 'wss' : 'ws';
const wsURL = wsProto + '://' + location.host + '/ws';
ws = new WebSocket(wsURL);
ws.onopen = () => log('WS connected: ' + wsURL);
ws.onmessage = (ev) => {
try {
const msg = JSON.parse(ev.data);
if (msg.type === 'realtime') {
log('[partial] ' + msg.text);
} else if (msg.type === 'fullSentence') {
log('[final] ' + msg.text);
} else if (msg.type === 'error') {
log('[error] ' + msg.error);
} else {
log('[msg] ' + ev.data);
}
} catch (e) {
log('[raw] ' + ev.data);
}
};
ws.onerror = (e) => log('WS error');
ws.onclose = () => log('WS closed');

// Setup audio
audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: targetSR });
micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
source = audioCtx.createMediaStreamSource(micStream);

// ScriptProcessor (deprecated but widely supported)
processor = audioCtx.createScriptProcessor(4096, 1, 1);
processor.onaudioprocess = (e) => {
// Float32 [-1,1] -> PCM16 little-endian
const input = e.inputBuffer.getChannelData(0);
const buf = new ArrayBuffer(input.length * 2);
const view = new DataView(buf);
for (let i = 0; i < input.length; i++) {
let s = Math.max(-1, Math.min(1, input[i]));
view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}

// Build frame: [4-byte meta length][meta JSON][PCM16 payload]
const meta = JSON.stringify({ sampleRate: audioCtx.sampleRate });
const metaBytes = new TextEncoder().encode(meta);
const header = new Uint8Array(4 + metaBytes.length);
const dv = new DataView(header.buffer);
dv.setUint32(0, metaBytes.length, true);
header.set(metaBytes, 4);

// Concatenate header + payload
const payload = new Uint8Array(buf);
const frame = new Uint8Array(header.length + payload.length);
frame.set(header, 0);
frame.set(payload, header.length);

if (ws && ws.readyState === 1) {
ws.send(frame);
}
};

source.connect(processor);
processor.connect(audioCtx.destination);
}

function stop() {
try { if (processor) processor.disconnect(); } catch {}
try { if (source) source.disconnect(); } catch {}
try { if (micStream) micStream.getTracks().forEach(t => t.stop()); } catch {}
try { if (audioCtx) audioCtx.close(); } catch {}
try { if (ws) ws.close(); } catch {}
ws = null; micStream = null; source = null; processor = null; audioCtx = null;
log('stopped.');
}

document.getElementById('start').onclick = () => start().catch(e => log('start error: ' + e.message));
document.getElementById('stop').onclick = () => stop();
</script>
</body>
</html>
"""
331
+
332
@app.get("/")
def index():
    """Serve the embedded single-page demo client."""
    return HTMLResponse(content=INDEX_HTML)