"""
realtime_server.py - Optional WebSocket server for real-time audio streaming.

Run separately:
    uvicorn realtime_server:app --host 0.0.0.0 --port 8000
"""

import os
import asyncio
import json
import base64
import sys
import tempfile
from typing import Optional

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import websockets

sys.dont_write_bytecode = True

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")

# Upstream event names that carry a transcript fragment / mark the end of a
# response. The names used by the original code are kept, and the names the
# OpenAI Realtime API is documented to emit are accepted as well.
# NOTE(review): confirm the exact event names against the pinned API version.
_TRANSCRIPT_DELTA_EVENTS = frozenset(
    {"response.transcript.delta", "response.audio_transcript.delta"}
)
_RESPONSE_DONE_EVENTS = frozenset({"response.completed", "response.done"})

app = FastAPI()

# Simple HTML test page
html = """ Realtime Audio Test

Realtime Audio Test

Status: Ready
"""


@app.get("/")
async def get():
    """Serve the static HTML test page."""
    return HTMLResponse(html)


def _transcribe_file(path: str) -> str:
    """Transcribe the audio file at *path* with Whisper and return its text.

    This is a blocking call; run it in a worker thread from async code.
    """
    # Function-local import: the module stays importable without ``openai``.
    from openai import OpenAI

    client = OpenAI(api_key=OPENAI_API_KEY)
    with open(path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language="de",
        )
    return transcript.text


@app.post("/process-audio")
async def process_audio(request: dict):
    """Transcribe base64-encoded audio sent by the frontend.

    Args:
        request: JSON body; the ``"audio"`` key holds the base64 audio data.

    Returns:
        ``{"success": True, "transcript": <text>}`` on success, otherwise
        ``{"success": False, "error": <message>}``. Errors are reported in the
        body rather than raised.
    """
    temp_path = None
    try:
        audio_data = base64.b64decode(request.get("audio", ""))
        if not audio_data:
            # Nothing to transcribe; avoid a pointless upstream API call.
            return {"success": False, "error": "No audio data provided"}

        # Save to a temp file that survives the ``with`` so it can be reopened.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(audio_data)
            temp_path = f.name

        # The OpenAI client call is synchronous; keep it off the event loop so
        # other requests and WebSocket sessions are not stalled meanwhile.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _transcribe_file, temp_path)
        return {"success": True, "transcript": text}
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # Always remove the temp file, including when transcription failed.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                # Best-effort cleanup; a failed unlink must not mask the result.
                pass


async def _process_utterance(
    openai_ws,
    websocket: WebSocket,
    b64_wav: str,
    instructions: Optional[str] = None,
) -> None:
    """Send one utterance upstream and relay the streamed response.

    Appends and commits the audio on ``openai_ws``, requests an audio + text
    response, then forwards ``audio_delta`` / ``transcript_delta`` messages to
    ``websocket`` until the response finishes, at which point a
    ``response_completed`` message with the full transcript and the joined
    audio chunks is sent. An upstream ``error`` event is forwarded as an
    ``error`` message and ends the relay.
    """
    # Append audio buffer
    # NOTE(review): the Realtime API reference describes ``audio`` as a plain
    # base64 string in the session's input format — confirm this payload shape.
    await openai_ws.send(json.dumps({
        "type": "input_audio_buffer.append",
        "audio": {"data": b64_wav, "format": "wav"},
    }))
    # Commit audio
    await openai_ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
    # Request response with audio + text
    await openai_ws.send(json.dumps({
        "type": "response.create",
        "response": {
            "modalities": ["audio", "text"],
            "instructions": instructions or "",
        },
    }))

    audio_chunks = []
    transcript_parts = []

    # Read the stream until the response is finished.
    while True:
        raw = await openai_ws.recv()
        try:
            event = json.loads(raw)
        except ValueError:
            # Not JSON (or undecodable bytes): ignore this frame.
            continue
        if not isinstance(event, dict):
            continue

        etype = event.get("type")
        if etype == "response.audio.delta":
            data = event.get("delta") or event.get("data")
            if data:
                audio_chunks.append(data)
                await websocket.send_text(json.dumps({
                    "type": "audio_delta",
                    "data": data,
                }))
        elif etype in _TRANSCRIPT_DELTA_EVENTS:
            delta = event.get("delta") or ""
            transcript_parts.append(delta)
            await websocket.send_text(json.dumps({
                "type": "transcript_delta",
                "text": delta,
            }))
        elif etype in _RESPONSE_DONE_EVENTS:
            await websocket.send_text(json.dumps({
                "type": "response_completed",
                "transcript": "".join(transcript_parts),
                "audio": "".join(audio_chunks),
            }))
            break
        elif etype == "error":
            # Without this branch an upstream failure would leave the loop
            # waiting forever for a completion event that never arrives.
            await websocket.send_text(json.dumps({
                "type": "error",
                "error": event.get("error"),
            }))
            break


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time audio streaming.

    Client messages are JSON objects with a ``"type"`` field:

    * ``"utterance"`` — ``"audio"`` (base64) and optional ``"instructions"``;
      the audio is forwarded upstream and the response is streamed back.
    * ``"ping"`` — answered with ``{"type": "pong"}``.

    Malformed or unrecognised messages are ignored.
    """
    await websocket.accept()
    try:
        # Connect to OpenAI Realtime API
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        }
        # NOTE(review): newer ``websockets`` releases appear to name this
        # keyword ``additional_headers`` — confirm against the pinned version.
        async with websockets.connect(
            f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}",
            extra_headers=headers,
        ) as openai_ws:
            # Main loop: receive client messages
            while True:
                try:
                    text = await websocket.receive_text()
                except WebSocketDisconnect:
                    break

                try:
                    msg = json.loads(text)
                except ValueError:
                    continue
                if not isinstance(msg, dict):
                    # Valid JSON but not an object; nothing to dispatch on.
                    continue

                mtype = msg.get("type")
                if mtype == "utterance":
                    b64_wav = msg.get("audio", "")
                    instructions = msg.get("instructions", "")
                    if b64_wav:
                        await _process_utterance(
                            openai_ws, websocket, b64_wav, instructions
                        )
                elif mtype == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))
    except Exception as e:
        print(f"WebSocket error: {e}")
    finally:
        try:
            await websocket.close()
        except RuntimeError:
            # The connection may already be closed (e.g. the client
            # disconnected first); closing again is then rejected.
            pass


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)