"""
realtime_server.py - Optional WebSocket server for real-time audio streaming
Chạy riêng biệt: uvicorn realtime_server:app --host 0.0.0.0 --port 8000
"""
import os
import asyncio
import json
import base64
from typing import Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import websockets
import sys
sys.dont_write_bytecode = True
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
app = FastAPI()
# Simple HTML test page
html = """
Realtime Audio Test
Realtime Audio Test
Status: Ready
"""
@app.get("/")
async def get():
    """Serve the static test page used for manual smoke checks."""
    return HTMLResponse(content=html)
@app.post("/process-audio")
async def process_audio(request: dict):
    """Transcribe a base64-encoded WAV payload with OpenAI Whisper.

    Expects a JSON body {"audio": "<base64 wav>"} and returns
    {"success": True, "transcript": str} on success or
    {"success": False, "error": str} on any failure (boundary handler --
    errors are reported to the caller, never raised).
    """
    try:
        audio_data = base64.b64decode(request.get("audio", ""))
        # Whisper's file API wants a real file on disk, so spill to a temp path.
        import tempfile
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(audio_data)
            temp_path = f.name
        try:
            from openai import OpenAI  # third-party; imported lazily like the original
            client = OpenAI(api_key=OPENAI_API_KEY)
            with open(temp_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language="de"
                )
        finally:
            # The original unlinked only on the success path, leaking the temp
            # file whenever transcription raised; always clean up. (Also drops
            # the redundant local `import os` that shadowed the module import.)
            os.unlink(temp_path)
        return {"success": True, "transcript": transcript.text}
    except Exception as e:
        return {"success": False, "error": str(e)}
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Bridge a browser WebSocket to the OpenAI Realtime API.

    Client -> server JSON text frames:
      {"type": "utterance", "audio": "<base64 wav>", "instructions": "..."}
          -> streams one model response back
      {"type": "ping"} -> {"type": "pong"}

    Server -> client frames while streaming:
      {"type": "audio_delta", "data": ...}
      {"type": "transcript_delta", "text": ...}
      {"type": "response_completed", "transcript": ..., "audio": ...}  (terminal)
    """
    await websocket.accept()
    try:
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        }
        # NOTE(review): websockets >= 14 renamed `extra_headers` to
        # `additional_headers`; pin the library version or adapt here.
        async with websockets.connect(
            f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}",
            extra_headers=headers
        ) as openai_ws:

            async def process_utterance(b64_wav: str, instructions: Optional[str] = None) -> None:
                """Send one utterance upstream and relay the streamed reply."""
                # Per the Realtime API, `audio` is the base64 payload itself;
                # the original wrapped it in {"data": ..., "format": "wav"},
                # which the API does not accept. NOTE(review): the buffer
                # expects base64 PCM16 audio -- confirm the client encoding.
                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": b64_wav
                }))
                # Commit the buffered audio as one user turn.
                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.commit"
                }))
                # Ask for a spoken + textual response.
                await openai_ws.send(json.dumps({
                    "type": "response.create",
                    "response": {
                        "modalities": ["audio", "text"],
                        "instructions": instructions or ""
                    }
                }))
                audio_chunks = []
                transcript = ""
                # Drain the upstream event stream until the terminal event.
                while True:
                    msg = await openai_ws.recv()
                    try:
                        event = json.loads(msg)
                    except ValueError:  # was a bare except; only JSON errors are expected
                        continue
                    etype = event.get("type")
                    if etype == "response.audio.delta":
                        data = event.get("delta") or event.get("data")
                        if data:
                            audio_chunks.append(data)
                            await websocket.send_text(json.dumps({
                                "type": "audio_delta",
                                "data": data
                            }))
                    elif etype in ("response.transcript.delta", "response.audio_transcript.delta"):
                        # The documented event is response.audio_transcript.delta;
                        # the original name is kept for backward compatibility.
                        delta = event.get("delta", "")
                        transcript += delta
                        await websocket.send_text(json.dumps({
                            "type": "transcript_delta",
                            "text": delta
                        }))
                    elif etype in ("response.completed", "response.done"):
                        # response.done is the documented terminal event; the
                        # original only matched response.completed and could
                        # hang here forever -- accept both.
                        await websocket.send_text(json.dumps({
                            "type": "response_completed",
                            "transcript": transcript,
                            "audio": "".join(audio_chunks)
                        }))
                        break

            # Main loop: relay client messages until the browser disconnects.
            while True:
                try:
                    text = await websocket.receive_text()
                except WebSocketDisconnect:
                    break
                try:
                    msg = json.loads(text)
                except ValueError:  # was a bare except; skip malformed frames
                    continue
                mtype = msg.get("type")
                if mtype == "utterance":
                    b64_wav = msg.get("audio", "")
                    instructions = msg.get("instructions", "")
                    if b64_wav:
                        await process_utterance(b64_wav, instructions)
                elif mtype == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))
    except Exception as e:
        # Top-level boundary: report and fall through to cleanup.
        print(f"WebSocket error: {e}")
    finally:
        try:
            await websocket.close()
        except Exception:
            # close() raises if the socket already closed (e.g. after a client
            # disconnect); cleanup is best-effort.
            pass
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)