# Source: chatbot1 / realtime_server.py (Nguyen5, commit 54f3783)
"""
realtime_server.py - Optional WebSocket server for real-time audio streaming
Run separately: uvicorn realtime_server:app --host 0.0.0.0 --port 8000
"""
import os
import asyncio
import json
import base64
from typing import Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import websockets
import sys
sys.dont_write_bytecode = True
# Runtime configuration, read once from the process environment at import time.
# The API key falls back to an empty string and the model name to the preview
# realtime model when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_REALTIME_MODEL = os.environ.get(
    "OPENAI_REALTIME_MODEL",
    "gpt-4o-realtime-preview",
)
# Application object; the route decorators in this module register their
# handlers on it, and it is what the `uvicorn realtime_server:app` command loads.
app = FastAPI()
# Minimal browser test page: records from the microphone, sends the clip over
# the /ws WebSocket as an "utterance" message, shows streamed transcript deltas
# and plays the audio delivered with "response_completed".
#
# The transcript is kept in its own accumulator and rendered through
# textContent (never innerHTML), so server-provided text is not parsed as HTML
# and the "Transcript:" label is not repeated on every delta.
# Microphone tracks are stopped when recording ends, and the utterance is only
# sent while the socket is open.
#
# NOTE(review): the recorded Blob is only *labelled* 'audio/wav'; the container
# MediaRecorder actually produces is browser-dependent — confirm what the
# server side expects before relying on this page.
html = """
<!DOCTYPE html>
<html>
<head>
<title>Realtime Audio Test</title>
</head>
<body>
<h1>Realtime Audio Test</h1>
<button id="startBtn">Start Recording</button>
<button id="stopBtn" disabled>Stop Recording</button>
<div id="status">Status: Ready</div>
<div><label>Instructions: <input id="instructions" placeholder="Optional prompt" /></label></div>
<div id="transcript"></div>
<audio id="player" controls></audio>
<script>
let mediaRecorder;
let audioChunks = [];
let ws;
let transcriptText = '';
function ensureWS() {
if (ws && ws.readyState === WebSocket.OPEN) return ws;
const basePath = (location.pathname.endsWith('/') ? location.pathname.slice(0,-1) : location.pathname);
ws = new WebSocket((location.protocol === 'https:' ? 'wss://' : 'ws://') + location.host + basePath + '/ws');
ws.onopen = () => {
document.getElementById('status').textContent = 'Status: WS connected';
};
ws.onmessage = (event) => {
try {
const msg = JSON.parse(event.data);
if (msg.type === 'transcript_delta') {
transcriptText += msg.text || '';
const el = document.getElementById('transcript');
const label = document.createElement('strong');
label.textContent = 'Transcript:';
el.textContent = '';
el.append(label, ' ' + transcriptText);
} else if (msg.type === 'response_completed') {
if (msg.audio) {
const b64 = msg.audio;
const audioBlob = base64ToWavBlob(b64);
const url = URL.createObjectURL(audioBlob);
const player = document.getElementById('player');
player.src = url;
player.play();
}
document.getElementById('status').textContent = 'Status: Completed';
}
} catch {}
};
ws.onclose = () => {
document.getElementById('status').textContent = 'Status: WS closed';
};
return ws;
}
function base64ToWavBlob(base64) {
const byteCharacters = atob(base64);
const byteNumbers = new Array(byteCharacters.length);
for (let i = 0; i < byteCharacters.length; i++) {
byteNumbers[i] = byteCharacters.charCodeAt(i);
}
const byteArray = new Uint8Array(byteNumbers);
return new Blob([byteArray], { type: 'audio/wav' });
}
document.getElementById('startBtn').onclick = async () => {
ensureWS();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
audioChunks = [];
transcriptText = '';
document.getElementById('transcript').textContent = '';
mediaRecorder.ondataavailable = (event) => {
audioChunks.push(event.data);
};
mediaRecorder.onstop = async () => {
stream.getTracks().forEach((track) => track.stop());
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
audioChunks = [];
const reader = new FileReader();
reader.readAsDataURL(audioBlob);
reader.onloadend = () => {
const base64data = reader.result.split(',')[1];
const instructions = document.getElementById('instructions').value || '';
if (!ws || ws.readyState !== WebSocket.OPEN) {
document.getElementById('status').textContent = 'Status: WS not connected';
return;
}
ws.send(JSON.stringify({ type: 'utterance', audio: base64data, instructions }));
document.getElementById('status').textContent = 'Status: Sending to OpenAI...';
};
};
mediaRecorder.start();
document.getElementById('startBtn').disabled = true;
document.getElementById('stopBtn').disabled = false;
document.getElementById('status').textContent = 'Status: Recording...';
};
document.getElementById('stopBtn').onclick = () => {
mediaRecorder.stop();
document.getElementById('startBtn').disabled = false;
document.getElementById('stopBtn').disabled = true;
};
</script>
</body>
</html>
"""
@app.get("/")
async def get():
    """Serve the built-in HTML test page at the site root."""
    page = HTMLResponse(content=html)
    return page
@app.post("/process-audio")
async def process_audio(request: dict):
    """Transcribe a base64-encoded audio clip posted by the frontend.

    Args:
        request: Parsed JSON body; the base64 audio payload is read from its
            ``"audio"`` key.

    Returns:
        ``{"success": True, "transcript": <text>}`` when transcription
        succeeds, otherwise ``{"success": False, "error": <message>}``.
        Failures are reported in the body; no exception is propagated.
    """
    import tempfile

    temp_path = None
    try:
        audio_data = base64.b64decode(request.get("audio") or "")
        if not audio_data:
            # Nothing to transcribe: answer directly instead of uploading an
            # empty file.
            return {"success": False, "error": "No audio data provided"}

        # The transcription client reads from a real file, so spool the clip
        # to disk first. delete=False because the file is reopened below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(audio_data)
            temp_path = f.name

        def _transcribe() -> str:
            """Run the blocking OpenAI transcription call and return its text."""
            from openai import OpenAI

            client = OpenAI(api_key=OPENAI_API_KEY)
            with open(temp_path, "rb") as audio_file:
                result = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language="de",
                )
            return result.text

        # The OpenAI client used here is synchronous; run it in the default
        # executor so the event loop keeps serving other requests meanwhile.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _transcribe)
        return {"success": True, "transcript": text}
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # Always remove the temp file, including when transcription failed.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Bridge a browser WebSocket to the OpenAI Realtime API.

    Client -> server messages (JSON text frames):
      * ``{"type": "utterance", "audio": <base64>, "instructions": <str>}``
      * ``{"type": "ping"}``, answered with ``{"type": "pong"}``

    Server -> client messages: ``audio_delta``, ``transcript_delta``,
    ``response_completed`` (full transcript plus merged base64 audio) and
    ``error`` (an upstream error message; the session is closed afterwards).

    Args:
        websocket: The client connection handed in by FastAPI.
    """
    # Upstream event names. The names the original code matched are kept and
    # the names documented for the Realtime API (beta, realtime=v1) are added,
    # so the read loop terminates on whichever the server actually emits.
    audio_delta_events = {"response.audio.delta"}
    transcript_delta_events = {
        "response.transcript.delta",
        "response.audio_transcript.delta",
        "response.text.delta",
    }
    completed_events = {"response.completed", "response.done"}

    await websocket.accept()
    try:
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        }
        # NOTE(review): `extra_headers` is the keyword of the legacy websockets
        # client; newer websockets releases call it `additional_headers` —
        # confirm against the installed version.
        async with websockets.connect(
            f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}",
            extra_headers=headers,
        ) as openai_ws:

            async def process_utterance(b64_wav: str, instructions: Optional[str] = None) -> bool:
                """Send one utterance upstream and relay the streamed reply.

                Returns:
                    True when the response finished, False when the upstream
                    API reported an error (already forwarded to the client).
                """
                # The API documents `audio` as a plain base64 string in the
                # session's input audio format.
                # NOTE(review): that format is presumably raw PCM16 by default,
                # not a WAV/WebM container — confirm what the client sends.
                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": b64_wav,
                }))
                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.commit",
                }))

                # Only send instructions when the caller supplied some, so an
                # empty string does not replace the session's own instructions.
                response_config = {"modalities": ["audio", "text"]}
                if instructions:
                    response_config["instructions"] = instructions
                await openai_ws.send(json.dumps({
                    "type": "response.create",
                    "response": response_config,
                }))

                audio_chunks = []
                transcript_parts = []
                while True:
                    raw = await openai_ws.recv()
                    try:
                        event = json.loads(raw)
                    except ValueError:
                        continue
                    if not isinstance(event, dict):
                        continue

                    etype = event.get("type")
                    if etype in audio_delta_events:
                        data = event.get("delta") or event.get("data")
                        if data:
                            audio_chunks.append(data)
                            await websocket.send_text(json.dumps({
                                "type": "audio_delta",
                                "data": data,
                            }))
                    elif etype in transcript_delta_events:
                        delta = event.get("delta") or ""
                        transcript_parts.append(delta)
                        await websocket.send_text(json.dumps({
                            "type": "transcript_delta",
                            "text": delta,
                        }))
                    elif etype in completed_events:
                        # Each delta is base64-encoded on its own and may carry
                        # padding, so joining the strings can yield invalid
                        # base64. Merge the decoded bytes and encode once.
                        merged_audio = base64.b64encode(
                            b"".join(base64.b64decode(chunk) for chunk in audio_chunks)
                        ).decode("ascii")
                        await websocket.send_text(json.dumps({
                            "type": "response_completed",
                            "transcript": "".join(transcript_parts),
                            "audio": merged_audio,
                        }))
                        return True
                    elif etype == "error":
                        # Without this branch a rejected request never produces
                        # a completion event and the loop would wait forever.
                        err = event.get("error")
                        if isinstance(err, dict):
                            message = err.get("message") or "Unknown error"
                        else:
                            message = str(err) if err else "Unknown error"
                        await websocket.send_text(json.dumps({
                            "type": "error",
                            "message": message,
                        }))
                        return False

            # Main loop: receive client messages until the client disconnects.
            while True:
                try:
                    text = await websocket.receive_text()
                except WebSocketDisconnect:
                    break
                try:
                    msg = json.loads(text)
                except ValueError:
                    continue
                if not isinstance(msg, dict):
                    continue

                mtype = msg.get("type")
                if mtype == "utterance":
                    b64_wav = msg.get("audio", "")
                    instructions = msg.get("instructions", "")
                    if b64_wav:
                        completed = await process_utterance(b64_wav, instructions)
                        if not completed:
                            # End the session after an upstream error so that
                            # leftover events are not attributed to a later
                            # utterance.
                            break
                elif mtype == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))
    except Exception as e:
        print(f"WebSocket error: {e}")
    finally:
        try:
            await websocket.close()
        except RuntimeError:
            # The connection is already closed (e.g. the client disconnected).
            pass
if __name__ == "__main__":
    # Script entry point: start the server on all interfaces, port 8000.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)