|
|
""" |
|
|
realtime_server.py - Optional WebSocket server for real-time audio streaming |
|
|
Chạy riêng biệt: uvicorn realtime_server:app --host 0.0.0.0 --port 8000 |
|
|
""" |
|
|
|
|
|
import os |
|
|
import asyncio |
|
|
import json |
|
|
import base64 |
|
|
from typing import Optional |
|
|
from fastapi import FastAPI, WebSocket, WebSocketDisconnect |
|
|
from fastapi.responses import HTMLResponse |
|
|
import websockets |
|
|
import sys |
|
|
# Avoid writing .pyc files; keeps the working tree clean when this
# server is run directly during development.
sys.dont_write_bytecode = True

# OpenAI credentials and realtime model name come from the environment.
# An empty OPENAI_API_KEY fails at request time, not at import time.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")

# FastAPI application instance served by uvicorn (see module docstring).
app = FastAPI()
|
|
|
|
|
|
|
|
html = """ |
|
|
<!DOCTYPE html> |
|
|
<html> |
|
|
<head> |
|
|
<title>Realtime Audio Test</title> |
|
|
</head> |
|
|
<body> |
|
|
<h1>Realtime Audio Test</h1> |
|
|
<button id="startBtn">Start Recording</button> |
|
|
<button id="stopBtn" disabled>Stop Recording</button> |
|
|
<div id="status">Status: Ready</div> |
|
|
<div><label>Instructions: <input id="instructions" placeholder="Optional prompt" /></label></div> |
|
|
<div id="transcript"></div> |
|
|
<audio id="player" controls></audio> |
|
|
|
|
|
<script> |
|
|
let mediaRecorder; |
|
|
let audioChunks = []; |
|
|
let ws; |
|
|
|
|
|
function ensureWS() { |
|
|
if (ws && ws.readyState === WebSocket.OPEN) return ws; |
|
|
const basePath = (location.pathname.endsWith('/') ? location.pathname.slice(0,-1) : location.pathname); |
|
|
ws = new WebSocket((location.protocol === 'https:' ? 'wss://' : 'ws://') + location.host + basePath + '/ws'); |
|
|
ws.onopen = () => { |
|
|
document.getElementById('status').textContent = 'Status: WS connected'; |
|
|
}; |
|
|
ws.onmessage = (event) => { |
|
|
try { |
|
|
const msg = JSON.parse(event.data); |
|
|
if (msg.type === 'transcript_delta') { |
|
|
const el = document.getElementById('transcript'); |
|
|
el.innerHTML = `<strong>Transcript:</strong> ${el.textContent}${msg.text}`; |
|
|
} else if (msg.type === 'response_completed') { |
|
|
if (msg.audio) { |
|
|
const b64 = msg.audio; |
|
|
const audioBlob = base64ToWavBlob(b64); |
|
|
const url = URL.createObjectURL(audioBlob); |
|
|
const player = document.getElementById('player'); |
|
|
player.src = url; |
|
|
player.play(); |
|
|
} |
|
|
document.getElementById('status').textContent = 'Status: Completed'; |
|
|
} |
|
|
} catch {} |
|
|
}; |
|
|
ws.onclose = () => { |
|
|
document.getElementById('status').textContent = 'Status: WS closed'; |
|
|
}; |
|
|
return ws; |
|
|
} |
|
|
|
|
|
function base64ToWavBlob(base64) { |
|
|
const byteCharacters = atob(base64); |
|
|
const byteNumbers = new Array(byteCharacters.length); |
|
|
for (let i = 0; i < byteCharacters.length; i++) { |
|
|
byteNumbers[i] = byteCharacters.charCodeAt(i); |
|
|
} |
|
|
const byteArray = new Uint8Array(byteNumbers); |
|
|
return new Blob([byteArray], { type: 'audio/wav' }); |
|
|
} |
|
|
|
|
|
document.getElementById('startBtn').onclick = async () => { |
|
|
ensureWS(); |
|
|
const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); |
|
|
mediaRecorder = new MediaRecorder(stream); |
|
|
|
|
|
audioChunks = []; |
|
|
document.getElementById('transcript').textContent = ''; |
|
|
|
|
|
mediaRecorder.ondataavailable = (event) => { |
|
|
audioChunks.push(event.data); |
|
|
}; |
|
|
|
|
|
mediaRecorder.onstop = async () => { |
|
|
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' }); |
|
|
audioChunks = []; |
|
|
const reader = new FileReader(); |
|
|
reader.readAsDataURL(audioBlob); |
|
|
reader.onloadend = () => { |
|
|
const base64data = reader.result.split(',')[1]; |
|
|
const instructions = document.getElementById('instructions').value || ''; |
|
|
ws.send(JSON.stringify({ type: 'utterance', audio: base64data, instructions })); |
|
|
document.getElementById('status').textContent = 'Status: Sending to OpenAI...'; |
|
|
}; |
|
|
}; |
|
|
|
|
|
mediaRecorder.start(); |
|
|
document.getElementById('startBtn').disabled = true; |
|
|
document.getElementById('stopBtn').disabled = false; |
|
|
document.getElementById('status').textContent = 'Status: Recording...'; |
|
|
}; |
|
|
|
|
|
document.getElementById('stopBtn').onclick = () => { |
|
|
mediaRecorder.stop(); |
|
|
document.getElementById('startBtn').disabled = false; |
|
|
document.getElementById('stopBtn').disabled = true; |
|
|
}; |
|
|
</script> |
|
|
</body> |
|
|
</html> |
|
|
""" |
|
|
|
|
|
@app.get("/") |
|
|
async def get(): |
|
|
return HTMLResponse(html) |
|
|
|
|
|
@app.post("/process-audio") |
|
|
async def process_audio(request: dict): |
|
|
"""Process audio from frontend""" |
|
|
try: |
|
|
audio_data = base64.b64decode(request.get("audio", "")) |
|
|
|
|
|
|
|
|
import tempfile |
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: |
|
|
f.write(audio_data) |
|
|
temp_path = f.name |
|
|
|
|
|
|
|
|
from openai import OpenAI |
|
|
client = OpenAI(api_key=OPENAI_API_KEY) |
|
|
|
|
|
with open(temp_path, "rb") as audio_file: |
|
|
transcript = client.audio.transcriptions.create( |
|
|
model="whisper-1", |
|
|
file=audio_file, |
|
|
language="de" |
|
|
) |
|
|
|
|
|
|
|
|
import os |
|
|
os.unlink(temp_path) |
|
|
|
|
|
return {"success": True, "transcript": transcript.text} |
|
|
|
|
|
except Exception as e: |
|
|
return {"success": False, "error": str(e)} |
|
|
|
|
|
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Bridge a browser WebSocket to the OpenAI Realtime API.

    Client -> server messages:
      {"type": "utterance", "audio": <base64 wav>, "instructions": <str>}
      {"type": "ping"}

    Server -> client messages:
      {"type": "audio_delta", "data": <base64 chunk>}
      {"type": "transcript_delta", "text": <str>}
      {"type": "response_completed", "transcript": <str>, "audio": <base64>}
      {"type": "pong"}
    """
    await websocket.accept()

    try:
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        }

        # One upstream realtime connection lives for the duration of the
        # client socket.  NOTE(review): `extra_headers` was renamed to
        # `additional_headers` in websockets >= 14 — confirm the pinned
        # library version.
        async with websockets.connect(
            f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}",
            extra_headers=headers
        ) as openai_ws:

            async def process_utterance(b64_wav: str, instructions: Optional[str] = None):
                """Send one utterance upstream, then relay audio/transcript
                deltas back to the browser until the response completes."""
                # NOTE(review): the Realtime API documents `audio` as a plain
                # base64 string (pcm16); the dict payload below may be
                # rejected upstream — verify against the current API spec.
                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": {"data": b64_wav, "format": "wav"}
                }))

                await openai_ws.send(json.dumps({
                    "type": "input_audio_buffer.commit"
                }))

                await openai_ws.send(json.dumps({
                    "type": "response.create",
                    "response": {
                        "modalities": ["audio", "text"],
                        "instructions": instructions or ""
                    }
                }))

                audio_chunks = []
                transcript = ""

                # Drain upstream events for this single response.
                while True:
                    msg = await openai_ws.recv()
                    try:
                        event = json.loads(msg)
                    except json.JSONDecodeError:
                        # Ignore non-JSON frames rather than killing the bridge.
                        continue

                    etype = event.get("type")
                    if etype == "response.audio.delta":
                        # Field name has varied across API revisions; accept both.
                        data = event.get("delta") or event.get("data")
                        if data:
                            audio_chunks.append(data)
                            await websocket.send_text(json.dumps({
                                "type": "audio_delta",
                                "data": data
                            }))
                    elif etype == "response.transcript.delta":
                        delta = event.get("delta", "")
                        transcript += delta
                        await websocket.send_text(json.dumps({
                            "type": "transcript_delta",
                            "text": delta
                        }))
                    elif etype == "response.completed":
                        # Final frame carries the full transcript and the
                        # concatenated base64 audio chunks.
                        await websocket.send_text(json.dumps({
                            "type": "response_completed",
                            "transcript": transcript,
                            "audio": "".join(audio_chunks)
                        }))
                        break

            # Main client loop: one utterance request at a time.
            while True:
                try:
                    text = await websocket.receive_text()
                except WebSocketDisconnect:
                    break
                try:
                    msg = json.loads(text)
                except json.JSONDecodeError:
                    continue

                mtype = msg.get("type")
                if mtype == "utterance":
                    b64_wav = msg.get("audio", "")
                    instructions = msg.get("instructions", "")
                    if b64_wav:
                        await process_utterance(b64_wav, instructions)
                elif mtype == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

    except Exception as e:
        print(f"WebSocket error: {e}")
    finally:
        # The client may already have disconnected; closing an already
        # closed socket raises, so swallow that best-effort failure.
        try:
            await websocket.close()
        except Exception:
            pass
|
|
|
|
|
if __name__ == "__main__": |
|
|
import uvicorn |
|
|
uvicorn.run(app, host="0.0.0.0", port=8000) |
|
|
|