avatar-integrador / server.py
marcosremar2's picture
Initial commit: Avatar integrator server with performance statistics
53b54ed
"""
Servidor Integrador - Avatar com TTS
Porta: 8080
Conecta:
- Orpheus TTS: localhost:8880
- Wav2Lip: localhost:8085
"""
import asyncio
import json
import os
from aiohttp import web, ClientSession
import aiohttp
# TCP port this integrator server listens on (passed to web.run_app).
PORT = 8080
# Base URL of the Wav2Lip service: source of the MJPEG stream, target of the
# upstream WebSocket, and one of the health-check targets.
WAV2LIP_URL = "http://localhost:8085"
# Base URL of the TTS service. In the code visible here it is only probed by
# the health check and printed in the startup banner.
TTS_URL = "http://localhost:8880"
# Single-page web UI returned verbatim by the index handler.
#
# The embedded script:
# - opens a WebSocket to this server at /ws/avatar (ws:// or wss://, matching
#   the scheme the page was loaded with) and sends {"action": "speak", ...};
# - reacts to JSON messages carrying "status" (speaking / idle / error),
#   "audio" (base64 WAV) and optional timing fields, and keeps the last
#   MAX_HISTORY entries for the statistics panel;
# - loads the avatar video straight from port 8085 on the page's hostname
#   (the Wav2Lip MJPEG endpoint), not through this server's /mjpeg proxy.
#
# NOTE: this is a plain (non-raw) string literal, so do not introduce
# backslashes in the markup/script without escaping them.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="pt-BR">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Avatar Integrado</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
padding: 20px;
color: #fff;
}
h1 {
margin-bottom: 20px;
font-weight: 300;
font-size: 1.8rem;
}
.container {
display: flex;
gap: 20px;
flex-wrap: wrap;
justify-content: center;
max-width: 1200px;
}
.video-container {
background: #000;
border-radius: 12px;
overflow: hidden;
box-shadow: 0 10px 40px rgba(0,0,0,0.5);
}
#avatar-video {
width: 512px;
height: 512px;
object-fit: cover;
}
.controls {
background: rgba(255,255,255,0.1);
backdrop-filter: blur(10px);
border-radius: 12px;
padding: 20px;
width: 350px;
}
.control-group {
margin-bottom: 15px;
}
label {
display: block;
margin-bottom: 5px;
font-size: 0.9rem;
color: #aaa;
}
input, select, textarea {
width: 100%;
padding: 10px;
border: none;
border-radius: 8px;
background: rgba(255,255,255,0.1);
color: #fff;
font-size: 1rem;
}
textarea {
min-height: 100px;
resize: vertical;
}
button {
width: 100%;
padding: 12px;
border: none;
border-radius: 8px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: #fff;
font-size: 1rem;
cursor: pointer;
transition: transform 0.2s, box-shadow 0.2s;
}
button:hover {
transform: translateY(-2px);
box-shadow: 0 5px 20px rgba(102, 126, 234, 0.4);
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
transform: none;
}
.status {
margin-top: 15px;
padding: 10px;
border-radius: 8px;
background: rgba(0,0,0,0.3);
font-size: 0.85rem;
}
.status.connected { border-left: 3px solid #4caf50; }
.status.disconnected { border-left: 3px solid #f44336; }
.status.speaking { border-left: 3px solid #2196f3; }
#audio-player { display: none; }
/* Statistics Panel */
.stats-panel {
background: rgba(255,255,255,0.1);
backdrop-filter: blur(10px);
border-radius: 12px;
padding: 20px;
width: 350px;
max-height: 500px;
overflow-y: auto;
}
.stats-panel h2 {
font-size: 1.1rem;
margin-bottom: 15px;
color: #aaa;
font-weight: 400;
}
.stat-row {
display: flex;
justify-content: space-between;
padding: 8px 0;
border-bottom: 1px solid rgba(255,255,255,0.1);
}
.stat-label {
color: #aaa;
font-size: 0.85rem;
}
.stat-value {
font-weight: 500;
font-size: 0.9rem;
}
.stat-value.fast { color: #4caf50; }
.stat-value.medium { color: #ff9800; }
.stat-value.slow { color: #f44336; }
.stats-history {
margin-top: 15px;
}
.stats-history h3 {
font-size: 0.95rem;
margin-bottom: 10px;
color: #888;
}
.history-item {
background: rgba(0,0,0,0.3);
border-radius: 8px;
padding: 10px;
margin-bottom: 8px;
font-size: 0.8rem;
}
.history-item .timestamp {
color: #666;
font-size: 0.75rem;
}
.history-item .metrics {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 5px;
margin-top: 5px;
}
.avg-stats {
background: rgba(102, 126, 234, 0.2);
border-radius: 8px;
padding: 10px;
margin-bottom: 15px;
}
.avg-stats h3 {
font-size: 0.9rem;
margin-bottom: 8px;
color: #667eea;
}
</style>
</head>
<body>
<h1>Avatar Integrado</h1>
<div class="container">
<div class="video-container">
<img id="avatar-video" alt="Avatar">
</div>
<div class="controls">
<div class="control-group">
<label>Voz</label>
<select id="voice-select">
<option value="tara">Tara (Feminina)</option>
<option value="leah">Leah (Feminina)</option>
<option value="jess">Jess (Feminina)</option>
<option value="mia">Mia (Feminina)</option>
<option value="leo">Leo (Masculina)</option>
<option value="dan">Dan (Masculina)</option>
<option value="zac">Zac (Masculina)</option>
<option value="zoe">Zoe (Feminina)</option>
</select>
</div>
<div class="control-group">
<label>Texto para falar</label>
<textarea id="text-input" placeholder="Enter the text here...">Hello! I am a digital avatar with real-time voice synthesis.</textarea>
</div>
<button id="speak-btn" onclick="speak()">Falar</button>
<div id="status" class="status disconnected">
Conectando...
</div>
</div>
<!-- Statistics Panel -->
<div class="stats-panel">
<h2>Performance Statistics</h2>
<div class="avg-stats">
<h3>Averages (last 10)</h3>
<div class="stat-row">
<span class="stat-label">Avg Round-Trip</span>
<span id="avg-roundtrip" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">Avg TTS Time</span>
<span id="avg-tts" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">Avg Wav2Lip Time</span>
<span id="avg-wav2lip" class="stat-value">--</span>
</div>
</div>
<div class="stat-row">
<span class="stat-label">Last Round-Trip</span>
<span id="last-roundtrip" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">Last TTS Time</span>
<span id="last-tts" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">Last Wav2Lip Time</span>
<span id="last-wav2lip" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">First Frame</span>
<span id="last-firstframe" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">Audio Duration</span>
<span id="last-audioduration" class="stat-value">--</span>
</div>
<div class="stat-row">
<span class="stat-label">Text Length</span>
<span id="last-textlen" class="stat-value">--</span>
</div>
<div class="stats-history">
<h3>Request History</h3>
<div id="history-container"></div>
</div>
</div>
</div>
<audio id="audio-player"></audio>
<script>
const statusEl = document.getElementById('status');
const speakBtn = document.getElementById('speak-btn');
const audioPlayer = document.getElementById('audio-player');
let ws = null;
let isConnected = false;
// Statistics tracking
let requestStartTime = null;
let statsHistory = [];
const MAX_HISTORY = 10;
// Conectar ao WebSocket do Wav2Lip
function connectWebSocket() {
// Match the page scheme: browsers block ws:// from an https:// page (mixed content)
const wsScheme = window.location.protocol === 'https:' ? 'wss://' : 'ws://';
ws = new WebSocket(wsScheme + window.location.host + '/ws/avatar');
ws.onopen = () => {
isConnected = true;
statusEl.textContent = 'Conectado ao servidor';
statusEl.className = 'status connected';
};
ws.onmessage = (event) => {
// The server also relays binary frames; those are not JSON, so skip them
if (typeof event.data !== 'string') return;
const data = JSON.parse(event.data);
if (data.status === 'speaking') {
statusEl.textContent = 'Falando...';
statusEl.className = 'status speaking';
speakBtn.disabled = true;
} else if (data.status === 'idle') {
statusEl.textContent = 'Pronto';
statusEl.className = 'status connected';
speakBtn.disabled = false;
// Calculate round-trip time when idle
if (requestStartTime) {
const roundTrip = (performance.now() - requestStartTime) / 1000;
updateStats({roundTrip});
requestStartTime = null;
}
} else if (data.status === 'error') {
statusEl.textContent = 'Erro: ' + data.message;
statusEl.className = 'status disconnected';
speakBtn.disabled = false;
requestStartTime = null;
} else if (data.audio) {
// Recebeu áudio para reproduzir
const audioBlob = base64ToBlob(data.audio, 'audio/wav');
audioPlayer.src = URL.createObjectURL(audioBlob);
audioPlayer.play();
}
// Process timing stats from server
if (data.stats) {
updateStats(data.stats);
}
// Also check for individual timing fields
if (data.tts_time !== undefined || data.wav2lip_time !== undefined || data.first_frame_time !== undefined) {
updateStats({
tts_time: data.tts_time,
wav2lip_time: data.wav2lip_time,
first_frame_time: data.first_frame_time,
audio_duration: data.audio_duration,
text_length: data.text_length
});
}
};
ws.onclose = () => {
isConnected = false;
statusEl.textContent = 'Desconectado. Reconectando...';
statusEl.className = 'status disconnected';
setTimeout(connectWebSocket, 2000);
};
ws.onerror = (error) => {
console.error('WebSocket error:', error);
};
}
function base64ToBlob(base64, mimeType) {
const byteCharacters = atob(base64);
const byteNumbers = new Array(byteCharacters.length);
for (let i = 0; i < byteCharacters.length; i++) {
byteNumbers[i] = byteCharacters.charCodeAt(i);
}
const byteArray = new Uint8Array(byteNumbers);
return new Blob([byteArray], { type: mimeType });
}
async function speak() {
const text = document.getElementById('text-input').value.trim();
const voice = document.getElementById('voice-select').value;
if (!text) {
alert('Digite um texto para falar');
return;
}
if (!isConnected) {
alert('Não conectado ao servidor');
return;
}
speakBtn.disabled = true;
statusEl.textContent = 'Gerando áudio...';
statusEl.className = 'status speaking';
// Record start time for round-trip measurement
requestStartTime = performance.now();
// Enviar comando para falar
ws.send(JSON.stringify({
action: 'speak',
text: text,
voice: voice,
text_length: text.length
}));
}
// Statistics functions
function formatTime(seconds) {
if (seconds === undefined || seconds === null) return '--';
return seconds.toFixed(2) + 's';
}
function getSpeedClass(seconds, thresholds) {
if (seconds === undefined || seconds === null) return '';
if (seconds <= thresholds.fast) return 'fast';
if (seconds <= thresholds.medium) return 'medium';
return 'slow';
}
function updateStats(newStats) {
const now = new Date();
const entry = {
timestamp: now,
...newStats
};
// Update last stats display
if (newStats.roundTrip !== undefined) {
const el = document.getElementById('last-roundtrip');
el.textContent = formatTime(newStats.roundTrip);
el.className = 'stat-value ' + getSpeedClass(newStats.roundTrip, {fast: 3, medium: 6});
}
if (newStats.tts_time !== undefined) {
const el = document.getElementById('last-tts');
el.textContent = formatTime(newStats.tts_time);
el.className = 'stat-value ' + getSpeedClass(newStats.tts_time, {fast: 2, medium: 4});
}
if (newStats.wav2lip_time !== undefined) {
const el = document.getElementById('last-wav2lip');
el.textContent = formatTime(newStats.wav2lip_time);
el.className = 'stat-value ' + getSpeedClass(newStats.wav2lip_time, {fast: 1, medium: 2});
}
if (newStats.first_frame_time !== undefined) {
const el = document.getElementById('last-firstframe');
el.textContent = formatTime(newStats.first_frame_time);
el.className = 'stat-value ' + getSpeedClass(newStats.first_frame_time, {fast: 3, medium: 5});
}
if (newStats.audio_duration !== undefined) {
document.getElementById('last-audioduration').textContent = formatTime(newStats.audio_duration);
}
if (newStats.text_length !== undefined) {
document.getElementById('last-textlen').textContent = newStats.text_length + ' chars';
}
// Only add to history if we have timing data
if (newStats.tts_time !== undefined || newStats.roundTrip !== undefined) {
statsHistory.unshift(entry);
if (statsHistory.length > MAX_HISTORY) {
statsHistory.pop();
}
updateAverages();
updateHistory();
}
}
function updateAverages() {
const validRoundTrips = statsHistory.filter(s => s.roundTrip !== undefined).map(s => s.roundTrip);
const validTts = statsHistory.filter(s => s.tts_time !== undefined).map(s => s.tts_time);
const validWav2lip = statsHistory.filter(s => s.wav2lip_time !== undefined).map(s => s.wav2lip_time);
if (validRoundTrips.length > 0) {
const avg = validRoundTrips.reduce((a, b) => a + b, 0) / validRoundTrips.length;
const el = document.getElementById('avg-roundtrip');
el.textContent = formatTime(avg);
el.className = 'stat-value ' + getSpeedClass(avg, {fast: 3, medium: 6});
}
if (validTts.length > 0) {
const avg = validTts.reduce((a, b) => a + b, 0) / validTts.length;
const el = document.getElementById('avg-tts');
el.textContent = formatTime(avg);
el.className = 'stat-value ' + getSpeedClass(avg, {fast: 2, medium: 4});
}
if (validWav2lip.length > 0) {
const avg = validWav2lip.reduce((a, b) => a + b, 0) / validWav2lip.length;
const el = document.getElementById('avg-wav2lip');
el.textContent = formatTime(avg);
el.className = 'stat-value ' + getSpeedClass(avg, {fast: 1, medium: 2});
}
}
function updateHistory() {
const container = document.getElementById('history-container');
container.innerHTML = statsHistory.map((entry, idx) => {
const time = entry.timestamp.toLocaleTimeString();
return `
<div class="history-item">
<div class="timestamp">#${idx + 1} - ${time}</div>
<div class="metrics">
${entry.roundTrip !== undefined ? `<span>Round-trip: ${formatTime(entry.roundTrip)}</span>` : ''}
${entry.tts_time !== undefined ? `<span>TTS: ${formatTime(entry.tts_time)}</span>` : ''}
${entry.wav2lip_time !== undefined ? `<span>Wav2Lip: ${formatTime(entry.wav2lip_time)}</span>` : ''}
${entry.first_frame_time !== undefined ? `<span>1st Frame: ${formatTime(entry.first_frame_time)}</span>` : ''}
</div>
</div>
`;
}).join('');
}
// Iniciar conexão
connectWebSocket();
// Set MJPEG source based on current host (uses port 8085 for Wav2Lip)
const avatarImg = document.getElementById('avatar-video');
avatarImg.src = 'http://' + window.location.hostname + ':8085/mjpeg';
</script>
</body>
</html>
"""
async def index(request):
    """Return the embedded single-page UI as an HTML response."""
    page = web.Response(
        text=HTML_TEMPLATE,
        content_type='text/html',
    )
    return page
async def proxy_mjpeg(request):
    """Relay the Wav2Lip MJPEG stream to the caller.

    Opens ``{WAV2LIP_URL}/mjpeg`` and copies the body to the client chunk by
    chunk for as long as the upstream keeps sending.

    Returns:
        The streaming response once the upstream ends (or fails after
        streaming has begun), or a 503 ``Video not available`` response when
        the upstream answers with a non-200 status or cannot be reached.
    """
    # The stream is endless, so the default total timeout (5 minutes) would
    # cut it off mid-stream; only bound the time to establish the connection.
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=10)
    response = None
    try:
        async with ClientSession(timeout=timeout) as session:
            async with session.get(f"{WAV2LIP_URL}/mjpeg") as resp:
                if resp.status == 200:
                    response = web.StreamResponse()
                    # Copy the raw header: resp.content_type is only the bare
                    # MIME type and drops parameters such as the multipart
                    # boundary the browser needs to split the frames.
                    response.headers['Content-Type'] = resp.headers.get(
                        'Content-Type', resp.content_type
                    )
                    await response.prepare(request)
                    async for chunk in resp.content.iter_any():
                        await response.write(chunk)
                    return response
    except Exception as e:
        print(f"Erro ao obter vídeo: {e}")
    if response is not None and response.prepared:
        # Headers already went out; a second (503) response cannot be sent on
        # this request, so hand back the stream that was started.
        return response
    return web.Response(status=503, text="Video not available")
async def websocket_handler(request):
    """Bridge the browser WebSocket to the Wav2Lip WebSocket.

    Accepts the client connection, opens ``{WAV2LIP_URL}/ws`` and relays
    messages in both directions until either side goes away:

    * Wav2Lip -> client: TEXT and BINARY frames are passed through unchanged.
    * client -> Wav2Lip: a JSON ``{"action": "speak"}`` message is reduced to
      its ``action``/``text``/``voice`` fields (voice defaults to ``tara``);
      any other valid JSON text message is passed through unchanged.

    Malformed client messages are answered with a
    ``{"status": "error", ...}`` message instead of ending the session.

    Returns:
        The ``web.WebSocketResponse`` for the client connection.
    """
    ws_response = web.WebSocketResponse()
    await ws_response.prepare(request)
    try:
        async with ClientSession() as session:
            async with session.ws_connect(f"{WAV2LIP_URL}/ws") as wav2lip_ws:

                async def forward_from_wav2lip():
                    """Relay messages from Wav2Lip to the client."""
                    try:
                        async for msg in wav2lip_ws:
                            if msg.type == aiohttp.WSMsgType.TEXT:
                                await ws_response.send_str(msg.data)
                            elif msg.type == aiohttp.WSMsgType.BINARY:
                                await ws_response.send_bytes(msg.data)
                            elif msg.type == aiohttp.WSMsgType.ERROR:
                                break
                    except Exception as e:
                        print(f"Erro ao encaminhar de Wav2Lip: {e}")

                async def forward_from_client():
                    """Relay messages from the client to Wav2Lip."""
                    try:
                        async for msg in ws_response:
                            if msg.type == aiohttp.WSMsgType.TEXT:
                                try:
                                    data = json.loads(msg.data)
                                except ValueError:
                                    # One bad message must not end the relay.
                                    await ws_response.send_json({
                                        "status": "error",
                                        "message": "Invalid JSON message"
                                    })
                                    continue
                                is_speak = (
                                    isinstance(data, dict)
                                    and data.get('action') == 'speak'
                                )
                                if is_speak:
                                    text = data.get('text')
                                    if not isinstance(text, str):
                                        await ws_response.send_json({
                                            "status": "error",
                                            "message": "Missing 'text' for speak action"
                                        })
                                        continue
                                    # Wav2Lip handles the TTS step itself.
                                    await wav2lip_ws.send_str(json.dumps({
                                        'action': 'speak',
                                        'text': text,
                                        'voice': data.get('voice', 'tara')
                                    }))
                                else:
                                    await wav2lip_ws.send_str(msg.data)
                            elif msg.type == aiohttp.WSMsgType.ERROR:
                                break
                    except Exception as e:
                        print(f"Erro ao encaminhar do cliente: {e}")

                # Run both directions concurrently and stop as soon as one
                # side ends; waiting for both would keep the upstream socket
                # (and this handler) alive after the browser has gone.
                tasks = [
                    asyncio.ensure_future(forward_from_wav2lip()),
                    asyncio.ensure_future(forward_from_client()),
                ]
                try:
                    await asyncio.wait(
                        tasks, return_when=asyncio.FIRST_COMPLETED
                    )
                finally:
                    for task in tasks:
                        task.cancel()
                    # Let the cancelled task unwind before the sockets close.
                    await asyncio.gather(*tasks, return_exceptions=True)
    except Exception as e:
        print(f"Erro WebSocket: {e}")
        # The client may already be gone; reporting the error is best-effort.
        if not ws_response.closed:
            try:
                await ws_response.send_json(
                    {"status": "error", "message": str(e)}
                )
            except Exception as send_error:
                print(f"Erro WebSocket: {send_error}")
    return ws_response
async def health(request):
    """Health check endpoint.

    Probes the root URL of the TTS and Wav2Lip services concurrently and
    reports, per service, whether it answered with HTTP 200.

    Returns:
        A JSON response of the form
        ``{"status": "ok", "services": {"tts": bool, "wav2lip": bool}}``.
        The top-level ``status`` is always ``"ok"``: it states that this
        server is responding, not that the backing services are up.
    """
    services = {
        "tts": TTS_URL,
        "wav2lip": WAV2LIP_URL,
    }
    # Keep the probe short so a hung backend cannot stall the health check
    # for the client library's default five-minute total timeout.
    timeout = aiohttp.ClientTimeout(total=5)

    async def is_up(session, base_url):
        """Return True when ``base_url`` answers GET / with status 200."""
        try:
            async with session.get(f"{base_url}/") as resp:
                return resp.status == 200
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
            return False

    async with ClientSession(timeout=timeout) as session:
        results = await asyncio.gather(
            *(is_up(session, url) for url in services.values())
        )
    status = {
        "status": "ok",
        "services": dict(zip(services, results)),
    }
    return web.json_response(status)
def create_app():
    """Build the aiohttp application and register its GET routes."""
    application = web.Application()
    # Registration order matches the original: page, video proxy, WebSocket
    # bridge, health check.
    route_table = (
        ('/', index),
        ('/mjpeg', proxy_mjpeg),
        ('/ws/avatar', websocket_handler),
        ('/health', health),
    )
    for path, handler in route_table:
        application.router.add_get(path, handler)
    return application
if __name__ == '__main__':
    # Startup banner: one entry per output line, printed in a single call.
    banner_lines = [
        "=== Servidor Integrador ===",
        f"Porta: {PORT}",
        f"TTS: {TTS_URL}",
        f"Wav2Lip: {WAV2LIP_URL}",
        f"Acesse: http://localhost:{PORT}",
        "=" * 30,
    ]
    print("\n".join(banner_lines))
    # Listen on all interfaces on the configured port.
    app = create_app()
    web.run_app(app, host='0.0.0.0', port=PORT)