avatar-integrador / server.py

Initial commit: Avatar integrator server with performance statistics

53b54ed about 1 month ago

23.5 kB

	"""
	Servidor Integrador - Avatar com TTS
	Porta: 8080
	Conecta:
	- Orpheus TTS: localhost:8880
	- Wav2Lip: localhost:8085
	"""
	import asyncio
	import json
	import os
	from aiohttp import web, ClientSession
	import aiohttp

	# Port this integrator server listens on (passed to web.run_app).
	PORT = 8080
	# Base URL of the Wav2Lip service: MJPEG stream, WebSocket and health probe.
	WAV2LIP_URL = "http://localhost:8085"
	# Base URL of the TTS service; in this file it is only probed by the health
	# check and printed at startup.
	TTS_URL = "http://localhost:8880"

	# HTML da interface
	HTML_TEMPLATE = """
	<!DOCTYPE html>
	<html lang="pt-BR">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Avatar Integrado</title>
	<style>
	* { margin: 0; padding: 0; box-sizing: border-box; }
	body {
	font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
	background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
	min-height: 100vh;
	display: flex;
	flex-direction: column;
	align-items: center;
	padding: 20px;
	color: #fff;
	}
	h1 {
	margin-bottom: 20px;
	font-weight: 300;
	font-size: 1.8rem;
	}
	.container {
	display: flex;
	gap: 20px;
	flex-wrap: wrap;
	justify-content: center;
	max-width: 1200px;
	}
	.video-container {
	background: #000;
	border-radius: 12px;
	overflow: hidden;
	box-shadow: 0 10px 40px rgba(0,0,0,0.5);
	}
	#avatar-video {
	width: 512px;
	height: 512px;
	object-fit: cover;
	}
	.controls {
	background: rgba(255,255,255,0.1);
	backdrop-filter: blur(10px);
	border-radius: 12px;
	padding: 20px;
	width: 350px;
	}
	.control-group {
	margin-bottom: 15px;
	}
	label {
	display: block;
	margin-bottom: 5px;
	font-size: 0.9rem;
	color: #aaa;
	}
	input, select, textarea {
	width: 100%;
	padding: 10px;
	border: none;
	border-radius: 8px;
	background: rgba(255,255,255,0.1);
	color: #fff;
	font-size: 1rem;
	}
	textarea {
	min-height: 100px;
	resize: vertical;
	}
	button {
	width: 100%;
	padding: 12px;
	border: none;
	border-radius: 8px;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: #fff;
	font-size: 1rem;
	cursor: pointer;
	transition: transform 0.2s, box-shadow 0.2s;
	}
	button:hover {
	transform: translateY(-2px);
	box-shadow: 0 5px 20px rgba(102, 126, 234, 0.4);
	}
	button:disabled {
	opacity: 0.5;
	cursor: not-allowed;
	transform: none;
	}
	.status {
	margin-top: 15px;
	padding: 10px;
	border-radius: 8px;
	background: rgba(0,0,0,0.3);
	font-size: 0.85rem;
	}
	.status.connected { border-left: 3px solid #4caf50; }
	.status.disconnected { border-left: 3px solid #f44336; }
	.status.speaking { border-left: 3px solid #2196f3; }
	#audio-player { display: none; }

	/* Statistics Panel */
	.stats-panel {
	background: rgba(255,255,255,0.1);
	backdrop-filter: blur(10px);
	border-radius: 12px;
	padding: 20px;
	width: 350px;
	max-height: 500px;
	overflow-y: auto;
	}
	.stats-panel h2 {
	font-size: 1.1rem;
	margin-bottom: 15px;
	color: #aaa;
	font-weight: 400;
	}
	.stat-row {
	display: flex;
	justify-content: space-between;
	padding: 8px 0;
	border-bottom: 1px solid rgba(255,255,255,0.1);
	}
	.stat-label {
	color: #aaa;
	font-size: 0.85rem;
	}
	.stat-value {
	font-weight: 500;
	font-size: 0.9rem;
	}
	.stat-value.fast { color: #4caf50; }
	.stat-value.medium { color: #ff9800; }
	.stat-value.slow { color: #f44336; }
	.stats-history {
	margin-top: 15px;
	}
	.stats-history h3 {
	font-size: 0.95rem;
	margin-bottom: 10px;
	color: #888;
	}
	.history-item {
	background: rgba(0,0,0,0.3);
	border-radius: 8px;
	padding: 10px;
	margin-bottom: 8px;
	font-size: 0.8rem;
	}
	.history-item .timestamp {
	color: #666;
	font-size: 0.75rem;
	}
	.history-item .metrics {
	display: grid;
	grid-template-columns: 1fr 1fr;
	gap: 5px;
	margin-top: 5px;
	}
	.avg-stats {
	background: rgba(102, 126, 234, 0.2);
	border-radius: 8px;
	padding: 10px;
	margin-bottom: 15px;
	}
	.avg-stats h3 {
	font-size: 0.9rem;
	margin-bottom: 8px;
	color: #667eea;
	}
	</style>
	</head>
	<body>
	<h1>Avatar Integrado</h1>

	<div class="container">
	<div class="video-container">
	<img id="avatar-video" alt="Avatar">
	</div>

	<div class="controls">
	<div class="control-group">
	<label>Voz</label>
	<select id="voice-select">
	<option value="tara">Tara (Feminina)</option>
	<option value="leah">Leah (Feminina)</option>
	<option value="jess">Jess (Feminina)</option>
	<option value="mia">Mia (Feminina)</option>
	<option value="leo">Leo (Masculina)</option>
	<option value="dan">Dan (Masculina)</option>
	<option value="zac">Zac (Masculina)</option>
	<option value="zoe">Zoe (Feminina)</option>
	</select>
	</div>

	<div class="control-group">
	<label>Texto para falar</label>
	<textarea id="text-input" placeholder="Enter the text here...">Hello! I am a digital avatar with real-time voice synthesis.</textarea>
	</div>

	<button id="speak-btn" onclick="speak()">Falar</button>

	<div id="status" class="status disconnected">
	Conectando...
	</div>
	</div>

	<!-- Statistics Panel -->
	<div class="stats-panel">
	<h2>Performance Statistics</h2>

	<div class="avg-stats">
	<h3>Averages (last 10)</h3>
	<div class="stat-row">
	<span class="stat-label">Avg Round-Trip</span>
	<span id="avg-roundtrip" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">Avg TTS Time</span>
	<span id="avg-tts" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">Avg Wav2Lip Time</span>
	<span id="avg-wav2lip" class="stat-value">--</span>
	</div>
	</div>

	<div class="stat-row">
	<span class="stat-label">Last Round-Trip</span>
	<span id="last-roundtrip" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">Last TTS Time</span>
	<span id="last-tts" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">Last Wav2Lip Time</span>
	<span id="last-wav2lip" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">First Frame</span>
	<span id="last-firstframe" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">Audio Duration</span>
	<span id="last-audioduration" class="stat-value">--</span>
	</div>
	<div class="stat-row">
	<span class="stat-label">Text Length</span>
	<span id="last-textlen" class="stat-value">--</span>
	</div>

	<div class="stats-history">
	<h3>Request History</h3>
	<div id="history-container"></div>
	</div>
	</div>
	</div>

	<audio id="audio-player"></audio>

	<script>
	// DOM handles shared by the WebSocket callbacks and speak().
	const statusEl = document.getElementById('status');
	const speakBtn = document.getElementById('speak-btn');
	const audioPlayer = document.getElementById('audio-player');
	// Current WebSocket; (re)assigned on every connectWebSocket() call.
	let ws = null;
	// Set to true in the socket's onopen handler and back to false in onclose.
	let isConnected = false;

	// Statistics tracking
	// performance.now() value recorded by speak(); reset to null when an
	// 'idle' or 'error' status message arrives.
	let requestStartTime = null;
	// Newest entry first; updateStats() trims it to MAX_HISTORY entries.
	let statsHistory = [];
	const MAX_HISTORY = 10;

	// Conectar ao WebSocket do Wav2Lip
	// Open the WebSocket to this server's /ws/avatar endpoint and install its
	// handlers. Status messages drive the status line and the speak button,
	// audio payloads are played, and timing fields are passed to updateStats().
	// When the socket closes, a new connection attempt is scheduled after 2 s.
	function connectWebSocket() {
	    // Follow the page's scheme: a page served over https may not open ws://.
	    const scheme = window.location.protocol === 'https:' ? 'wss://' : 'ws://';
	    ws = new WebSocket(scheme + window.location.host + '/ws/avatar');

	    ws.onopen = () => {
	        isConnected = true;
	        statusEl.textContent = 'Conectado ao servidor';
	        statusEl.className = 'status connected';
	    };

	    ws.onmessage = (event) => {
	        // The server may also relay binary frames; those (and any malformed
	        // text) are not JSON and must not abort the handler with an exception.
	        let data;
	        try {
	            data = JSON.parse(event.data);
	        } catch (err) {
	            console.error('Ignoring non-JSON WebSocket message:', err);
	            return;
	        }
	        if (data === null || typeof data !== 'object') {
	            return;
	        }

	        if (data.status === 'speaking') {
	            statusEl.textContent = 'Falando...';
	            statusEl.className = 'status speaking';
	            speakBtn.disabled = true;
	        } else if (data.status === 'idle') {
	            statusEl.textContent = 'Pronto';
	            statusEl.className = 'status connected';
	            speakBtn.disabled = false;
	            // Round-trip time is measured from speak() until the idle status.
	            if (requestStartTime) {
	                const roundTrip = (performance.now() - requestStartTime) / 1000;
	                updateStats({roundTrip});
	                requestStartTime = null;
	            }
	        } else if (data.status === 'error') {
	            statusEl.textContent = 'Erro: ' + data.message;
	            statusEl.className = 'status disconnected';
	            speakBtn.disabled = false;
	            requestStartTime = null;
	        } else if (data.audio) {
	            // Audio arrives base64-encoded; wrap it in a Blob for the player.
	            const audioBlob = base64ToBlob(data.audio, 'audio/wav');
	            // Free the previous clip so object URLs do not pile up.
	            if (audioPlayer.src.startsWith('blob:')) {
	                URL.revokeObjectURL(audioPlayer.src);
	            }
	            audioPlayer.src = URL.createObjectURL(audioBlob);
	            // play() returns a promise that rejects when playback is blocked.
	            const playback = audioPlayer.play();
	            if (playback !== undefined) {
	                playback.catch((err) => console.error('Audio playback failed:', err));
	            }
	        }

	        // Timing stats may arrive grouped under "stats"...
	        if (data.stats) {
	            updateStats(data.stats);
	        }
	        // ...or as individual top-level fields.
	        if (data.tts_time !== undefined || data.wav2lip_time !== undefined || data.first_frame_time !== undefined) {
	            updateStats({
	                tts_time: data.tts_time,
	                wav2lip_time: data.wav2lip_time,
	                first_frame_time: data.first_frame_time,
	                audio_duration: data.audio_duration,
	                text_length: data.text_length
	            });
	        }
	    };

	    ws.onclose = () => {
	        isConnected = false;
	        statusEl.textContent = 'Desconectado. Reconectando...';
	        statusEl.className = 'status disconnected';
	        setTimeout(connectWebSocket, 2000);
	    };

	    ws.onerror = (error) => {
	        console.error('WebSocket error:', error);
	    };
	}

	// Decode a base64 string and wrap the resulting bytes in a Blob that
	// carries the given MIME type.
	function base64ToBlob(base64, mimeType) {
	    const decoded = atob(base64);
	    // atob() yields one character per byte, so each char code is the byte value.
	    const bytes = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
	    return new Blob([bytes], { type: mimeType });
	}

	// Send the textarea content and the selected voice to the server as a
	// 'speak' command. Alerts and returns early when the text is empty or the
	// socket is not connected; otherwise disables the button, updates the status
	// line and records the start time used for the round-trip measurement.
	async function speak() {
	    const text = document.getElementById('text-input').value.trim();
	    const voice = document.getElementById('voice-select').value;

	    if (text === '') {
	        alert('Digite um texto para falar');
	        return;
	    }
	    if (!isConnected) {
	        alert('Não conectado ao servidor');
	        return;
	    }

	    speakBtn.disabled = true;
	    statusEl.className = 'status speaking';
	    statusEl.textContent = 'Gerando áudio...';

	    // Start of the round-trip measurement.
	    requestStartTime = performance.now();

	    const command = {
	        action: 'speak',
	        text,
	        voice,
	        text_length: text.length
	    };
	    ws.send(JSON.stringify(command));
	}

	// Statistics functions
	// Format a duration in seconds as a string with two decimals and an "s"
	// suffix, e.g. 1.234 -> "1.23s". Returns "--" for undefined, null, NaN,
	// infinite or non-numeric input so a bad value never throws in toFixed().
	function formatTime(seconds) {
	    if (typeof seconds !== 'number' || !Number.isFinite(seconds)) return '--';
	    return seconds.toFixed(2) + 's';
	}

	// Map a duration to the CSS class used to colour it: 'fast' when it is at
	// most thresholds.fast, 'medium' when at most thresholds.medium, otherwise
	// 'slow'. Returns '' when the duration is undefined or null.
	function getSpeedClass(seconds, thresholds) {
	    if (seconds === undefined || seconds === null) return '';
	    if (seconds <= thresholds.fast) return 'fast';
	    if (seconds <= thresholds.medium) return 'medium';
	    return 'slow';
	}

	// Show the values in newStats in the "last request" rows and, when the
	// object carries tts_time or roundTrip, push it (with a timestamp) onto
	// statsHistory and refresh the averages and the history list.
	// Fields that are undefined leave their row untouched.
	function updateStats(newStats) {
	    const entry = {
	        timestamp: new Date(),
	        ...newStats
	    };

	    // Timed rows: stat field, target element id, colour thresholds (seconds).
	    const timedRows = [
	        {key: 'roundTrip', id: 'last-roundtrip', thresholds: {fast: 3, medium: 6}},
	        {key: 'tts_time', id: 'last-tts', thresholds: {fast: 2, medium: 4}},
	        {key: 'wav2lip_time', id: 'last-wav2lip', thresholds: {fast: 1, medium: 2}},
	        {key: 'first_frame_time', id: 'last-firstframe', thresholds: {fast: 3, medium: 5}}
	    ];
	    for (const {key, id, thresholds} of timedRows) {
	        const value = newStats[key];
	        if (value === undefined) continue;
	        const el = document.getElementById(id);
	        el.textContent = formatTime(value);
	        el.className = 'stat-value ' + getSpeedClass(value, thresholds);
	    }

	    // These two rows are informational only and get no speed colouring.
	    if (newStats.audio_duration !== undefined) {
	        document.getElementById('last-audioduration').textContent = formatTime(newStats.audio_duration);
	    }
	    if (newStats.text_length !== undefined) {
	        document.getElementById('last-textlen').textContent = newStats.text_length + ' chars';
	    }

	    // Only add to history if we have timing data
	    if (newStats.tts_time !== undefined || newStats.roundTrip !== undefined) {
	        statsHistory.unshift(entry);
	        if (statsHistory.length > MAX_HISTORY) {
	            statsHistory.pop();
	        }
	        updateAverages();
	        updateHistory();
	    }
	}

	// Recompute the mean round-trip, TTS and Wav2Lip times over the entries in
	// statsHistory that define each field, and display them with speed colours.
	// A row is left unchanged when no entry defines its field.
	function updateAverages() {
	    const series = [
	        ['roundTrip', 'avg-roundtrip', {fast: 3, medium: 6}],
	        ['tts_time', 'avg-tts', {fast: 2, medium: 4}],
	        ['wav2lip_time', 'avg-wav2lip', {fast: 1, medium: 2}]
	    ];

	    series.forEach(([field, elementId, limits]) => {
	        const samples = [];
	        for (const record of statsHistory) {
	            if (record[field] !== undefined) {
	                samples.push(record[field]);
	            }
	        }
	        if (samples.length === 0) {
	            return;
	        }

	        let total = 0;
	        for (const sample of samples) {
	            total = total + sample;
	        }
	        const mean = total / samples.length;

	        const target = document.getElementById(elementId);
	        target.textContent = formatTime(mean);
	        target.className = 'stat-value ' + getSpeedClass(mean, limits);
	    });
	}

	// Re-render the "Request History" list from statsHistory.
	// Each entry becomes one .history-item showing its 1-based position, its
	// timestamp as a local time string, and whichever of the four timing
	// metrics are defined on that entry.
	function updateHistory() {
	const container = document.getElementById('history-container');
	// The whole list is rebuilt on every call by assigning innerHTML.
	container.innerHTML = statsHistory.map((entry, idx) => {
	const time = entry.timestamp.toLocaleTimeString();
	return `
	<div class="history-item">
	<div class="timestamp">#${idx + 1} - ${time}</div>
	<div class="metrics">
	${entry.roundTrip !== undefined ? `<span>Round-trip: ${formatTime(entry.roundTrip)}</span>` : ''}
	${entry.tts_time !== undefined ? `<span>TTS: ${formatTime(entry.tts_time)}</span>` : ''}
	${entry.wav2lip_time !== undefined ? `<span>Wav2Lip: ${formatTime(entry.wav2lip_time)}</span>` : ''}
	${entry.first_frame_time !== undefined ? `<span>1st Frame: ${formatTime(entry.first_frame_time)}</span>` : ''}
	</div>
	</div>
	`;
	}).join('');
	}

	// Start the WebSocket connection
	connectWebSocket();

	// Set MJPEG source based on current host (uses port 8085 for Wav2Lip)
	// NOTE(review): the scheme is fixed to http and the port to 8085, so the
	// image is requested from that port directly rather than from this page's
	// own origin — confirm this is intended.
	const avatarImg = document.getElementById('avatar-video');
	avatarImg.src = 'http://' + window.location.hostname + ':8085/mjpeg';
	</script>
	</body>
	</html>
	"""

	async def index(request):
	    """Serve the main page.

	    Args:
	        request: The incoming aiohttp request (not inspected).

	    Returns:
	        An HTML response whose body is ``HTML_TEMPLATE``.
	    """
	    page = web.Response(content_type='text/html', text=HTML_TEMPLATE)
	    return page

	async def proxy_mjpeg(request):
	    """Proxy the Wav2Lip MJPEG stream to the caller.

	    Args:
	        request: The incoming aiohttp request.

	    Returns:
	        The streamed response when the upstream answered with HTTP 200
	        (even if the stream later breaks), otherwise a 503 response with
	        the text "Video not available".
	    """
	    response = None
	    # The stream never ends on its own, so the client's default total
	    # timeout must be disabled or the video is cut off mid-stream.
	    stream_timeout = aiohttp.ClientTimeout(total=None)
	    try:
	        async with ClientSession(timeout=stream_timeout) as session:
	            async with session.get(f"{WAV2LIP_URL}/mjpeg") as resp:
	                if resp.status == 200:
	                    # Copy the raw header: resp.content_type drops parameters
	                    # such as the multipart boundary the browser needs.
	                    content_type = resp.headers.get(
	                        'Content-Type', 'application/octet-stream'
	                    )
	                    response = web.StreamResponse(
	                        headers={'Content-Type': content_type}
	                    )
	                    await response.prepare(request)

	                    async for chunk in resp.content.iter_any():
	                        await response.write(chunk)

	                    return response
	    except Exception as e:
	        print(f"Erro ao obter vídeo: {e}")
	        # Once headers have been sent a second response cannot be started;
	        # hand back the partially streamed one instead.
	        if response is not None and response.prepared:
	            return response

	    return web.Response(status=503, text="Video not available")

	async def websocket_handler(request):
	    """Bridge a browser WebSocket to the Wav2Lip WebSocket.

	    Messages from Wav2Lip are relayed to the browser unchanged. Text
	    messages from the browser are parsed as JSON: a ``speak`` action is
	    re-sent upstream with only ``action``, ``text`` and ``voice`` (default
	    ``'tara'``); any other message is forwarded as received. Invalid
	    messages are answered with an error status instead of ending the
	    bridge. The bridge stops as soon as either side closes.

	    Args:
	        request: The incoming aiohttp upgrade request.

	    Returns:
	        The ``web.WebSocketResponse`` used for the browser connection.
	    """
	    ws_response = web.WebSocketResponse()
	    await ws_response.prepare(request)

	    try:
	        async with ClientSession() as session:
	            async with session.ws_connect(f"{WAV2LIP_URL}/ws") as wav2lip_ws:

	                async def forward_from_wav2lip():
	                    """Relay Wav2Lip messages to the browser."""
	                    try:
	                        async for msg in wav2lip_ws:
	                            if msg.type == aiohttp.WSMsgType.TEXT:
	                                await ws_response.send_str(msg.data)
	                            elif msg.type == aiohttp.WSMsgType.BINARY:
	                                await ws_response.send_bytes(msg.data)
	                            elif msg.type == aiohttp.WSMsgType.ERROR:
	                                break
	                    except Exception as e:
	                        print(f"Erro ao encaminhar de Wav2Lip: {e}")

	                async def forward_from_client():
	                    """Relay browser messages to Wav2Lip."""
	                    try:
	                        async for msg in ws_response:
	                            if msg.type == aiohttp.WSMsgType.TEXT:
	                                # A single bad message must not end the relay.
	                                try:
	                                    data = json.loads(msg.data)
	                                except json.JSONDecodeError:
	                                    await ws_response.send_json({
	                                        "status": "error",
	                                        "message": "Mensagem JSON inválida",
	                                    })
	                                    continue

	                                is_speak = (
	                                    isinstance(data, dict)
	                                    and data.get('action') == 'speak'
	                                )
	                                if is_speak:
	                                    text = data.get('text')
	                                    if not isinstance(text, str) or not text.strip():
	                                        await ws_response.send_json({
	                                            "status": "error",
	                                            "message": "Texto vazio ou ausente",
	                                        })
	                                        continue
	                                    # Wav2Lip handles the TTS step itself.
	                                    await wav2lip_ws.send_str(json.dumps({
	                                        'action': 'speak',
	                                        'text': text,
	                                        'voice': data.get('voice', 'tara')
	                                    }))
	                                else:
	                                    await wav2lip_ws.send_str(msg.data)

	                            elif msg.type == aiohttp.WSMsgType.ERROR:
	                                break
	                    except Exception as e:
	                        print(f"Erro ao encaminhar do cliente: {e}")

	                # Run both directions concurrently, but stop as soon as one
	                # side is done so the other socket is not kept open forever.
	                tasks = [
	                    asyncio.ensure_future(forward_from_wav2lip()),
	                    asyncio.ensure_future(forward_from_client()),
	                ]
	                try:
	                    await asyncio.wait(
	                        tasks, return_when=asyncio.FIRST_COMPLETED
	                    )
	                finally:
	                    for task in tasks:
	                        task.cancel()
	                    await asyncio.gather(*tasks, return_exceptions=True)

	    except Exception as e:
	        print(f"Erro WebSocket: {e}")
	        # Best effort: the browser may already be gone.
	        if not ws_response.closed:
	            try:
	                await ws_response.send_json(
	                    {"status": "error", "message": str(e)}
	                )
	            except Exception as send_error:
	                print(f"Erro ao notificar o cliente: {send_error}")

	    return ws_response

	async def _service_is_up(session, url):
	    """Return True when a GET to ``url`` answers with HTTP 200."""
	    try:
	        async with session.get(url) as resp:
	            return resp.status == 200
	    except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
	        # Unreachable, refused or too slow all count as "down". Task
	        # cancellation is deliberately not caught here.
	        return False


	async def health(request):
	    """Health check endpoint.

	    Probes the root URL of the TTS and Wav2Lip services concurrently.

	    Args:
	        request: The incoming aiohttp request (not inspected).

	    Returns:
	        A JSON response ``{"status": "ok", "services": {"tts": bool,
	        "wav2lip": bool}}`` where each flag is True only when that
	        service answered with HTTP 200.
	    """
	    # Keep the probe short so a hung backend cannot stall the health check.
	    probe_timeout = aiohttp.ClientTimeout(total=5)
	    async with ClientSession(timeout=probe_timeout) as session:
	        tts_ok, wav2lip_ok = await asyncio.gather(
	            _service_is_up(session, f"{TTS_URL}/"),
	            _service_is_up(session, f"{WAV2LIP_URL}/"),
	        )

	    status = {
	        "status": "ok",
	        "services": {
	            "tts": tts_ok,
	            "wav2lip": wav2lip_ok,
	        },
	    }
	    return web.json_response(status)

	def create_app():
	    """Build the aiohttp application and register its four GET routes.

	    Returns:
	        A ``web.Application`` serving ``/``, ``/mjpeg``, ``/ws/avatar``
	        and ``/health``.
	    """
	    routes = (
	        ('/', index),
	        ('/mjpeg', proxy_mjpeg),
	        ('/ws/avatar', websocket_handler),
	        ('/health', health),
	    )
	    application = web.Application()
	    for path, handler in routes:
	        application.router.add_get(path, handler)
	    return application

	if __name__ == '__main__':
	    # Startup banner: one line per setting, then a separator.
	    banner_lines = (
	        "=== Servidor Integrador ===",
	        f"Porta: {PORT}",
	        f"TTS: {TTS_URL}",
	        f"Wav2Lip: {WAV2LIP_URL}",
	        f"Acesse: http://localhost:{PORT}",
	        "=" * 30,
	    )
	    for banner_line in banner_lines:
	        print(banner_line)

	    # Listen on all interfaces.
	    app = create_app()
	    web.run_app(app, port=PORT, host='0.0.0.0')