Commit ·
0e82ee5
1
Parent(s): 4f5ba94
Improve idle video sync and smooth transitions
- Add idle_video_time_ms tracking for seamless transitions
- Sync idle video position when speech ends (end_video_time_ms)
- Pre-render first frame before showing canvas (eliminates flash)
- Use indexed array for O(1) frame access
- Add generate_complete action proxy for Wav2Lip
- Remove server-side crossfade (transition handled by client)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- interface/index_streaming.html +93 -28
- interface/server_streaming.py +189 -52
interface/index_streaming.html
CHANGED
|
@@ -66,9 +66,9 @@ const btnGo = document.getElementById('btnGo');
|
|
| 66 |
const btnStop = document.getElementById('btnStop');
|
| 67 |
|
| 68 |
let ws = null;
|
| 69 |
-
let
|
| 70 |
let isPlaying = false;
|
| 71 |
-
let isBuffering = false;
|
| 72 |
let audioContext = null;
|
| 73 |
let audioBuffer = null;
|
| 74 |
let audioSource = null;
|
|
@@ -77,6 +77,8 @@ let frameCount = 0;
|
|
| 77 |
let totalFrames = 0;
|
| 78 |
let playbackStartTime = 0;
|
| 79 |
let animationId = null;
|
|
|
|
|
|
|
| 80 |
|
| 81 |
// Configuracao
|
| 82 |
const TARGET_FPS = 25;
|
|
@@ -149,14 +151,18 @@ function connect() {
|
|
| 149 |
case 'audio':
|
| 150 |
// MUDANCA: Audio chegou - AGORA iniciar playback sincronizado!
|
| 151 |
setMetric('mAudio', (msg.duration_ms / 1000).toFixed(2) + 's');
|
|
|
|
|
|
|
|
|
|
| 152 |
startSyncedPlayback(msg.audio, msg.duration_ms);
|
| 153 |
break;
|
| 154 |
|
| 155 |
case 'done':
|
| 156 |
totalFrames = msg.frames;
|
| 157 |
setMetric('mFrames', msg.frames);
|
| 158 |
-
|
| 159 |
-
|
|
|
|
| 160 |
break;
|
| 161 |
|
| 162 |
case 'error':
|
|
@@ -169,14 +175,15 @@ function connect() {
|
|
| 169 |
}
|
| 170 |
|
| 171 |
function addFrame(base64Frame, index) {
|
| 172 |
-
// Decodificar frame e adicionar
|
| 173 |
const img = new Image();
|
| 174 |
img.onload = () => {
|
| 175 |
-
|
|
|
|
| 176 |
frameCount++;
|
| 177 |
|
| 178 |
// Ajustar tamanho do canvas no primeiro frame
|
| 179 |
-
if (
|
| 180 |
talkCanvas.width = img.width;
|
| 181 |
talkCanvas.height = img.height;
|
| 182 |
}
|
|
@@ -193,14 +200,19 @@ function startBuffering() {
|
|
| 193 |
// Preparar para receber frames, mas NAO iniciar playback
|
| 194 |
isBuffering = true;
|
| 195 |
isPlaying = false;
|
| 196 |
-
|
| 197 |
frameCount = 0;
|
| 198 |
totalFrames = 0;
|
|
|
|
| 199 |
|
| 200 |
-
//
|
| 201 |
-
talkCanvas.style.display = 'block';
|
| 202 |
}
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
async function startSyncedPlayback(base64Audio, durationMs) {
|
| 205 |
// Audio chegou - iniciar playback sincronizado de video + audio
|
| 206 |
setStatus('Reproduzindo...', 'ok');
|
|
@@ -240,14 +252,37 @@ async function startSyncedPlayback(base64Audio, durationMs) {
|
|
| 240 |
|
| 241 |
audioSource.onended = () => {
|
| 242 |
audioSource = null;
|
| 243 |
-
//
|
| 244 |
-
|
| 245 |
-
if (!audioSource) {
|
| 246 |
-
stopPlayback();
|
| 247 |
-
}
|
| 248 |
-
}, 200);
|
| 249 |
};
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
// INICIAR TUDO SINCRONIZADO: audio + video ao mesmo tempo!
|
| 252 |
isPlaying = true;
|
| 253 |
isBuffering = false;
|
|
@@ -271,27 +306,40 @@ function renderLoop() {
|
|
| 271 |
if (!isPlaying) return;
|
| 272 |
|
| 273 |
const elapsed = performance.now() - playbackStartTime;
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
const frame = frameQueue.find(f => f.index === targetFrame);
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
| 284 |
}
|
| 285 |
|
| 286 |
// Atualizar progresso visual
|
| 287 |
-
const total = totalFrames || frameCount;
|
| 288 |
if (total > 0) {
|
| 289 |
const displayedFrame = Math.min(targetFrame, total);
|
| 290 |
progress.style.width = (displayedFrame / total * 100) + '%';
|
| 291 |
}
|
| 292 |
|
| 293 |
// Continuar enquanto tiver audio ou frames
|
| 294 |
-
if (audioSource || targetFrame <
|
| 295 |
animationId = requestAnimationFrame(renderLoop);
|
| 296 |
} else {
|
| 297 |
stopPlayback();
|
|
@@ -314,11 +362,23 @@ function stopPlayback() {
|
|
| 314 |
audioSource = null;
|
| 315 |
}
|
| 316 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
// Esconder canvas, mostrar idle
|
| 318 |
talkCanvas.style.display = 'none';
|
| 319 |
ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);
|
| 320 |
|
| 321 |
-
|
|
|
|
| 322 |
setStatus('Pronto', 'ok');
|
| 323 |
setButtons(false);
|
| 324 |
}
|
|
@@ -341,10 +401,15 @@ btnGo.onclick = () => {
|
|
| 341 |
setStatus('Gerando...', 'busy');
|
| 342 |
startTime = Date.now();
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
ws.send(JSON.stringify({
|
| 345 |
action: 'generate',
|
| 346 |
text: text,
|
| 347 |
-
voice: document.getElementById('voice').value
|
|
|
|
| 348 |
}));
|
| 349 |
};
|
| 350 |
|
|
|
|
| 66 |
const btnStop = document.getElementById('btnStop');
|
| 67 |
|
| 68 |
let ws = null;
|
| 69 |
+
let frames = []; // Array indexado para acesso O(1)
|
| 70 |
let isPlaying = false;
|
| 71 |
+
let isBuffering = false;
|
| 72 |
let audioContext = null;
|
| 73 |
let audioBuffer = null;
|
| 74 |
let audioSource = null;
|
|
|
|
| 77 |
let totalFrames = 0;
|
| 78 |
let playbackStartTime = 0;
|
| 79 |
let animationId = null;
|
| 80 |
+
let lastRenderedFrame = -1; // Para evitar re-renderizar mesmo frame
|
| 81 |
+
let endVideoTimeMs = 0; // Tempo do idle video onde a fala termina
|
| 82 |
|
| 83 |
// Configuracao
|
| 84 |
const TARGET_FPS = 25;
|
|
|
|
| 151 |
case 'audio':
|
| 152 |
// MUDANCA: Audio chegou - AGORA iniciar playback sincronizado!
|
| 153 |
setMetric('mAudio', (msg.duration_ms / 1000).toFixed(2) + 's');
|
| 154 |
+
// Calcular FPS real baseado na duracao do audio
|
| 155 |
+
const realFps = (totalFrames || frameCount) / (msg.duration_ms / 1000);
|
| 156 |
+
setMetric('mFps', realFps.toFixed(1));
|
| 157 |
startSyncedPlayback(msg.audio, msg.duration_ms);
|
| 158 |
break;
|
| 159 |
|
| 160 |
case 'done':
|
| 161 |
totalFrames = msg.frames;
|
| 162 |
setMetric('mFrames', msg.frames);
|
| 163 |
+
// Salvar end_video_time_ms para sincronizar idle quando fala terminar
|
| 164 |
+
endVideoTimeMs = msg.end_video_time_ms || 0;
|
| 165 |
+
console.log(`Done: ${msg.frames} frames, end_video_time: ${endVideoTimeMs}ms`);
|
| 166 |
break;
|
| 167 |
|
| 168 |
case 'error':
|
|
|
|
| 175 |
}
|
| 176 |
|
| 177 |
function addFrame(base64Frame, index) {
|
| 178 |
+
// Decodificar frame e adicionar no array indexado
|
| 179 |
const img = new Image();
|
| 180 |
img.onload = () => {
|
| 181 |
+
// Armazenar no indice correto para acesso O(1)
|
| 182 |
+
frames[index] = img;
|
| 183 |
frameCount++;
|
| 184 |
|
| 185 |
// Ajustar tamanho do canvas no primeiro frame
|
| 186 |
+
if (index === 0) {
|
| 187 |
talkCanvas.width = img.width;
|
| 188 |
talkCanvas.height = img.height;
|
| 189 |
}
|
|
|
|
| 200 |
// Preparar para receber frames, mas NAO iniciar playback
|
| 201 |
isBuffering = true;
|
| 202 |
isPlaying = false;
|
| 203 |
+
frames = []; // Reset array indexado
|
| 204 |
frameCount = 0;
|
| 205 |
totalFrames = 0;
|
| 206 |
+
lastRenderedFrame = -1;
|
| 207 |
|
| 208 |
+
// NAO mostrar canvas ainda - so quando primeiro frame estiver pronto
|
| 209 |
+
// talkCanvas.style.display = 'block';
|
| 210 |
}
|
| 211 |
|
| 212 |
+
// Duracao real do audio (usado para sincronizar frames)
|
| 213 |
+
let audioDurationMs = 0;
|
| 214 |
+
let dynamicFrameDuration = FRAME_DURATION;
|
| 215 |
+
|
| 216 |
async function startSyncedPlayback(base64Audio, durationMs) {
|
| 217 |
// Audio chegou - iniciar playback sincronizado de video + audio
|
| 218 |
setStatus('Reproduzindo...', 'ok');
|
|
|
|
| 252 |
|
| 253 |
audioSource.onended = () => {
|
| 254 |
audioSource = null;
|
| 255 |
+
// Transicao imediata quando audio termina
|
| 256 |
+
stopPlayback();
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
};
|
| 258 |
|
| 259 |
+
// Calcular quantos frames usar baseado na duracao do audio
|
| 260 |
+
// Manter 25fps fixo e usar apenas os frames necessarios
|
| 261 |
+
audioDurationMs = durationMs;
|
| 262 |
+
dynamicFrameDuration = FRAME_DURATION; // Sempre 40ms (25fps)
|
| 263 |
+
|
| 264 |
+
// Calcular quantos frames cabem na duracao do audio
|
| 265 |
+
const framesNeeded = Math.floor(durationMs / FRAME_DURATION);
|
| 266 |
+
const numFrames = totalFrames || frameCount;
|
| 267 |
+
|
| 268 |
+
// Limitar ao numero de frames disponiveis ou necessarios (o menor)
|
| 269 |
+
const framesToUse = Math.min(framesNeeded, numFrames);
|
| 270 |
+
|
| 271 |
+
console.log(`Audio: ${durationMs}ms, Frames disponiveis: ${numFrames}, Frames a usar: ${framesToUse} (${(1000/dynamicFrameDuration).toFixed(1)}fps)`);
|
| 272 |
+
|
| 273 |
+
// Atualizar totalFrames para usar apenas os necessarios
|
| 274 |
+
totalFrames = framesToUse;
|
| 275 |
+
|
| 276 |
+
// PRE-RENDERIZAR primeiro frame ANTES de mostrar canvas
|
| 277 |
+
// Isso evita o "tec" de um frame em branco
|
| 278 |
+
if (frames[0]) {
|
| 279 |
+
ctx.drawImage(frames[0], 0, 0);
|
| 280 |
+
lastRenderedFrame = 0;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
// Agora mostrar o canvas (ja com o primeiro frame renderizado)
|
| 284 |
+
talkCanvas.style.display = 'block';
|
| 285 |
+
|
| 286 |
// INICIAR TUDO SINCRONIZADO: audio + video ao mesmo tempo!
|
| 287 |
isPlaying = true;
|
| 288 |
isBuffering = false;
|
|
|
|
| 306 |
if (!isPlaying) return;
|
| 307 |
|
| 308 |
const elapsed = performance.now() - playbackStartTime;
|
| 309 |
+
// Usar duracao dinamica para sincronizar com audio
|
| 310 |
+
const targetFrame = Math.floor(elapsed / dynamicFrameDuration);
|
| 311 |
+
const total = totalFrames || frameCount;
|
|
|
|
| 312 |
|
| 313 |
+
// So renderizar se for um frame diferente do anterior
|
| 314 |
+
if (targetFrame !== lastRenderedFrame && targetFrame < total) {
|
| 315 |
+
// Acesso O(1) ao frame pelo indice
|
| 316 |
+
let frameToRender = frames[targetFrame];
|
| 317 |
+
|
| 318 |
+
// Se frame ainda nao chegou, usar o ultimo frame disponivel
|
| 319 |
+
if (!frameToRender) {
|
| 320 |
+
// Procurar frame mais proximo anterior
|
| 321 |
+
for (let i = targetFrame - 1; i >= 0; i--) {
|
| 322 |
+
if (frames[i]) {
|
| 323 |
+
frameToRender = frames[i];
|
| 324 |
+
break;
|
| 325 |
+
}
|
| 326 |
+
}
|
| 327 |
+
}
|
| 328 |
|
| 329 |
+
if (frameToRender) {
|
| 330 |
+
ctx.drawImage(frameToRender, 0, 0);
|
| 331 |
+
lastRenderedFrame = targetFrame;
|
| 332 |
+
}
|
| 333 |
}
|
| 334 |
|
| 335 |
// Atualizar progresso visual
|
|
|
|
| 336 |
if (total > 0) {
|
| 337 |
const displayedFrame = Math.min(targetFrame, total);
|
| 338 |
progress.style.width = (displayedFrame / total * 100) + '%';
|
| 339 |
}
|
| 340 |
|
| 341 |
// Continuar enquanto tiver audio ou frames
|
| 342 |
+
if (audioSource || targetFrame < total) {
|
| 343 |
animationId = requestAnimationFrame(renderLoop);
|
| 344 |
} else {
|
| 345 |
stopPlayback();
|
|
|
|
| 362 |
audioSource = null;
|
| 363 |
}
|
| 364 |
|
| 365 |
+
// Sincronizar idle video para o tempo correto (onde a fala terminou)
|
| 366 |
+
if (endVideoTimeMs > 0) {
|
| 367 |
+
const targetTime = endVideoTimeMs / 1000;
|
| 368 |
+
// Garantir que o tempo esta dentro da duracao do video
|
| 369 |
+
if (idleVideo.duration > 0) {
|
| 370 |
+
idleVideo.currentTime = targetTime % idleVideo.duration;
|
| 371 |
+
console.log(`Idle video sync: ${targetTime.toFixed(2)}s`);
|
| 372 |
+
}
|
| 373 |
+
endVideoTimeMs = 0; // Reset para proxima vez
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
// Esconder canvas, mostrar idle
|
| 377 |
talkCanvas.style.display = 'none';
|
| 378 |
ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);
|
| 379 |
|
| 380 |
+
frames = [];
|
| 381 |
+
lastRenderedFrame = -1;
|
| 382 |
setStatus('Pronto', 'ok');
|
| 383 |
setButtons(false);
|
| 384 |
}
|
|
|
|
| 401 |
setStatus('Gerando...', 'busy');
|
| 402 |
startTime = Date.now();
|
| 403 |
|
| 404 |
+
// Capturar o tempo atual do video idle para sincronizacao
|
| 405 |
+
const idleVideoTimeMs = Math.floor(idleVideo.currentTime * 1000);
|
| 406 |
+
console.log(`Idle video time: ${idleVideoTimeMs}ms`);
|
| 407 |
+
|
| 408 |
ws.send(JSON.stringify({
|
| 409 |
action: 'generate',
|
| 410 |
text: text,
|
| 411 |
+
voice: document.getElementById('voice').value,
|
| 412 |
+
idle_video_time_ms: idleVideoTimeMs // Enviar para servidor sincronizar frames
|
| 413 |
}));
|
| 414 |
};
|
| 415 |
|
interface/server_streaming.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
Interface Server - Streaming
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
from aiohttp import web
|
| 6 |
import aiohttp
|
|
@@ -9,18 +9,91 @@ import json
|
|
| 9 |
import base64
|
| 10 |
import os
|
| 11 |
import time
|
|
|
|
|
|
|
| 12 |
|
| 13 |
WAV2LIP_WS = os.getenv("WAV2LIP_WS", "ws://localhost:8082/ws")
|
| 14 |
PORT = int(os.getenv("PORT", "8000"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
routes = web.RouteTableDef()
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
@routes.get("/ws")
|
| 19 |
async def websocket_handler(request):
|
| 20 |
ws = web.WebSocketResponse()
|
| 21 |
await ws.prepare(request)
|
| 22 |
print("Cliente conectado")
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
try:
|
| 25 |
async for msg in ws:
|
| 26 |
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
@@ -30,12 +103,13 @@ async def websocket_handler(request):
|
|
| 30 |
if action == "generate":
|
| 31 |
text = data.get("text", "").strip()
|
| 32 |
voice = data.get("voice", "tara")
|
|
|
|
| 33 |
|
| 34 |
if not text:
|
| 35 |
await ws.send_json({"type": "error", "message": "Text required"})
|
| 36 |
continue
|
| 37 |
|
| 38 |
-
print(f"Gerando: {text[:50]}...")
|
| 39 |
start_time = time.time()
|
| 40 |
|
| 41 |
try:
|
|
@@ -45,15 +119,18 @@ async def websocket_handler(request):
|
|
| 45 |
timeout=aiohttp.ClientWSTimeout(ws_close=120)
|
| 46 |
)
|
| 47 |
|
| 48 |
-
# Usar action: generate que faz streaming de frames
|
| 49 |
await wav2lip_ws.send_json({
|
| 50 |
"action": "generate",
|
| 51 |
"text": text,
|
| 52 |
-
"voice": voice
|
|
|
|
| 53 |
})
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
async for w2l_msg in wav2lip_ws:
|
| 59 |
if w2l_msg.type == aiohttp.WSMsgType.TEXT:
|
|
@@ -63,45 +140,19 @@ async def websocket_handler(request):
|
|
| 63 |
if msg_type == "status":
|
| 64 |
await ws.send_json(w2l_data)
|
| 65 |
|
| 66 |
-
elif msg_type == "first_chunk":
|
| 67 |
-
latency = w2l_data.get("latency_ms", 0)
|
| 68 |
-
await ws.send_json({
|
| 69 |
-
"type": "first_chunk",
|
| 70 |
-
"latency_ms": latency
|
| 71 |
-
})
|
| 72 |
-
|
| 73 |
elif msg_type == "frame":
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
ttfb = int((time.time() - start_time) * 1000)
|
| 79 |
-
await ws.send_json({
|
| 80 |
-
"type": "stream_start",
|
| 81 |
-
"ttfb_ms": ttfb
|
| 82 |
-
})
|
| 83 |
-
|
| 84 |
-
await ws.send_json({
|
| 85 |
-
"type": "frame",
|
| 86 |
-
"frame": w2l_data.get("frame"),
|
| 87 |
-
"index": frame_count - 1
|
| 88 |
-
})
|
| 89 |
|
| 90 |
elif msg_type == "full_audio":
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
"type": "audio",
|
| 94 |
-
"audio": w2l_data.get("audio"),
|
| 95 |
-
"duration_ms": w2l_data.get("duration_ms", 0)
|
| 96 |
-
})
|
| 97 |
|
| 98 |
elif msg_type == "done":
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
"type": "done",
|
| 102 |
-
"frames": frame_count,
|
| 103 |
-
"elapsed_ms": elapsed
|
| 104 |
-
})
|
| 105 |
break
|
| 106 |
|
| 107 |
elif msg_type == "error":
|
|
@@ -113,8 +164,96 @@ async def websocket_handler(request):
|
|
| 113 |
|
| 114 |
await wav2lip_ws.close()
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
except Exception as e:
|
| 117 |
print(f"Erro: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
await ws.send_json({"type": "error", "message": str(e)})
|
| 119 |
|
| 120 |
elif action == "ping":
|
|
@@ -128,16 +267,9 @@ async def websocket_handler(request):
|
|
| 128 |
return ws
|
| 129 |
|
| 130 |
|
| 131 |
-
@routes.get("/health")
|
| 132 |
-
async def health(request):
|
| 133 |
-
return web.json_response({"status": "ok", "mode": "streaming"})
|
| 134 |
-
|
| 135 |
-
|
| 136 |
@routes.get("/")
|
| 137 |
async def index(request):
|
| 138 |
-
return web.FileResponse(
|
| 139 |
-
os.path.join(os.path.dirname(__file__), "index_streaming.html")
|
| 140 |
-
)
|
| 141 |
|
| 142 |
|
| 143 |
@routes.get("/{filename}")
|
|
@@ -154,9 +286,14 @@ app.add_routes(routes)
|
|
| 154 |
|
| 155 |
if __name__ == "__main__":
|
| 156 |
print("=" * 50)
|
| 157 |
-
print("Streaming Server
|
|
|
|
|
|
|
|
|
|
| 158 |
print("=" * 50)
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
| 161 |
print("=" * 50)
|
| 162 |
web.run_app(app, host="0.0.0.0", port=PORT)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Interface Server - Streaming com Crossfade Suave
|
| 3 |
+
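Frames are forwarded without server-side blending; the idle/speech transition is handled by the client (the crossfade helpers below are currently unused by the handler).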
|
| 4 |
"""
|
| 5 |
from aiohttp import web
|
| 6 |
import aiohttp
|
|
|
|
| 9 |
import base64
|
| 10 |
import os
|
| 11 |
import time
|
| 12 |
+
import cv2
|
| 13 |
+
import numpy as np
|
| 14 |
|
| 15 |
WAV2LIP_WS = os.getenv("WAV2LIP_WS", "ws://localhost:8082/ws")
|
| 16 |
PORT = int(os.getenv("PORT", "8000"))
|
| 17 |
+
IDLE_VIDEO = os.path.join(os.path.dirname(__file__), "idle.mp4")
|
| 18 |
+
|
| 19 |
+
# Configuracao de crossfade
|
| 20 |
+
CROSSFADE_FRAMES = 5 # Numero de frames para transicao (200ms @ 25fps)
|
| 21 |
|
| 22 |
routes = web.RouteTableDef()
|
| 23 |
|
| 24 |
+
# Cache de frames idle
|
| 25 |
+
idle_frames = []
|
| 26 |
+
idle_frame_count = 0
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def load_idle_frames():
    """Load every frame of the idle video into the module-level cache.

    Appends the decoded frames to the global ``idle_frames`` list and stores
    their number in ``idle_frame_count``. Returns immediately when the cache
    is already populated. When ``IDLE_VIDEO`` does not exist on disk a
    warning is printed and the cache is left empty.
    """
    global idle_frames, idle_frame_count

    # Already loaded: nothing to do.
    if idle_frames:
        return

    if not os.path.exists(IDLE_VIDEO):
        print(f"[AVISO] Idle video nao encontrado: {IDLE_VIDEO}")
        return

    print(f"Carregando idle frames de {IDLE_VIDEO}...")
    cap = cv2.VideoCapture(IDLE_VIDEO)

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames (or the capture could not be read).
                break
            # Frames are cached exactly as decoded (BGR per the original
            # note); JPEG conversion happens later, when they are sent.
            idle_frames.append(frame)
    finally:
        # Release the capture handle even if reading a frame raises.
        cap.release()

    idle_frame_count = len(idle_frames)
    print(f"Carregados {idle_frame_count} frames idle")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def frame_to_jpeg_base64(frame, quality=85):
    """Encode a frame as JPEG and return it as a base64 text string.

    Args:
        frame: image accepted by ``cv2.imencode`` (presumably a numpy array
            as produced by ``cv2.imdecode`` / ``VideoCapture.read``).
        quality: value passed as ``cv2.IMWRITE_JPEG_QUALITY``.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a ``str``.

    Raises:
        ValueError: if ``cv2.imencode`` reports that encoding failed.
    """
    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
    ok, buffer = cv2.imencode('.jpg', frame, encode_param)
    if not ok:
        # imencode signals failure through its first return value; do not
        # hand an unusable buffer to base64.
        raise ValueError("JPEG encoding failed")
    return base64.b64encode(buffer).decode('utf-8')
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def jpeg_base64_to_frame(b64_data):
    """Decode a base64-encoded JPEG into an image frame.

    Args:
        b64_data: base64 text (or bytes) holding the JPEG file contents.

    Returns:
        The image returned by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

    Raises:
        ValueError: if the payload cannot be decoded as an image.
    """
    jpeg_data = base64.b64decode(b64_data)
    nparr = np.frombuffer(jpeg_data, np.uint8)
    frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if frame is None:
        # imdecode returns None instead of raising on bad input; fail here
        # rather than letting a None frame reach the encoder later.
        raise ValueError("Invalid JPEG data: could not decode frame")
    return frame
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def blend_frames(frame1, frame2, alpha):
    """Return a weighted mix of two frames.

    ``alpha=0`` yields ``frame1`` and ``alpha=1`` yields ``frame2``. When the
    shapes differ, ``frame2`` is first resized to ``frame1``'s width and
    height.
    """
    overlay = frame2
    if overlay.shape != frame1.shape:
        # dsize is given as (shape[1], shape[0]), i.e. (width, height).
        target_size = (frame1.shape[1], frame1.shape[0])
        overlay = cv2.resize(overlay, target_size)

    base_weight = 1 - alpha
    return cv2.addWeighted(frame1, base_weight, overlay, alpha, 0)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def create_crossfade_frames(from_frame, to_frame, num_frames):
    """Build the intermediate frames of a fade from one frame to another.

    Produces ``num_frames`` blends whose alpha values are evenly spaced
    strictly between 0 and 1 (for 5 frames: 1/6, 2/6, 3/6, 4/6, 5/6), so
    neither endpoint frame itself is included in the result.
    """
    denominator = num_frames + 1
    return [
        blend_frames(from_frame, to_frame, step / denominator)
        for step in range(1, num_frames + 1)
    ]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
@routes.get("/ws")
|
| 89 |
async def websocket_handler(request):
|
| 90 |
ws = web.WebSocketResponse()
|
| 91 |
await ws.prepare(request)
|
| 92 |
print("Cliente conectado")
|
| 93 |
|
| 94 |
+
# Posicao atual no idle loop (para continuidade)
|
| 95 |
+
idle_position = 0
|
| 96 |
+
|
| 97 |
try:
|
| 98 |
async for msg in ws:
|
| 99 |
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
|
|
| 103 |
if action == "generate":
|
| 104 |
text = data.get("text", "").strip()
|
| 105 |
voice = data.get("voice", "tara")
|
| 106 |
+
idle_video_time_ms = data.get("idle_video_time_ms", 0)
|
| 107 |
|
| 108 |
if not text:
|
| 109 |
await ws.send_json({"type": "error", "message": "Text required"})
|
| 110 |
continue
|
| 111 |
|
| 112 |
+
print(f"Gerando: {text[:50]}... (idle_time: {idle_video_time_ms}ms)")
|
| 113 |
start_time = time.time()
|
| 114 |
|
| 115 |
try:
|
|
|
|
| 119 |
timeout=aiohttp.ClientWSTimeout(ws_close=120)
|
| 120 |
)
|
| 121 |
|
|
|
|
| 122 |
await wav2lip_ws.send_json({
|
| 123 |
"action": "generate",
|
| 124 |
"text": text,
|
| 125 |
+
"voice": voice,
|
| 126 |
+
"idle_video_time_ms": idle_video_time_ms
|
| 127 |
})
|
| 128 |
|
| 129 |
+
# Coletar todos os frames
|
| 130 |
+
speaking_frames = []
|
| 131 |
+
audio_data = None
|
| 132 |
+
audio_duration = 0
|
| 133 |
+
end_video_time_ms = 0
|
| 134 |
|
| 135 |
async for w2l_msg in wav2lip_ws:
|
| 136 |
if w2l_msg.type == aiohttp.WSMsgType.TEXT:
|
|
|
|
| 140 |
if msg_type == "status":
|
| 141 |
await ws.send_json(w2l_data)
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
elif msg_type == "frame":
|
| 144 |
+
frame_b64 = w2l_data.get("frame", "")
|
| 145 |
+
if frame_b64:
|
| 146 |
+
frame = jpeg_base64_to_frame(frame_b64)
|
| 147 |
+
speaking_frames.append(frame)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
elif msg_type == "full_audio":
|
| 150 |
+
audio_data = w2l_data.get("audio", "")
|
| 151 |
+
audio_duration = w2l_data.get("duration_ms", 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
elif msg_type == "done":
|
| 154 |
+
# Capturar end_video_time_ms para sincronizar idle
|
| 155 |
+
end_video_time_ms = w2l_data.get("end_video_time_ms", 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
break
|
| 157 |
|
| 158 |
elif msg_type == "error":
|
|
|
|
| 164 |
|
| 165 |
await wav2lip_ws.close()
|
| 166 |
|
| 167 |
+
# Enviar frames SEM crossfade - transicao e feita no cliente
|
| 168 |
+
if speaking_frames:
|
| 169 |
+
# Atualizar posicao do idle para continuidade apos fala
|
| 170 |
+
if idle_frames:
|
| 171 |
+
idle_position = (idle_position + len(speaking_frames)) % idle_frame_count
|
| 172 |
+
|
| 173 |
+
# Enviar stream_start
|
| 174 |
+
ttfb = int((time.time() - start_time) * 1000)
|
| 175 |
+
await ws.send_json({"type": "stream_start", "ttfb_ms": ttfb})
|
| 176 |
+
|
| 177 |
+
# Enviar apenas os frames de fala (sem crossfade)
|
| 178 |
+
for idx, frame in enumerate(speaking_frames):
|
| 179 |
+
frame_b64 = frame_to_jpeg_base64(frame)
|
| 180 |
+
await ws.send_json({
|
| 181 |
+
"type": "frame",
|
| 182 |
+
"frame": frame_b64,
|
| 183 |
+
"index": idx
|
| 184 |
+
})
|
| 185 |
+
|
| 186 |
+
# Enviar audio
|
| 187 |
+
if audio_data:
|
| 188 |
+
await ws.send_json({
|
| 189 |
+
"type": "audio",
|
| 190 |
+
"audio": audio_data,
|
| 191 |
+
"duration_ms": audio_duration
|
| 192 |
+
})
|
| 193 |
+
|
| 194 |
+
# Enviar done com end_video_time_ms para sincronizar idle
|
| 195 |
+
elapsed = int((time.time() - start_time) * 1000)
|
| 196 |
+
await ws.send_json({
|
| 197 |
+
"type": "done",
|
| 198 |
+
"frames": len(speaking_frames),
|
| 199 |
+
"elapsed_ms": elapsed,
|
| 200 |
+
"end_video_time_ms": end_video_time_ms
|
| 201 |
+
})
|
| 202 |
+
|
| 203 |
+
print(f"Enviados {len(speaking_frames)} frames de fala (sem crossfade)")
|
| 204 |
+
|
| 205 |
except Exception as e:
|
| 206 |
print(f"Erro: {e}")
|
| 207 |
+
import traceback
|
| 208 |
+
traceback.print_exc()
|
| 209 |
+
await ws.send_json({"type": "error", "message": str(e)})
|
| 210 |
+
|
| 211 |
+
elif action == "generate_complete":
|
| 212 |
+
# Proxy para generate_complete do Wav2Lip
|
| 213 |
+
text = data.get("text", "").strip()
|
| 214 |
+
voice = data.get("voice", "tara")
|
| 215 |
+
idle_before_frames = data.get("idle_before_frames", 0)
|
| 216 |
+
idle_after_frames = data.get("idle_after_frames", 0)
|
| 217 |
+
crossfade_frames = data.get("crossfade_frames", 0)
|
| 218 |
+
jpeg_quality = data.get("jpeg_quality", 95)
|
| 219 |
+
|
| 220 |
+
if not text:
|
| 221 |
+
await ws.send_json({"type": "error", "message": "Text required"})
|
| 222 |
+
continue
|
| 223 |
+
|
| 224 |
+
print(f"Generate Complete: {text[:50]}...")
|
| 225 |
+
|
| 226 |
+
try:
|
| 227 |
+
async with aiohttp.ClientSession() as session:
|
| 228 |
+
wav2lip_ws = await session.ws_connect(
|
| 229 |
+
WAV2LIP_WS,
|
| 230 |
+
timeout=aiohttp.ClientWSTimeout(ws_close=120)
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
await wav2lip_ws.send_json({
|
| 234 |
+
"action": "generate_complete",
|
| 235 |
+
"text": text,
|
| 236 |
+
"voice": voice,
|
| 237 |
+
"idle_before_frames": idle_before_frames,
|
| 238 |
+
"idle_after_frames": idle_after_frames,
|
| 239 |
+
"crossfade_frames": crossfade_frames,
|
| 240 |
+
"jpeg_quality": jpeg_quality
|
| 241 |
+
})
|
| 242 |
+
|
| 243 |
+
# Repassar todas as mensagens
|
| 244 |
+
async for w2l_msg in wav2lip_ws:
|
| 245 |
+
if w2l_msg.type == aiohttp.WSMsgType.TEXT:
|
| 246 |
+
await ws.send_str(w2l_msg.data)
|
| 247 |
+
w2l_data = json.loads(w2l_msg.data)
|
| 248 |
+
if w2l_data.get("type") in ("done", "error"):
|
| 249 |
+
break
|
| 250 |
+
elif w2l_msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
|
| 251 |
+
break
|
| 252 |
+
|
| 253 |
+
await wav2lip_ws.close()
|
| 254 |
+
|
| 255 |
+
except Exception as e:
|
| 256 |
+
print(f"Erro generate_complete: {e}")
|
| 257 |
await ws.send_json({"type": "error", "message": str(e)})
|
| 258 |
|
| 259 |
elif action == "ping":
|
|
|
|
| 267 |
return ws
|
| 268 |
|
| 269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
@routes.get("/")
|
| 271 |
async def index(request):
|
| 272 |
+
return web.FileResponse(os.path.join(os.path.dirname(__file__), "index_streaming.html"))
|
|
|
|
|
|
|
| 273 |
|
| 274 |
|
| 275 |
@routes.get("/{filename}")
|
|
|
|
| 286 |
|
| 287 |
if __name__ == "__main__":
|
| 288 |
print("=" * 50)
|
| 289 |
+
print("Streaming Server com Crossfade - Porta", PORT)
|
| 290 |
+
print("Wav2Lip:", WAV2LIP_WS)
|
| 291 |
+
print("Idle Video:", IDLE_VIDEO)
|
| 292 |
+
print("Crossfade: DESABILITADO (transicao no cliente)")
|
| 293 |
print("=" * 50)
|
| 294 |
+
|
| 295 |
+
# Carregar idle frames
|
| 296 |
+
load_idle_frames()
|
| 297 |
+
|
| 298 |
print("=" * 50)
|
| 299 |
web.run_app(app, host="0.0.0.0", port=PORT)
|