Improve idle video sync and smooth transitions

- Add idle_video_time_ms tracking for seamless transitions
- Sync idle video position when speech ends (end_video_time_ms)
- Pre-render first frame before showing canvas (eliminates flash)
- Use indexed array for O(1) frame access
- Add generate_complete action proxy for Wav2Lip
- Remove server-side crossfade (transition handled by client)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (2)

interface/index_streaming.html +93 -28
interface/server_streaming.py +189 -52

interface/index_streaming.html CHANGED Viewed

@@ -66,9 +66,9 @@ const btnGo = document.getElementById('btnGo');
 const btnStop = document.getElementById('btnStop');
 let ws = null;
-let frameQueue = [];
 let isPlaying = false;
-let isBuffering = false;  // NOVO: acumulando frames antes de tocar
 let audioContext = null;
 let audioBuffer = null;
 let audioSource = null;
@@ -77,6 +77,8 @@ let frameCount = 0;
 let totalFrames = 0;
 let playbackStartTime = 0;
 let animationId = null;
 // Configuracao
 const TARGET_FPS = 25;
@@ -149,14 +151,18 @@ function connect() {
             case 'audio':
                 // MUDANCA: Audio chegou - AGORA iniciar playback sincronizado!
                 setMetric('mAudio', (msg.duration_ms / 1000).toFixed(2) + 's');
                 startSyncedPlayback(msg.audio, msg.duration_ms);
                 break;
             case 'done':
                 totalFrames = msg.frames;
                 setMetric('mFrames', msg.frames);
-                const elapsed = msg.elapsed_ms / 1000;
-                setMetric('mFps', (msg.frames / elapsed).toFixed(1));
                 break;
             case 'error':
@@ -169,14 +175,15 @@ function connect() {
 }
 function addFrame(base64Frame, index) {
-    // Decodificar frame e adicionar na fila
     const img = new Image();
     img.onload = () => {
-        frameQueue.push({ img, index, loaded: true });
         frameCount++;
         // Ajustar tamanho do canvas no primeiro frame
-        if (frameCount === 1) {
             talkCanvas.width = img.width;
             talkCanvas.height = img.height;
         }
@@ -193,14 +200,19 @@ function startBuffering() {
     // Preparar para receber frames, mas NAO iniciar playback
     isBuffering = true;
     isPlaying = false;
-    frameQueue = [];
     frameCount = 0;
     totalFrames = 0;
-    // Mostrar canvas (mas ainda sem renderizar)
-    talkCanvas.style.display = 'block';
 }
 async function startSyncedPlayback(base64Audio, durationMs) {
     // Audio chegou - iniciar playback sincronizado de video + audio
     setStatus('Reproduzindo...', 'ok');
@@ -240,14 +252,37 @@ async function startSyncedPlayback(base64Audio, durationMs) {
         audioSource.onended = () => {
             audioSource = null;
-            // Dar tempo para ultimo frame antes de parar
-            setTimeout(() => {
-                if (!audioSource) {
-                    stopPlayback();
-                }
-            }, 200);
         };
         // INICIAR TUDO SINCRONIZADO: audio + video ao mesmo tempo!
         isPlaying = true;
         isBuffering = false;
@@ -271,27 +306,40 @@ function renderLoop() {
     if (!isPlaying) return;
     const elapsed = performance.now() - playbackStartTime;
-    const targetFrame = Math.floor(elapsed / FRAME_DURATION);
-    // Encontrar e renderizar o frame correspondente ao tempo
-    const frame = frameQueue.find(f => f.index === targetFrame);
-    if (frame && frame.loaded) {
-        ctx.drawImage(frame.img, 0, 0);
-        // Limpar frames antigos para liberar memoria
-        frameQueue = frameQueue.filter(f => f.index >= targetFrame - 1);
     }
     // Atualizar progresso visual
-    const total = totalFrames || frameCount;
     if (total > 0) {
         const displayedFrame = Math.min(targetFrame, total);
         progress.style.width = (displayedFrame / total * 100) + '%';
     }
     // Continuar enquanto tiver audio ou frames
-    if (audioSource || targetFrame < (totalFrames || frameCount)) {
         animationId = requestAnimationFrame(renderLoop);
     } else {
         stopPlayback();
@@ -314,11 +362,23 @@ function stopPlayback() {
         audioSource = null;
     }
     // Esconder canvas, mostrar idle
     talkCanvas.style.display = 'none';
     ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);
-    frameQueue = [];
     setStatus('Pronto', 'ok');
     setButtons(false);
 }
@@ -341,10 +401,15 @@ btnGo.onclick = () => {
     setStatus('Gerando...', 'busy');
     startTime = Date.now();
     ws.send(JSON.stringify({
         action: 'generate',
         text: text,
-        voice: document.getElementById('voice').value
     }));
 };

 const btnStop = document.getElementById('btnStop');
 let ws = null;
+let frames = [];  // Array indexado para acesso O(1)
 let isPlaying = false;
+let isBuffering = false;
 let audioContext = null;
 let audioBuffer = null;
 let audioSource = null;
 let totalFrames = 0;
 let playbackStartTime = 0;
 let animationId = null;
+let lastRenderedFrame = -1;  // Para evitar re-renderizar mesmo frame
+let endVideoTimeMs = 0;  // Tempo do idle video onde a fala termina
 // Configuracao
 const TARGET_FPS = 25;
             case 'audio':
                 // MUDANCA: Audio chegou - AGORA iniciar playback sincronizado!
                 setMetric('mAudio', (msg.duration_ms / 1000).toFixed(2) + 's');
+                // Calcular FPS real baseado na duracao do audio
+                const realFps = (totalFrames || frameCount) / (msg.duration_ms / 1000);
+                setMetric('mFps', realFps.toFixed(1));
                 startSyncedPlayback(msg.audio, msg.duration_ms);
                 break;
             case 'done':
                 totalFrames = msg.frames;
                 setMetric('mFrames', msg.frames);
+                // Salvar end_video_time_ms para sincronizar idle quando fala terminar
+                endVideoTimeMs = msg.end_video_time_ms || 0;
+                console.log(`Done: ${msg.frames} frames, end_video_time: ${endVideoTimeMs}ms`);
                 break;
             case 'error':
 }
 function addFrame(base64Frame, index) {
+    // Decodificar frame e adicionar no array indexado
     const img = new Image();
     img.onload = () => {
+        // Armazenar no indice correto para acesso O(1)
+        frames[index] = img;
         frameCount++;
         // Ajustar tamanho do canvas no primeiro frame
+        if (index === 0) {
             talkCanvas.width = img.width;
             talkCanvas.height = img.height;
         }
     // Preparar para receber frames, mas NAO iniciar playback
     isBuffering = true;
     isPlaying = false;
+    frames = [];  // Reset array indexado
     frameCount = 0;
     totalFrames = 0;
+    lastRenderedFrame = -1;
+    // NAO mostrar canvas ainda - so quando primeiro frame estiver pronto
+    // talkCanvas.style.display = 'block';
 }
+// Real audio duration (used to synchronize frames)
+let audioDurationMs = 0;
+let dynamicFrameDuration = FRAME_DURATION;
 async function startSyncedPlayback(base64Audio, durationMs) {
     // Audio chegou - iniciar playback sincronizado de video + audio
     setStatus('Reproduzindo...', 'ok');
         audioSource.onended = () => {
             audioSource = null;
+            // Transicao imediata quando audio termina
+            stopPlayback();
         };
+        // Calcular quantos frames usar baseado na duracao do audio
+        // Manter 25fps fixo e usar apenas os frames necessarios
+        audioDurationMs = durationMs;
+        dynamicFrameDuration = FRAME_DURATION; // Sempre 40ms (25fps)
+        // Calcular quantos frames cabem na duracao do audio
+        const framesNeeded = Math.floor(durationMs / FRAME_DURATION);
+        const numFrames = totalFrames || frameCount;
+        // Limitar ao numero de frames disponiveis ou necessarios (o menor)
+        const framesToUse = Math.min(framesNeeded, numFrames);
+        console.log(`Audio: ${durationMs}ms, Frames disponiveis: ${numFrames}, Frames a usar: ${framesToUse} (${(1000/dynamicFrameDuration).toFixed(1)}fps)`);
+        // Atualizar totalFrames para usar apenas os necessarios
+        totalFrames = framesToUse;
+        // PRE-RENDERIZAR primeiro frame ANTES de mostrar canvas
+        // Isso evita o "tec" de um frame em branco
+        if (frames[0]) {
+            ctx.drawImage(frames[0], 0, 0);
+            lastRenderedFrame = 0;
+        }
+        // Agora mostrar o canvas (ja com o primeiro frame renderizado)
+        talkCanvas.style.display = 'block';
         // INICIAR TUDO SINCRONIZADO: audio + video ao mesmo tempo!
         isPlaying = true;
         isBuffering = false;
     if (!isPlaying) return;
     const elapsed = performance.now() - playbackStartTime;
+    // Usar duracao dinamica para sincronizar com audio
+    const targetFrame = Math.floor(elapsed / dynamicFrameDuration);
+    const total = totalFrames || frameCount;
+    // So renderizar se for um frame diferente do anterior
+    if (targetFrame !== lastRenderedFrame && targetFrame < total) {
+        // Acesso O(1) ao frame pelo indice
+        let frameToRender = frames[targetFrame];
+        // Se frame ainda nao chegou, usar o ultimo frame disponivel
+        if (!frameToRender) {
+            // Procurar frame mais proximo anterior
+            for (let i = targetFrame - 1; i >= 0; i--) {
+                if (frames[i]) {
+                    frameToRender = frames[i];
+                    break;
+                }
+            }
+        }
+        if (frameToRender) {
+            ctx.drawImage(frameToRender, 0, 0);
+            lastRenderedFrame = targetFrame;
+        }
     }
     // Atualizar progresso visual
     if (total > 0) {
         const displayedFrame = Math.min(targetFrame, total);
         progress.style.width = (displayedFrame / total * 100) + '%';
     }
     // Continuar enquanto tiver audio ou frames
+    if (audioSource || targetFrame < total) {
         animationId = requestAnimationFrame(renderLoop);
     } else {
         stopPlayback();
         audioSource = null;
     }
+    // Sincronizar idle video para o tempo correto (onde a fala terminou)
+    if (endVideoTimeMs > 0) {
+        const targetTime = endVideoTimeMs / 1000;
+        // Garantir que o tempo esta dentro da duracao do video
+        if (idleVideo.duration > 0) {
+            idleVideo.currentTime = targetTime % idleVideo.duration;
+            console.log(`Idle video sync: ${targetTime.toFixed(2)}s`);
+        }
+        endVideoTimeMs = 0;  // Reset para proxima vez
+    }
     // Esconder canvas, mostrar idle
     talkCanvas.style.display = 'none';
     ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);
+    frames = [];
+    lastRenderedFrame = -1;
     setStatus('Pronto', 'ok');
     setButtons(false);
 }
     setStatus('Gerando...', 'busy');
     startTime = Date.now();
+    // Capturar o tempo atual do video idle para sincronizacao
+    const idleVideoTimeMs = Math.floor(idleVideo.currentTime * 1000);
+    console.log(`Idle video time: ${idleVideoTimeMs}ms`);
     ws.send(JSON.stringify({
         action: 'generate',
         text: text,
+        voice: document.getElementById('voice').value,
+        idle_video_time_ms: idleVideoTimeMs  // Enviar para servidor sincronizar frames
     }));
 };

interface/server_streaming.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
-Interface Server - Streaming Progressivo
-Envia frames JPEG + audio conforme recebe do Wav2Lip
 """
 from aiohttp import web
 import aiohttp
@@ -9,18 +9,91 @@ import json
 import base64
 import os
 import time
 WAV2LIP_WS = os.getenv("WAV2LIP_WS", "ws://localhost:8082/ws")
 PORT = int(os.getenv("PORT", "8000"))
 routes = web.RouteTableDef()
 @routes.get("/ws")
 async def websocket_handler(request):
     ws = web.WebSocketResponse()
     await ws.prepare(request)
     print("Cliente conectado")
     try:
         async for msg in ws:
             if msg.type == aiohttp.WSMsgType.TEXT:
@@ -30,12 +103,13 @@ async def websocket_handler(request):
                 if action == "generate":
                     text = data.get("text", "").strip()
                     voice = data.get("voice", "tara")
                     if not text:
                         await ws.send_json({"type": "error", "message": "Text required"})
                         continue
-                    print(f"Gerando: {text[:50]}...")
                     start_time = time.time()
                     try:
@@ -45,15 +119,18 @@ async def websocket_handler(request):
                                 timeout=aiohttp.ClientWSTimeout(ws_close=120)
                             )
-                            # Usar action: generate que faz streaming de frames
                             await wav2lip_ws.send_json({
                                 "action": "generate",
                                 "text": text,
-                                "voice": voice
                             })
-                            frame_count = 0
-                            first_frame_sent = False
                             async for w2l_msg in wav2lip_ws:
                                 if w2l_msg.type == aiohttp.WSMsgType.TEXT:
@@ -63,45 +140,19 @@ async def websocket_handler(request):
                                     if msg_type == "status":
                                         await ws.send_json(w2l_data)
-                                    elif msg_type == "first_chunk":
-                                        latency = w2l_data.get("latency_ms", 0)
-                                        await ws.send_json({
-                                            "type": "first_chunk",
-                                            "latency_ms": latency
-                                        })
                                     elif msg_type == "frame":
-                                        # Repassar frame diretamente
-                                        frame_count += 1
-                                        if not first_frame_sent:
-                                            first_frame_sent = True
-                                            ttfb = int((time.time() - start_time) * 1000)
-                                            await ws.send_json({
-                                                "type": "stream_start",
-                                                "ttfb_ms": ttfb
-                                            })
-                                        await ws.send_json({
-                                            "type": "frame",
-                                            "frame": w2l_data.get("frame"),
-                                            "index": frame_count - 1
-                                        })
                                     elif msg_type == "full_audio":
-                                        # Enviar audio completo
-                                        await ws.send_json({
-                                            "type": "audio",
-                                            "audio": w2l_data.get("audio"),
-                                            "duration_ms": w2l_data.get("duration_ms", 0)
-                                        })
                                     elif msg_type == "done":
-                                        elapsed = int((time.time() - start_time) * 1000)
-                                        await ws.send_json({
-                                            "type": "done",
-                                            "frames": frame_count,
-                                            "elapsed_ms": elapsed
-                                        })
                                         break
                                     elif msg_type == "error":
@@ -113,8 +164,96 @@ async def websocket_handler(request):
                             await wav2lip_ws.close()
                     except Exception as e:
                         print(f"Erro: {e}")
                         await ws.send_json({"type": "error", "message": str(e)})
                 elif action == "ping":
@@ -128,16 +267,9 @@ async def websocket_handler(request):
     return ws
-@routes.get("/health")
-async def health(request):
-    return web.json_response({"status": "ok", "mode": "streaming"})
 @routes.get("/")
 async def index(request):
-    return web.FileResponse(
-        os.path.join(os.path.dirname(__file__), "index_streaming.html")
-    )
 @routes.get("/{filename}")
@@ -154,9 +286,14 @@ app.add_routes(routes)
 if __name__ == "__main__":
     print("=" * 50)
-    print("Streaming Server - Progressive Frame Delivery")
     print("=" * 50)
-    print(f"Porta: {PORT}")
-    print(f"Wav2Lip: {WAV2LIP_WS}")
     print("=" * 50)
     web.run_app(app, host="0.0.0.0", port=PORT)

 """
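+Interface Server - Streaming (server-side crossfade disabled)
+Collects the speaking frames received from Wav2Lip and forwards them to the
+client without blending; the idle/speech transition is handled by the client.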
 """
 from aiohttp import web
 import aiohttp
 import base64
 import os
 import time
+import cv2
+import numpy as np
 WAV2LIP_WS = os.getenv("WAV2LIP_WS", "ws://localhost:8082/ws")
 PORT = int(os.getenv("PORT", "8000"))
+# Idle loop video, looked up in the same directory as this file
+IDLE_VIDEO = os.path.join(os.path.dirname(__file__), "idle.mp4")
+# Crossfade configuration
+CROSSFADE_FRAMES = 5  # Number of transition frames (200ms @ 25fps)
 routes = web.RouteTableDef()
+# Idle frame cache: decoded frames and their count, filled by load_idle_frames()
+idle_frames = []
+idle_frame_count = 0
+def load_idle_frames():
+    """Carrega frames do idle.mp4"""
+    global idle_frames, idle_frame_count
+    if idle_frames:
+        return
+    if not os.path.exists(IDLE_VIDEO):
+        print(f"[AVISO] Idle video nao encontrado: {IDLE_VIDEO}")
+        return
+    print(f"Carregando idle frames de {IDLE_VIDEO}...")
+    cap = cv2.VideoCapture(IDLE_VIDEO)
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        # Manter em BGR para processamento, converter para JPEG depois
+        idle_frames.append(frame)
+    cap.release()
+    idle_frame_count = len(idle_frames)
+    print(f"Carregados {idle_frame_count} frames idle")
+def frame_to_jpeg_base64(frame, quality=85):
+    """Converte frame numpy para JPEG base64"""
+    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+    _, buffer = cv2.imencode('.jpg', frame, encode_param)
+    return base64.b64encode(buffer).decode('utf-8')
+def jpeg_base64_to_frame(b64_data):
+    """Converte JPEG base64 para frame numpy"""
+    jpeg_data = base64.b64decode(b64_data)
+    nparr = np.frombuffer(jpeg_data, np.uint8)
+    return cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+def blend_frames(frame1, frame2, alpha):
+    """Blend entre dois frames. alpha=0 -> frame1, alpha=1 -> frame2"""
+    # Garantir que ambos frames tem o mesmo tamanho
+    if frame1.shape != frame2.shape:
+        frame2 = cv2.resize(frame2, (frame1.shape[1], frame1.shape[0]))
+    return cv2.addWeighted(frame1, 1 - alpha, frame2, alpha, 0)
+def create_crossfade_frames(from_frame, to_frame, num_frames):
+    """Cria frames de transicao suave entre dois frames"""
+    frames = []
+    for i in range(num_frames):
+        alpha = (i + 1) / (num_frames + 1)  # 0.16, 0.33, 0.5, 0.66, 0.83 para 5 frames
+        blended = blend_frames(from_frame, to_frame, alpha)
+        frames.append(blended)
+    return frames
 @routes.get("/ws")
 async def websocket_handler(request):
     ws = web.WebSocketResponse()
     await ws.prepare(request)
     print("Cliente conectado")
+    # Posicao atual no idle loop (para continuidade)
+    idle_position = 0
     try:
         async for msg in ws:
             if msg.type == aiohttp.WSMsgType.TEXT:
                 if action == "generate":
                     text = data.get("text", "").strip()
                     voice = data.get("voice", "tara")
+                    idle_video_time_ms = data.get("idle_video_time_ms", 0)
                     if not text:
                         await ws.send_json({"type": "error", "message": "Text required"})
                         continue
+                    print(f"Gerando: {text[:50]}... (idle_time: {idle_video_time_ms}ms)")
                     start_time = time.time()
                     try:
                                 timeout=aiohttp.ClientWSTimeout(ws_close=120)
                             )
                             await wav2lip_ws.send_json({
                                 "action": "generate",
                                 "text": text,
+                                "voice": voice,
+                                "idle_video_time_ms": idle_video_time_ms
                             })
+                            # Coletar todos os frames
+                            speaking_frames = []
+                            audio_data = None
+                            audio_duration = 0
+                            end_video_time_ms = 0
                             async for w2l_msg in wav2lip_ws:
                                 if w2l_msg.type == aiohttp.WSMsgType.TEXT:
                                     if msg_type == "status":
                                         await ws.send_json(w2l_data)
                                     elif msg_type == "frame":
+                                        frame_b64 = w2l_data.get("frame", "")
+                                        if frame_b64:
+                                            frame = jpeg_base64_to_frame(frame_b64)
+                                            speaking_frames.append(frame)
                                     elif msg_type == "full_audio":
+                                        audio_data = w2l_data.get("audio", "")
+                                        audio_duration = w2l_data.get("duration_ms", 0)
                                     elif msg_type == "done":
+                                        # Capturar end_video_time_ms para sincronizar idle
+                                        end_video_time_ms = w2l_data.get("end_video_time_ms", 0)
                                         break
                                     elif msg_type == "error":
                             await wav2lip_ws.close()
+                            # Enviar frames SEM crossfade - transicao e feita no cliente
+                            if speaking_frames:
+                                # Atualizar posicao do idle para continuidade apos fala
+                                if idle_frames:
+                                    idle_position = (idle_position + len(speaking_frames)) % idle_frame_count
+                                # Enviar stream_start
+                                ttfb = int((time.time() - start_time) * 1000)
+                                await ws.send_json({"type": "stream_start", "ttfb_ms": ttfb})
+                                # Enviar apenas os frames de fala (sem crossfade)
+                                for idx, frame in enumerate(speaking_frames):
+                                    frame_b64 = frame_to_jpeg_base64(frame)
+                                    await ws.send_json({
+                                        "type": "frame",
+                                        "frame": frame_b64,
+                                        "index": idx
+                                    })
+                                # Enviar audio
+                                if audio_data:
+                                    await ws.send_json({
+                                        "type": "audio",
+                                        "audio": audio_data,
+                                        "duration_ms": audio_duration
+                                    })
+                                # Enviar done com end_video_time_ms para sincronizar idle
+                                elapsed = int((time.time() - start_time) * 1000)
+                                await ws.send_json({
+                                    "type": "done",
+                                    "frames": len(speaking_frames),
+                                    "elapsed_ms": elapsed,
+                                    "end_video_time_ms": end_video_time_ms
+                                })
+                                print(f"Enviados {len(speaking_frames)} frames de fala (sem crossfade)")
                     except Exception as e:
                         print(f"Erro: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        await ws.send_json({"type": "error", "message": str(e)})
+                elif action == "generate_complete":
+                    # Proxy para generate_complete do Wav2Lip
+                    text = data.get("text", "").strip()
+                    voice = data.get("voice", "tara")
+                    idle_before_frames = data.get("idle_before_frames", 0)
+                    idle_after_frames = data.get("idle_after_frames", 0)
+                    crossfade_frames = data.get("crossfade_frames", 0)
+                    jpeg_quality = data.get("jpeg_quality", 95)
+                    if not text:
+                        await ws.send_json({"type": "error", "message": "Text required"})
+                        continue
+                    print(f"Generate Complete: {text[:50]}...")
+                    try:
+                        async with aiohttp.ClientSession() as session:
+                            wav2lip_ws = await session.ws_connect(
+                                WAV2LIP_WS,
+                                timeout=aiohttp.ClientWSTimeout(ws_close=120)
+                            )
+                            await wav2lip_ws.send_json({
+                                "action": "generate_complete",
+                                "text": text,
+                                "voice": voice,
+                                "idle_before_frames": idle_before_frames,
+                                "idle_after_frames": idle_after_frames,
+                                "crossfade_frames": crossfade_frames,
+                                "jpeg_quality": jpeg_quality
+                            })
+                            # Repassar todas as mensagens
+                            async for w2l_msg in wav2lip_ws:
+                                if w2l_msg.type == aiohttp.WSMsgType.TEXT:
+                                    await ws.send_str(w2l_msg.data)
+                                    w2l_data = json.loads(w2l_msg.data)
+                                    if w2l_data.get("type") in ("done", "error"):
+                                        break
+                                elif w2l_msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
+                                    break
+                            await wav2lip_ws.close()
+                    except Exception as e:
+                        print(f"Erro generate_complete: {e}")
                         await ws.send_json({"type": "error", "message": str(e)})
                 elif action == "ping":
     return ws
 @routes.get("/")
 async def index(request):
+    """Serve index_streaming.html from this file's directory as the root page."""
+    return web.FileResponse(os.path.join(os.path.dirname(__file__), "index_streaming.html"))
 @routes.get("/{filename}")
 if __name__ == "__main__":
+    # Startup banner showing the effective configuration
     print("=" * 50)
+    print("Streaming Server com Crossfade - Porta", PORT)
+    print("Wav2Lip:", WAV2LIP_WS)
+    print("Idle Video:", IDLE_VIDEO)
+    print("Crossfade: DESABILITADO (transicao no cliente)")
     print("=" * 50)
+    # Load idle frames into the module-level cache before serving
+    load_idle_frames()
     print("=" * 50)
     web.run_app(app, host="0.0.0.0", port=PORT)