Add WebRTC streaming interface with vast.ai deployment

- Update WebRTC interface with race condition fix (isConnecting flag)
- Add automated test scripts (aiortc and Playwright)
- Add deployment scripts for vast.ai and SkyPilot
- Improve idle video sync and transitions
- Fix audio-video buffering for better sync

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (10)

interface/deploy_to_server.sh +30 -0
interface/index.html +354 -401
interface/index_optimized.html +534 -136
interface/index_streaming.html +30 -20
interface/server.py +496 -329
interface/server_optimized.py +24 -10
interface/server_streaming.py +473 -9
interface/test_webrtc_client.py +281 -0
interface/test_webrtc_playwright.py +218 -0
interface/webrtc_skypilot.yaml +19 -0

interface/deploy_to_server.sh ADDED Viewed

	@@ -0,0 +1,30 @@

+#!/bin/bash
+# Copy the WebRTC interface files (server.py, index.html) to the remote
+# server with scp, then print the manual restart steps.
+#
+# Abort on the first failed command, unset variable or failed pipeline stage,
+# so a failed copy is never followed by the success banner.
+set -euo pipefail
+echo "==================================================="
+echo "Deploy WebRTC para VPS"
+echo "==================================================="
+# Target host, SSH port and destination directory on the server.
+SERVER="root@62.107.25.198"
+PORT="47824"
+REMOTE_DIR="/workspace/interface"
+echo ""
+echo "Copiando arquivos para o servidor..."
+echo ""
+# Copy the main files. Paths are relative, so run this from the interface/
+# directory. Expansions are quoted to avoid word splitting and globbing.
+scp -P "$PORT" \
+    server.py \
+    index.html \
+    "$SERVER:$REMOTE_DIR/"
+echo ""
+echo "==================================================="
+echo "Deploy concluído!"
+echo "==================================================="
+echo ""
+# The restart is not automated; only the commands to run are printed.
+echo "Para reiniciar o servidor:"
+echo "ssh -p $PORT $SERVER"
+echo "cd $REMOTE_DIR"
+echo "pkill -f server.py"
+echo "python3 server.py"
+echo ""

interface/index.html CHANGED Viewed

@@ -3,458 +3,411 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Avatar Interface - WebM Streaming</title>
     <style>
         * { margin: 0; padding: 0; box-sizing: border-box; }
         body {
-            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
             background: linear-gradient(135deg, #0a0a1a 0%, #1a1a3a 100%);
             color: #fff;
             min-height: 100vh;
             padding: 20px;
         }
-        .container { max-width: 1200px; margin: 0 auto; }
-        h1 { text-align: center; margin-bottom: 20px; color: #00d4ff; }
-        .status-bar {
-            display: flex; gap: 20px; justify-content: center; margin-bottom: 20px;
-            flex-wrap: wrap;
-        }
-        .status-item {
-            padding: 10px 20px; border-radius: 20px;
-            background: rgba(255,255,255,0.1);
             font-size: 14px;
         }
-        .status-item.online { background: rgba(0,255,100,0.2); color: #0f0; }
-        .status-item.offline { background: rgba(255,0,0,0.2); color: #f00; }
-        .status-item.connecting { background: rgba(255,200,0,0.2); color: #fc0; }
-        .main-content { display: flex; gap: 20px; flex-wrap: wrap; }
-        .video-section {
-            flex: 1; min-width: 400px;
-            background: rgba(0,0,0,0.3); border-radius: 15px; padding: 20px;
-        }
-        .video-container {
-            width: 100%; aspect-ratio: 16/9;
-            background: #000; border-radius: 10px; overflow: hidden;
             position: relative;
         }
-        #video-player {
-            width: 100%; height: 100%;
             object-fit: contain;
-            background: #000;
         }
-        .video-overlay {
-            position: absolute; top: 0; left: 0; right: 0; bottom: 0;
-            display: flex; align-items: center; justify-content: center;
-            background: rgba(0,0,0,0.7);
-            font-size: 18px; color: #aaa;
-            pointer-events: none;
         }
-        .video-overlay.hidden { display: none; }
-        .control-section {
-            flex: 1; min-width: 300px;
-            background: rgba(0,0,0,0.3); border-radius: 15px; padding: 20px;
         }
-        .input-group { margin-bottom: 15px; }
-        label { display: block; margin-bottom: 5px; color: #aaa; }
-        textarea {
-            width: 100%; height: 100px; padding: 10px;
-            border: 1px solid #333; border-radius: 8px;
-            background: rgba(255,255,255,0.05); color: #fff;
-            resize: vertical; font-size: 14px;
         }
-        select, button {
-            width: 100%; padding: 12px; margin-top: 10px;
-            border: none; border-radius: 8px; cursor: pointer;
-        }
-        select { background: rgba(255,255,255,0.1); color: #fff; }
-        .btn-primary {
-            background: linear-gradient(135deg, #00d4ff, #0066ff);
-            color: #fff; font-weight: bold; font-size: 16px;
         }
-        .btn-danger {
-            background: linear-gradient(135deg, #ff4444, #cc0000);
-            color: #fff; font-weight: bold; font-size: 16px;
         }
         button:hover { opacity: 0.9; }
         button:disabled { opacity: 0.5; cursor: not-allowed; }
         .metrics {
-            margin-top: 20px; padding: 15px;
-            background: rgba(255,255,255,0.05); border-radius: 8px;
-        }
-        .metrics h4 { margin-bottom: 10px; color: #00d4ff; }
-        .metric-row { display: flex; justify-content: space-between; padding: 5px 0; }
-        .metric-value { color: #00d4ff; font-weight: bold; }
-        .log {
-            margin-top: 20px; padding: 10px;
-            background: #000; border-radius: 8px;
-            font-family: monospace; font-size: 12px;
-            max-height: 150px; overflow-y: auto;
         }
-        .log-entry { padding: 2px 0; border-bottom: 1px solid #222; }
-        .log-time { color: #666; }
-        .log-msg { color: #0f0; }
-        .log-error { color: #f00; }
-        .log-status { color: #fc0; }
     </style>
 </head>
 <body>
     <div class="container">
-        <h1>Avatar Interface - WebM Streaming</h1>
-        <div class="status-bar">
-            <div class="status-item" id="ws-status">WebSocket: Desconectado</div>
-            <div class="status-item" id="wav2lip-status">Wav2Lip: --</div>
-            <div class="status-item" id="tts-status">TTS: --</div>
         </div>
-        <div class="main-content">
-            <div class="video-section">
-                <h3>Video Stream</h3>
-                <div class="video-container">
-                    <video id="video-player" autoplay></video>
-                    <div class="video-overlay" id="video-overlay">
-                        Aguardando video...
-                    </div>
-                </div>
-            </div>
-            <div class="control-section">
-                <h3>Controles</h3>
-                <div class="input-group">
-                    <label>Texto para falar:</label>
-                    <textarea id="text-input" placeholder="Digite o texto aqui...">Hello! I am a real-time streaming avatar powered by AI.</textarea>
-                </div>
-                <div class="input-group">
-                    <label>Voz:</label>
-                    <select id="voice-select">
-                        <option value="tara">Tara (Female)</option>
-                        <option value="leah">Leah (Female)</option>
-                        <option value="jess">Jess (Female)</option>
-                        <option value="leo">Leo (Male)</option>
-                        <option value="dan">Dan (Male)</option>
-                    </select>
-                </div>
-                <button id="generate-btn" class="btn-primary" onclick="generate()">
-                    Gerar Avatar
-                </button>
-                <button id="stop-btn" class="btn-danger" onclick="stop()" disabled>
-                    Parar
-                </button>
-                <div class="metrics">
-                    <h4>Metricas</h4>
-                    <div class="metric-row">
-                        <span>Latencia:</span>
-                        <span class="metric-value" id="latency">--</span>
-                    </div>
-                    <div class="metric-row">
-                        <span>Frames:</span>
-                        <span class="metric-value" id="frames">0</span>
-                    </div>
-                    <div class="metric-row">
-                        <span>Chunks WebM:</span>
-                        <span class="metric-value" id="chunks">0</span>
-                    </div>
-                    <div class="metric-row">
-                        <span>Bytes recebidos:</span>
-                        <span class="metric-value" id="bytes">0 KB</span>
-                    </div>
-                    <div class="metric-row">
-                        <span>Duracao:</span>
-                        <span class="metric-value" id="duration">--</span>
-                    </div>
-                </div>
-                <div class="log" id="log"></div>
-            </div>
         </div>
-    </div>
-    <script>
-        // Configuracao
-        const WS_URL = "ws://" + window.location.host + "/ws";
-        // Estado
-        let ws = null;
-        let isGenerating = false;
-        let startTime = null;
-        let totalFrames = 0;
-        let totalChunks = 0;
-        let totalBytes = 0;
-        // Elementos
-        const video = document.getElementById("video-player");
-        const overlay = document.getElementById("video-overlay");
-        // Buffer de WebM para reproducao sequencial
-        let webmQueue = [];
-        let isPlaying = false;
-        function log(msg, type = "msg") {
-            const logDiv = document.getElementById("log");
-            const time = new Date().toLocaleTimeString();
-            logDiv.innerHTML = `<div class="log-entry"><span class="log-time">${time}</span> <span class="log-${type}">${msg}</span></div>` + logDiv.innerHTML;
-            while (logDiv.children.length > 50) {
-                logDiv.removeChild(logDiv.lastChild);
-            }
-        }
-        function updateStatus(element, status, text) {
-            const el = document.getElementById(element);
-            el.textContent = text;
-            el.className = "status-item " + status;
-        }
-        async function checkHealth() {
-            try {
-                const resp = await fetch("/health");
-                const status = await resp.json();
-                updateStatus("wav2lip-status",
-                    status.wav2lip ? "online" : "offline",
-                    "Wav2Lip: " + (status.wav2lip ? "Online" : "Offline")
-                );
-                updateStatus("tts-status",
-                    status.tts ? "online" : "offline",
-                    "TTS: " + (status.tts ? "Online" : "Offline")
-                );
-            } catch (e) {
-                updateStatus("wav2lip-status", "offline", "Wav2Lip: Erro");
-                updateStatus("tts-status", "offline", "TTS: Erro");
-            }
-        }
-        function connectWebSocket() {
-            if (ws && ws.readyState === WebSocket.OPEN) return;
-            updateStatus("ws-status", "connecting", "WebSocket: Conectando...");
-            log("Conectando ao WebSocket...", "status");
-            ws = new WebSocket(WS_URL);
-            ws.onopen = () => {
-                updateStatus("ws-status", "online", "WebSocket: Conectado");
-                log("WebSocket conectado", "msg");
-                checkHealth();
-            };
-            ws.onmessage = (event) => {
-                try {
-                    const data = JSON.parse(event.data);
-                    handleMessage(data);
-                } catch (e) {
-                    log("Erro ao processar mensagem: " + e, "error");
                 }
-            };
-            ws.onclose = () => {
-                updateStatus("ws-status", "offline", "WebSocket: Desconectado");
-                log("WebSocket desconectado", "error");
-                setTimeout(connectWebSocket, 3000);
-            };
-            ws.onerror = (e) => {
-                log("Erro no WebSocket", "error");
-            };
-        }
-        function handleMessage(data) {
-            const type = data.type;
-            switch (type) {
-                case "status":
-                    log(data.message, "status");
-                    break;
-                case "webm_chunk":
-                    handleWebMChunk(data);
-                    break;
-                case "done":
-                    handleDone(data);
-                    break;
-                case "error":
-                    log("Erro: " + data.message, "error");
-                    stopGeneration();
-                    break;
-                case "pong":
-                    break;
-                default:
-                    console.log("Mensagem desconhecida:", data);
-            }
-        }
-        function handleWebMChunk(data) {
-            totalChunks++;
-            document.getElementById("chunks").textContent = totalChunks;
-            // Decodificar WebM
-            const webmData = base64ToArrayBuffer(data.data);
-            totalBytes += webmData.byteLength;
-            document.getElementById("bytes").textContent = (totalBytes / 1024).toFixed(1) + " KB";
-            // Atualizar latencia (primeiro chunk)
-            if (startTime && totalChunks === 1) {
-                const latency = Date.now() - startTime;
-                document.getElementById("latency").textContent = latency + "ms";
-                log(`Primeiro chunk em ${latency}ms`, "status");
             }
-            // Esconder overlay
-            overlay.classList.add("hidden");
-            // Adicionar a fila e reproduzir
-            webmQueue.push(webmData);
-            if (!isPlaying) {
-                playNextWebM();
             }
-        }
-        function playNextWebM() {
-            if (webmQueue.length === 0) {
-                isPlaying = false;
-                return;
             }
-            isPlaying = true;
-            const webmData = webmQueue.shift();
-            // Criar blob URL e reproduzir
-            const blob = new Blob([webmData], { type: "video/webm" });
-            const url = URL.createObjectURL(blob);
-            // Quando o video terminar, reproduzir o proximo
-            video.onended = () => {
-                video.onended = null;
-                URL.revokeObjectURL(url);
-                playNextWebM();
-            };
-            video.src = url;
-            video.play().catch(e => {
-                log("Erro ao reproduzir: " + e, "error");
-                playNextWebM();
-            });
-        }
-        function handleDone(data) {
-            totalFrames = data.total_frames;
-            document.getElementById("frames").textContent = totalFrames;
-            document.getElementById("duration").textContent = data.total_duration_ms + "ms";
-            log(`Geracao concluida: ${data.total_frames} frames, ${data.total_duration_ms}ms`, "msg");
-            isGenerating = false;
-            document.getElementById("generate-btn").disabled = false;
-            document.getElementById("stop-btn").disabled = true;
-        }
-        function base64ToArrayBuffer(base64) {
-            const binaryString = atob(base64);
-            const bytes = new Uint8Array(binaryString.length);
-            for (let i = 0; i < binaryString.length; i++) {
-                bytes[i] = binaryString.charCodeAt(i);
             }
-            return bytes.buffer;
         }
-        function generate() {
-            const text = document.getElementById("text-input").value.trim();
-            const voice = document.getElementById("voice-select").value;
-            if (!text) {
-                log("Digite um texto", "error");
-                return;
-            }
-            if (!ws || ws.readyState !== WebSocket.OPEN) {
-                log("WebSocket nao conectado", "error");
-                return;
-            }
-            // Reset estado
-            webmQueue = [];
-            totalFrames = 0;
-            totalChunks = 0;
-            totalBytes = 0;
-            isPlaying = false;
-            startTime = Date.now();
-            // Reset video
-            video.onended = null;
-            if (video.src) URL.revokeObjectURL(video.src);
-            video.removeAttribute("src");
-            // Reset UI
-            document.getElementById("frames").textContent = "0";
-            document.getElementById("chunks").textContent = "0";
-            document.getElementById("bytes").textContent = "0 KB";
-            document.getElementById("latency").textContent = "--";
-            document.getElementById("duration").textContent = "--";
-            overlay.textContent = "Gerando...";
-            overlay.classList.remove("hidden");
-            // Atualizar botoes
-            isGenerating = true;
-            document.getElementById("generate-btn").disabled = true;
-            document.getElementById("stop-btn").disabled = false;
-            log("Enviando: " + text.substring(0, 50) + "...", "status");
-            // Enviar requisicao
-            ws.send(JSON.stringify({
-                action: "generate",
                 text: text,
-                voice: voice
-            }));
-        }
-        function stop() {
-            if (ws && ws.readyState === WebSocket.OPEN) {
-                ws.send(JSON.stringify({ action: "stop" }));
-            }
-            stopGeneration();
         }
-        function stopGeneration() {
-            isGenerating = false;
-            isPlaying = false;
-            webmQueue = [];
-            video.pause();
-            video.onended = null;
-            if (video.src) URL.revokeObjectURL(video.src);
-            document.getElementById("generate-btn").disabled = false;
-            document.getElementById("stop-btn").disabled = true;
         }
-        // Heartbeat
-        setInterval(() => {
-            if (ws && ws.readyState === WebSocket.OPEN) {
-                ws.send(JSON.stringify({ action: "ping" }));
-            }
-        }, 30000);
-        // Health check
-        setInterval(checkHealth, 10000);
-        // Inicializar
-        connectWebSocket();
-    </script>
 </body>
 </html>

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Avatar - WebRTC</title>
     <style>
         * { margin: 0; padding: 0; box-sizing: border-box; }
         body {
+            font-family: system-ui, sans-serif;
             background: linear-gradient(135deg, #0a0a1a 0%, #1a1a3a 100%);
             color: #fff;
             min-height: 100vh;
             padding: 20px;
         }
+        .container { max-width: 900px; margin: 0 auto; }
+        .status {
+            text-align: center;
+            padding: 10px;
+            margin-bottom: 15px;
+            border-radius: 8px;
             font-size: 14px;
+            background: rgba(255,255,255,0.1);
         }
+        .status.connected { background: rgba(0,255,100,0.2); color: #0f0; }
+        .status.busy { background: rgba(255,200,0,0.2); color: #fc0; }
+        .status.error { background: rgba(255,0,0,0.2); color: #f55; }
+        .video-box {
+            background: #000;
+            border-radius: 10px;
+            overflow: hidden;
+            margin-bottom: 20px;
+            aspect-ratio: 16/9;
+            display: flex;
+            align-items: center;
+            justify-content: center;
             position: relative;
         }
+        video {
+            max-width: 100%;
+            max-height: 100%;
             object-fit: contain;
         }
+        .placeholder {
+            color: #666;
+            font-size: 14px;
+            position: absolute;
         }
+        .controls {
+            display: flex;
+            gap: 10px;
+            margin-bottom: 15px;
+            flex-wrap: wrap;
         }
+        .call-controls {
+            display: flex;
+            gap: 10px;
+            margin-bottom: 20px;
         }
+        textarea {
+            flex: 1;
+            min-width: 200px;
+            padding: 12px;
+            border: 1px solid #333;
+            border-radius: 8px;
+            background: #1a1a2e;
+            color: #fff;
+            font-size: 14px;
+            resize: none;
+            height: 60px;
+        }
+        select {
+            padding: 12px;
+            border: 1px solid #333;
+            border-radius: 8px;
+            background: #1a1a2e;
+            color: #fff;
+            font-size: 14px;
         }
+        button {
+            padding: 12px 24px;
+            border: none;
+            border-radius: 8px;
+            font-size: 14px;
+            font-weight: bold;
+            cursor: pointer;
+            transition: opacity 0.2s;
         }
         button:hover { opacity: 0.9; }
         button:disabled { opacity: 0.5; cursor: not-allowed; }
+        .btn-call { background: #00aaff; color: #fff; }
+        .btn-call.active { background: #ff4444; }
+        .btn-generate { background: #00ff88; color: #000; flex: 0 0 auto; }
         .metrics {
+            display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
+            gap: 8px;
+            padding: 12px;
+            background: #1a1a2e;
+            border-radius: 8px;
+            font-size: 12px;
         }
+        .metric { display: flex; justify-content: space-between; }
+        .val { color: #00ff88; font-family: monospace; }
     </style>
 </head>
 <body>
     <div class="container">
+        <div class="status" id="status">Desconectado</div>
+        <div class="video-box">
+            <video id="video" autoplay playsinline></video>
+            <div class="placeholder" id="placeholder">Clique em "Conectar" para iniciar</div>
         </div>
+        <div class="call-controls">
+            <button class="btn-call" id="btnConnect">Conectar</button>
         </div>
+        <div class="controls">
+            <textarea id="text" placeholder="Digite o texto para o avatar falar...">Hello! I am testing the WebRTC streaming avatar with VP9 codec.</textarea>
+            <select id="voice">
+                <option value="tara">Tara</option>
+                <option value="leah">Leah</option>
+                <option value="jess">Jess</option>
+                <option value="leo">Leo</option>
+                <option value="dan">Dan</option>
+            </select>
+            <button class="btn-generate" id="btnGenerate" disabled>Gerar</button>
+        </div>
+        <div class="metrics">
+            <div class="metric"><span>WebRTC:</span><span class="val" id="mWebrtc">--</span></div>
+            <div class="metric"><span>Video:</span><span class="val" id="mVideo">--</span></div>
+            <div class="metric"><span>Audio:</span><span class="val" id="mAudio">--</span></div>
+            <div class="metric"><span>Latencia:</span><span class="val" id="mLatency">--</span></div>
+        </div>
+    </div>
+<script>
+// Cached references to the page elements the script reads and updates.
+const video = document.getElementById('video');
+const status = document.getElementById('status');
+const placeholder = document.getElementById('placeholder');
+const btnConnect = document.getElementById('btnConnect');
+const btnGenerate = document.getElementById('btnGenerate');
+// Global variables (for debugging)
+window.pc = null;
+let pc = null;
+let sessionId = null;
+let isConnected = false;
+let isConnecting = false;  // Flag to prevent multiple simultaneous calls
+// Show `txt` in the status banner and set its class to 'status ' followed by
+// `cls` (nothing is appended when `cls` is falsy).
+function setStatus(txt, cls) {
+    const modifier = cls ? cls : '';
+    status.className = `status ${modifier}`;
+    status.textContent = txt;
+}
+// Write `val` as the text of the element whose id is `id`.
+function setMetric(id, val) {
+    const cell = document.getElementById(id);
+    cell.textContent = val;
+}
+// Toggle the WebRTC session. When already connected this disconnects.
+// Otherwise it creates a receive-only peer connection (video + audio), POSTs
+// the SDP offer to /offer, stores the returned session_id and applies the
+// answer. Calls made while a connection attempt is in progress are ignored.
+async function connect() {
+    if (isConnected) {
+        disconnect();
+        return;
+    }
+    // Prevent multiple simultaneous calls
+    if (isConnecting) {
+        console.log('Conexão já em andamento, ignorando...');
+        return;
+    }
+    isConnecting = true;
+    setStatus('Conectando...', 'busy');
+    try {
+        // Create the RTCPeerConnection with STUN plus several TURN servers
+        pc = window.pc = new RTCPeerConnection({
+            iceServers: [
+                { urls: 'stun:stun.l.google.com:19302' },
+                // Public TURN servers - multiple options
+                {
+                    urls: [
+                        'turn:openrelay.metered.ca:80',
+                        'turn:openrelay.metered.ca:443',
+                        'turn:openrelay.metered.ca:443?transport=tcp'
+                    ],
+                    username: 'openrelayproject',
+                    credential: 'openrelayproject'
+                },
+                // Alternative TURN server (Twilio)
+                // SECURITY(review): these credentials are hard-coded in a page
+                // served to every visitor. Presumably they were issued by
+                // Twilio's token API and may expire - confirm, and consider
+                // having the server hand out fresh ICE servers instead.
+                {
+                    urls: 'turn:global.turn.twilio.com:3478?transport=udp',
+                    username: 'f4b4035eaa76f4a55de5f4351567653ee4ff6fa97b50b6b334fcc1be9c27212d',
+                    credential: 'w1uxM55V9yVoqyVFjt+mxDBV0F87AUCemaYVQGxsPLw='
                 }
+            ],
+            iceCandidatePoolSize: 10,
+            bundlePolicy: 'max-bundle',
+            rtcpMuxPolicy: 'require'
+        });
+        // Handler for received tracks
+        pc.ontrack = (event) => {
+            console.log('Track recebido:', event.track.kind);
+            if (event.track.kind === 'video') {
+                video.srcObject = event.streams[0];
+                placeholder.style.display = 'none';
+                setMetric('mVideo', 'Ativo');
             }
+            if (event.track.kind === 'audio') {
+                setMetric('mAudio', 'Ativo');
             }
+        };
+        // Handler for connection state changes
+        pc.onconnectionstatechange = () => {
+            console.log('Estado WebRTC:', pc.connectionState);
+            setMetric('mWebrtc', pc.connectionState);
+            if (pc.connectionState === 'connected') {
+                isConnected = true;
+                isConnecting = false;  // Reset flag once connected
+                updateConnectButton();
+                setStatus('Conectado - Streaming ativo', 'connected');
+                btnGenerate.disabled = false;
+                startStatsMonitor();
+            } else if (pc.connectionState === 'failed' || pc.connectionState === 'disconnected') {
+                isConnecting = false;  // Reset flag on failure
+                disconnect();
             }
+        };
+        pc.oniceconnectionstatechange = () => {
+            console.log('ICE State:', pc.iceConnectionState);
+        };
+        // Debug: Log ICE candidates
+        pc.onicecandidate = (event) => {
+            if (event.candidate) {
+                console.log('ICE Candidate:', {
+                    type: event.candidate.type,
+                    protocol: event.candidate.protocol,
+                    address: event.candidate.address,
+                    port: event.candidate.port
+                });
             }
+        };
+        // Create transceivers to receive video and audio
+        pc.addTransceiver('video', { direction: 'recvonly' });
+        pc.addTransceiver('audio', { direction: 'recvonly' });
+        // Create the offer
+        const offer = await pc.createOffer();
+        await pc.setLocalDescription(offer);
+        // Wait briefly for some ICE candidates (without waiting for gathering
+        // to complete) so they are included in the local description.
+        // NOTE(review): onicecandidate above only logs; nothing in this
+        // function sends later candidates to the server - confirm the server
+        // does not rely on trickled candidates.
+        await new Promise(resolve => setTimeout(resolve, 500));
+        console.log('Enviando offer para servidor...');
+        // Send the offer to the server
+        const response = await fetch('/offer', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                sdp: pc.localDescription.sdp,
+                type: pc.localDescription.type
+            })
+        });
+        if (!response.ok) {
+            throw new Error('Erro ao conectar: ' + response.status);
         }
+        const answer = await response.json();
+        sessionId = answer.session_id;
+        // Apply the answer
+        await pc.setRemoteDescription(new RTCSessionDescription({
+            sdp: answer.sdp,
+            type: answer.type
+        }));
+        console.log('Session ID:', sessionId);
+    } catch (err) {
+        console.error('Erro ao conectar:', err);
+        isConnecting = false;  // Reset flag on error
+        // Tear down first: disconnect() resets the banner to 'Desconectado',
+        // so the error must be written afterwards or it is overwritten.
+        disconnect();
+        setStatus('Erro: ' + err.message, 'error');
+    }
+}
+// Close the peer connection (if any) and return the page to its idle state:
+// session cleared, video detached, placeholder shown, Generate disabled and
+// all metrics reset to '--'.
+function disconnect() {
+    const conn = pc;
+    if (conn) {
+        conn.close();
+        pc = null;
+    }
+    sessionId = null;
+    isConnected = false;
+    video.srcObject = null;
+    placeholder.style.display = 'block';
+    btnGenerate.disabled = true;
+    updateConnectButton();
+    setStatus('Desconectado');
+    for (const metricId of ['mWebrtc', 'mVideo', 'mAudio', 'mLatency']) {
+        setMetric(metricId, '--');
+    }
+}
+// Make the connect button's label and 'active' class reflect `isConnected`.
+function updateConnectButton() {
+    btnConnect.textContent = isConnected ? 'Desconectar' : 'Conectar';
+    btnConnect.classList.toggle('active', isConnected);
+}
+// POST the textarea text and the selected voice to /generate for the current
+// session and reflect progress in the status banner. Does nothing when the
+// text is empty or there is no session.
+async function generate() {
+    const text = document.getElementById('text').value.trim();
+    if (!text || !sessionId) return;
+    setStatus('Gerando fala...', 'busy');
+    btnGenerate.disabled = true;
+    try {
+        const response = await fetch('/generate', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                session_id: sessionId,
                 text: text,
+                voice: document.getElementById('voice').value
+            })
+        });
+        if (!response.ok) {
+            // The error body may not be JSON (e.g. a proxy error page); fall
+            // back to the generic message instead of surfacing a parse error.
+            const err = await response.json().catch(() => null);
+            throw new Error((err && err.error) || 'Erro ao gerar');
         }
+        setStatus('Reproduzindo...', 'connected');
+    } catch (err) {
+        console.error('Erro ao gerar:', err);
+        setStatus('Erro: ' + err.message, 'error');
+    } finally {
+        // disconnect() disables the button; only re-enable it while the
+        // session is still up, so a drop during the request is not undone.
+        btnGenerate.disabled = !isConnected;
+    }
+}
+// Poll the peer connection's stats once per second and show the inbound
+// video resolution/fps and the candidate-pair round-trip time in the metrics
+// panel. The timer id is kept on the function object so only one poller runs.
+function startStatsMonitor() {
+    // Called on every transition to 'connected'; stop any previous poller so
+    // reconnects do not accumulate timers.
+    if (startStatsMonitor.timerId) {
+        clearInterval(startStatsMonitor.timerId);
+    }
+    startStatsMonitor.timerId = setInterval(async () => {
+        if (!pc || !isConnected) {
+            // Session ended: stop polling until the next connection.
+            clearInterval(startStatsMonitor.timerId);
+            startStatsMonitor.timerId = null;
+            return;
+        }
+        try {
+            const stats = await pc.getStats();
+            stats.forEach(report => {
+                if (report.type === 'inbound-rtp' && report.kind === 'video') {
+                    const fps = report.framesPerSecond || 0;
+                    const width = report.frameWidth || 0;
+                    const height = report.frameHeight || 0;
+                    if (width && height) {
+                        setMetric('mVideo', `${width}x${height} @${fps.toFixed(0)}fps`);
+                    }
+                }
+                if (report.type === 'candidate-pair' && report.state === 'succeeded') {
+                    const rtt = report.currentRoundTripTime;
+                    if (rtt) {
+                        // currentRoundTripTime is multiplied by 1000 to show ms
+                        setMetric('mLatency', `${(rtt * 1000).toFixed(0)}ms`);
+                    }
+                }
+            });
+        } catch (e) {
+            // Ignore stats errors
         }
+    }, 1000);
+}
+// Wire the buttons to their handlers.
+btnConnect.onclick = connect;
+btnGenerate.onclick = generate;
+// Enter (without Shift) in the textarea triggers generation when the
+// Generate button is enabled.
+document.getElementById('text').onkeydown = (e) => {
+    if (e.key !== 'Enter' || e.shiftKey) return;
+    e.preventDefault();
+    if (btnGenerate.disabled) return;
+    generate();
+};
+</script>
 </body>
 </html>

interface/index_optimized.html CHANGED Viewed

@@ -138,7 +138,7 @@
             <div class="video-section">
                 <h3 style="margin-bottom:10px">Avatar Stream (Binary Mode)</h3>
                 <div class="canvas-container">
-                    <video id="idle-video" src="/idle.mp4" loop muted playsinline autoplay></video>
                     <canvas id="avatar-canvas"></canvas>
                 </div>
             </div>
@@ -162,13 +162,47 @@
                     </select>
                 </div>
-                <button id="generate-btn" class="btn-primary" onclick="generate()">
                     Gerar
                 </button>
                 <button id="stop-btn" class="btn-danger" onclick="stop()" disabled>
                     Parar
                 </button>
                 <div class="metrics">
                     <h4>Metricas (Binary)</h4>
                     <div class="metric-row">
@@ -243,17 +277,32 @@
         let syncedFrameInterval = FRAME_INTERVAL;
         let playbackStarted = false;
-        // Streaming de audio real
         let nextAudioTime = 0;          // Próximo tempo para agendar audio
         let audioScheduledChunks = 0;   // Quantos chunks já agendamos
         let firstChunkTime = null;      // Tempo do primeiro chunk (para latência)
         let totalAudioSamples = 0;      // Total de samples de audio recebidos
-        let currentAudioSource = null;  // Referência ao audio source atual para detectar quando termina
-        let audioPlaybackStartTime = 0; // Tempo em que o audio começou a tocar
-        let audioExpectedEndTime = 0;   // Tempo esperado para o audio terminar
-        // Sincronização de transição
         let endVideoTimeMs = null;      // Tempo para continuar o vídeo idle após fala
         // Renderização unificada no canvas
         let renderSource = 'idle';      // 'idle' = video, 'speaking' = frames do servidor
@@ -280,63 +329,85 @@
         let speakingFrameIndex = 0;
         let lastSpeakingRenderTime = 0;
-        // Crossfade transition variables
-        let isTransitioning = false;
-        let transitionStartTime = 0;
-        let lastSpeakingFrame = null;  // Guarda o último frame do speaking para crossfade
-        const TRANSITION_DURATION_MS = 300;  // Duração do crossfade em ms
         function startUnifiedRenderLoop() {
             if (unifiedRenderLoop) return; // Já está rodando
             function renderFrame() {
                 if (renderSource === 'idle') {
-                    // Se estamos em transição, fazer crossfade
-                    if (isTransitioning && lastSpeakingFrame) {
-                        const now = performance.now();
-                        const elapsed = now - transitionStartTime;
-                        const progress = Math.min(elapsed / TRANSITION_DURATION_MS, 1);
-                        // Desenha o último frame do speaking
-                        ctx.globalAlpha = 1 - progress;
-                        ctx.drawImage(lastSpeakingFrame, 0, 0, canvas.width, canvas.height);
-                        // Desenha o vídeo idle por cima com alpha crescente
-                        ctx.globalAlpha = progress;
-                        if (idleVideo.readyState >= 2) {
-                            ctx.drawImage(idleVideo, 0, 0, canvas.width, canvas.height);
-                        }
-                        // Restaurar alpha
-                        ctx.globalAlpha = 1;
-                        // Fim da transição
-                        if (progress >= 1) {
-                            isTransitioning = false;
-                            lastSpeakingFrame = null;
-                            console.log("Crossfade concluído");
-                        }
                     } else {
-                        // Desenha o frame atual do vídeo idle no canvas
-                        if (idleVideo.readyState >= 2) { // HAVE_CURRENT_DATA
-                            ctx.drawImage(idleVideo, 0, 0, canvas.width, canvas.height);
-                        }
                     }
                 } else if (renderSource === 'speaking') {
                     // Desenha frames do servidor com timing controlado
                     const now = performance.now();
                     const elapsed = now - lastSpeakingRenderTime;
-                    // SINCRONIZAÇÃO: Verificar se o áudio já terminou
-                    // Se o tempo esperado de fim do áudio já passou, parar de renderizar frames
-                    if (audioExpectedEndTime > 0 && now >= audioExpectedEndTime) {
-                        console.log(`Audio terminou (${now.toFixed(0)} >= ${audioExpectedEndTime.toFixed(0)}), finalizando video`);
-                        // Salvar último frame para crossfade
-                        saveLastFrameForTransition();
                         finishPlayback();
-                        return;
                     }
                     if (elapsed >= syncedFrameInterval && frameQueue.length > 0) {
                         const frameData = frameQueue.shift();
@@ -367,12 +438,6 @@
                         const bw = totalElapsed > 0 ? (totalBytes / 1024 / totalElapsed) : 0;
                         updateStatus("bandwidth-status", "", `BW: ${bw.toFixed(0)} KB/s`);
-                        // Verificar se terminou
-                        if (streamDone && frameQueue.length === 0) {
-                            // Salvar último frame para crossfade
-                            saveLastFrameForTransition();
-                            finishPlayback();
-                        }
                     }
                 }
@@ -391,31 +456,68 @@
             }
         }
-        // Salvar o último frame do speaking para crossfade
-        function saveLastFrameForTransition() {
-            // Criar uma cópia do canvas atual como imagem
-            const tempCanvas = document.createElement('canvas');
-            tempCanvas.width = canvas.width;
-            tempCanvas.height = canvas.height;
-            const tempCtx = tempCanvas.getContext('2d');
-            tempCtx.drawImage(canvas, 0, 0);
-            // Criar imagem a partir do canvas
-            const img = new Image();
-            img.src = tempCanvas.toDataURL('image/jpeg', 0.95);
-            lastSpeakingFrame = img;
-            console.log("Último frame salvo para crossfade");
-        }
         // Iniciar o render loop quando a página carrega
         idleVideo.addEventListener('loadeddata', () => {
             // Ajustar tamanho do canvas para match do vídeo
             canvas.width = idleVideo.videoWidth || 512;
             canvas.height = idleVideo.videoHeight || 512;
-            startUnifiedRenderLoop();
         });
         function log(msg, type = "msg") {
             const logDiv = document.getElementById("log");
             const time = new Date().toLocaleTimeString();
@@ -559,16 +661,26 @@
                     break;
                 case "first_frame":
-                    // Apenas log - latência real é calculada no início do playback
                     log(`Primeiro frame (server): ${data.latency_ms}ms`, "status");
                     break;
                 case "done":
-                    // Salvar o tempo final do vídeo para transição suave
                     if (data.end_video_time_ms !== undefined) {
                         endVideoTimeMs = data.end_video_time_ms;
-                        console.log(`Transição: vídeo idle continuará em ${endVideoTimeMs}ms`);
                     }
                     const framesCount = data.total_frames || data.frames || allFrames.length;
                     const bytesInfo = data.bytes_sent ? `${(data.bytes_sent/1024).toFixed(1)}KB` : '';
@@ -598,8 +710,8 @@
             // Acumular frames
             allFrames.push({ url, index: frameIndex });
-            // Se já está reproduzindo, adicionar ao buffer também
-            if (playbackStarted) {
                 frameQueue.push({ url, index: frameIndex });
                 document.getElementById("buffer").textContent = frameQueue.length;
             }
@@ -612,13 +724,11 @@
             document.getElementById("video-duration").textContent = videoDuration.toFixed(2) + "s";
             updateSyncDiff();
-            if (!playbackStarted) {
                 document.getElementById("buffer").textContent = received + " (buffering)";
             }
             if (received === 1) {
-                // Primeiro frame - apenas log, NÃO mudar para speaking ainda
-                // (vai mudar quando playback sincronizado começar)
                 console.log("Primeiro frame recebido - buffering...");
                 updateStatus("stream-status", "streaming", "Stream: Buffering...");
             }
@@ -656,8 +766,8 @@
                 initAudioContext();
             }
-            // Acumular chunk (NÃO tocar ainda - esperar sync)
             if (chunkData.length > 0) {
                 audioChunks.push(chunkData);
                 // Acumular samples para calcular duração
@@ -671,7 +781,12 @@
                 document.getElementById("audio-duration").textContent = totalAudioDuration.toFixed(2) + "s";
                 updateSyncDiff();
-                console.log(`Audio chunk ${chunkIndex}: ${chunkData.length} bytes, total=${audioChunks.length}, duration=${totalAudioDuration.toFixed(2)}s`);
             }
             if (isLast) {
@@ -680,10 +795,57 @@
                 console.log(`Todos chunks recebidos: ${audioChunks.length}`);
             }
-            // Tentar iniciar frames (vídeo) quando tiver audio suficiente
             tryStartStreamingPlayback();
         }
         function initAudioContext() {
             if (!audioContext) {
                 audioContext = new (window.AudioContext || window.webkitAudioContext)({
@@ -803,60 +965,128 @@
         }
         function tryStartStreamingPlayback() {
-            // Condições para iniciar renderização de frames:
-            // 1. Já iniciou? Sair
             if (playbackStarted) return;
-            // 2. Esperar stream completo (todos frames e áudio)
-            //    Isso permite sincronização perfeita entre áudio e vídeo
-            if (!streamDone) {
-                console.log(`Aguardando stream completo...`);
-                return;
-            }
-            // 3. Temos frames?
-            if (allFrames.length === 0) {
-                console.log(`Nenhum frame recebido`);
-                return;
             }
-            // 4. Temos áudio?
-            if (totalAudioSamples === 0) {
-                console.log(`Nenhum áudio recebido`);
                 return;
             }
-            // Pronto para iniciar renderização de frames!
-            playbackStarted = true;
-            // Calcular latência
             const playbackLatency = Date.now() - startTime;
             document.getElementById("latency").textContent = playbackLatency + "ms";
             log(`Latencia: ${playbackLatency}ms`, "status");
-            // *** SINCRONIZAÇÃO: Calcular FPS baseado na duração real do áudio ***
-            const audioDurationSec = totalAudioSamples / 24000;
-            const totalFrames = allFrames.length;
-            const calculatedFps = totalFrames / audioDurationSec;
-            syncedFrameInterval = 1000 / calculatedFps;
-            console.log(`=== INICIANDO PLAYBACK SINCRONIZADO ===`);
-            console.log(`Latencia: ${playbackLatency}ms`);
-            console.log(`Frames: ${totalFrames}`);
-            console.log(`Audio: ${audioDurationSec.toFixed(2)}s (${totalAudioSamples} samples)`);
-            console.log(`FPS calculado: ${calculatedFps.toFixed(2)} (interval: ${syncedFrameInterval.toFixed(1)}ms)`);
-            // Iniciar renderização de frames via render loop unificado
             frameQueue = [...allFrames];
             document.getElementById("buffer").textContent = frameQueue.length;
             renderSource = 'speaking';
             lastSpeakingRenderTime = performance.now();
-            // Reiniciar o áudio do início para sincronizar com os frames
-            // (os chunks já foram agendados, precisamos recomeçar)
-            restartAudioPlayback();
-            updateStatus("stream-status", "streaming", "Stream: Reproduzindo");
         }
         function restartAudioPlayback() {
@@ -1006,10 +1236,14 @@
             currentAudioSource = null;
             audioPlaybackStartTime = 0;
             audioExpectedEndTime = 0;
-            // Reset crossfade state
-            isTransitioning = false;
-            lastSpeakingFrame = null;
             document.getElementById("frames").textContent = "0";
             document.getElementById("rendered").textContent = "0";
@@ -1026,6 +1260,15 @@
             document.getElementById("stop-btn").disabled = false;
             updateStatus("stream-status", "", "Stream: Iniciando...");
             log("Enviando: " + text.substring(0, 30) + "...", "status");
             // Enviar com o timestamp do vídeo idle para sincronização
@@ -1033,7 +1276,8 @@
                 action: "generate",
                 text: text,
                 voice: voice,
-                idle_video_time_ms: idleVideoTimeMs  // Timestamp exato do vídeo idle
             }));
         }
@@ -1062,13 +1306,27 @@
             log(`Finalizado: ${renderedFrames} frames em ${elapsed.toFixed(1)}s`, "msg");
             console.log(`Playback finalizado: ${renderedFrames} frames renderizados`);
-            // Transição IMEDIATA para idle (não precisa esperar)
-            // Se servidor enviou end_video_time_ms, continuar o vídeo desse ponto
-            if (endVideoTimeMs !== null) {
-                const videoTimeSeconds = endVideoTimeMs / 1000;
                 const videoDuration = idleVideo.duration || 60;
-                const seekTime = videoTimeSeconds % videoDuration;
-                console.log(`Transição suave: vídeo idle em ${seekTime.toFixed(2)}s (end_video_time_ms=${endVideoTimeMs})`);
                 idleVideo.currentTime = seekTime;
             }
@@ -1076,16 +1334,9 @@
             idleVideo.loop = true;
             idleVideo.play().catch(e => console.log("Erro ao retomar idle video:", e));
-            // Iniciar crossfade se temos o último frame salvo
-            if (lastSpeakingFrame) {
-                isTransitioning = true;
-                transitionStartTime = performance.now();
-                console.log("Iniciando crossfade de 300ms");
-            }
-            // Transição: speaking → idle (só muda a fonte, canvas continua renderizando)
             renderSource = 'idle';
-            console.log("Transição para idle - renderSource mudou, idle video playing");
             // Reset estado de audio
             audioExpectedEndTime = 0;
@@ -1093,8 +1344,14 @@
             // Reset estado geral
             isStreaming = false;
-            endVideoTimeMs = null;
             playbackStarted = false;
             document.getElementById("generate-btn").disabled = false;
             document.getElementById("stop-btn").disabled = true;
             updateStatus("stream-status", "online", "Stream: Concluido");
@@ -1123,12 +1380,14 @@
             audioDuration = 0;
             audioBuffer = null;
             playbackStarted = false;
             nextAudioTime = 0;
             audioScheduledChunks = 0;
             firstChunkTime = null;
             totalAudioSamples = 0;
             audioPlaybackStartTime = 0;
             audioExpectedEndTime = 0;
             if (currentAudioSource) {
                 try { currentAudioSource.stop(); } catch (e) {}
@@ -1140,6 +1399,12 @@
                 audioSource = null;
             }
             // Voltar ao video idle (apenas muda a fonte do render loop)
             renderSource = 'idle';
             idleVideo.play().catch(e => {});
@@ -1157,6 +1422,139 @@
             }
         }, 30000);
         connectWebSocket();
     </script>
 </body>

             <div class="video-section">
                 <h3 style="margin-bottom:10px">Avatar Stream (Binary Mode)</h3>
                 <div class="canvas-container">
+                    <video id="idle-video" loop muted playsinline autoplay preload="auto" src="idle.mp4"></video>
                     <canvas id="avatar-canvas"></canvas>
                 </div>
             </div>
                     </select>
                 </div>
+                <div class="input-group" style="margin-top: 15px; padding-top: 15px; border-top: 1px solid rgba(255,255,255,0.1);">
+                    <label>Qualidade Video Fala: <span id="quality-value">95</span>%</label>
+                    <input type="range" id="quality-slider" min="50" max="100" value="95"
+                           style="width: 100%; margin-top: 5px;"
+                           oninput="document.getElementById('quality-value').textContent = this.value">
+                    <small style="color: #666; font-size: 11px;">Aumentar para igualar com vídeo idle</small>
+                </div>
+                <div class="input-group">
+                    <label>Offset Transição: <span id="offset-value">0</span>ms</label>
+                    <input type="range" id="offset-slider" min="-500" max="500" value="0"
+                           style="width: 100%; margin-top: 5px;"
+                           oninput="document.getElementById('offset-value').textContent = this.value">
+                    <small style="color: #666; font-size: 11px;">Ajuste fino do momento de voltar ao idle</small>
+                </div>
+                <button id="generate-btn" class="btn-primary" onclick="generate()" style="margin-top: 15px;">
                     Gerar
                 </button>
                 <button id="stop-btn" class="btn-danger" onclick="stop()" disabled>
                     Parar
                 </button>
+                <div class="input-group" style="margin-top: 15px; padding: 12px; background: rgba(255,200,0,0.1); border-radius: 8px; border: 1px solid rgba(255,200,0,0.3);">
+                    <label style="color: #fc0; font-weight: bold;">Modo Demo</label>
+                    <div style="display: flex; gap: 10px; margin-top: 8px; align-items: center;">
+                        <button id="demo-btn" class="btn-primary" onclick="toggleDemo()" style="flex: 1; background: linear-gradient(135deg, #fc0, #f90); margin: 0;">
+                            Iniciar Demo
+                        </button>
+                    </div>
+                    <div style="display: flex; gap: 10px; margin-top: 10px;">
+                        <div style="flex: 1;">
+                            <label style="font-size: 11px;">Idle: <span id="demo-idle-value">5</span>s</label>
+                            <input type="range" id="demo-idle-slider" min="1" max="30" value="5"
+                                   style="width: 100%;"
+                                   oninput="document.getElementById('demo-idle-value').textContent = this.value">
+                        </div>
+                    </div>
+                    <small style="color: #666; font-size: 11px;">Alterna entre falar e idle automaticamente</small>
+                </div>
                 <div class="metrics">
                     <h4>Metricas (Binary)</h4>
                     <div class="metric-row">
         let syncedFrameInterval = FRAME_INTERVAL;
         let playbackStarted = false;
+        // === PROGRESSIVE PLAYBACK ===
+        // Minimum buffering thresholds that must be met before playback starts
+        const MIN_FRAMES_TO_START = 5;       // Minimum frames before starting (200 ms of video per the original note; implies 25 fps - TODO confirm)
+        const MIN_AUDIO_CHUNKS_TO_START = 2; // Minimum audio chunks before starting
+        const PROGRESSIVE_MODE = true;       // Enable progressive playback
+        // Real audio streaming - sequential scheduling
         let nextAudioTime = 0;          // Next time at which an audio chunk will be scheduled
         let audioScheduledChunks = 0;   // Number of chunks scheduled so far
         let firstChunkTime = null;      // Time of the first chunk (for latency)
         let totalAudioSamples = 0;      // Total audio samples received
+        let currentAudioSource = null;  // Reference to the current audio source
+        let audioPlaybackStartTime = 0; // When audio playback started (performance.now)
+        let audioExpectedEndTime = 0;   // When audio is expected to finish (performance.now)
+        let progressivePlaybackStarted = false;  // Flag: progressive playback has started
+        let audioContextStartTime = 0;  // audioContext.currentTime when playback started
+        // Frame-level transition synchronization
         let endVideoTimeMs = null;      // Time at which the idle video continues after speech
+        let startFrameIdx = null;       // First frame used by Wav2Lip (taken from the server's start_frame_idx)
+        let endFrameIdx = null;         // Last frame used by Wav2Lip (taken from the server's end_frame_idx)
+        let waitingForFrameSync = false; // Waiting for the right idle frame before starting
+        let frameSyncStartTime = 0;     // When the wait for the frame began (performance.now)
+        const FRAME_SYNC_TIMEOUT = 500; // Maximum wait in ms for frame synchronization
+        let idleVideoDurationMs = 0;    // Total duration of the idle video in ms
+        let idleVideoTotalFrames = 0;   // Total number of frames in the idle video
         // Renderização unificada no canvas
         let renderSource = 'idle';      // 'idle' = video, 'speaking' = frames do servidor
         let speakingFrameIndex = 0;
         let lastSpeakingRenderTime = 0;
+        // Transição direta (sem crossfade para parecer mais natural)
         function startUnifiedRenderLoop() {
             if (unifiedRenderLoop) return; // Já está rodando
             function renderFrame() {
+                // === ESTADO: AGUARDANDO SINCRONIZAÇÃO DE FRAME ===
+                if (waitingForFrameSync && startFrameIdx !== null) {
+                    // Continuar mostrando idle video enquanto aguarda
+                    if (idleVideo.readyState >= 2) {
+                        ctx.drawImage(idleVideo, 0, 0, canvas.width, canvas.height);
+                    }
+                    // Verificar timeout - se demorou muito, iniciar mesmo assim
+                    const waitingTime = performance.now() - frameSyncStartTime;
+                    if (waitingTime > FRAME_SYNC_TIMEOUT) {
+                        console.log(`Frame sync: TIMEOUT após ${waitingTime.toFixed(0)}ms - iniciando sem sync`);
+                        waitingForFrameSync = false;
+                        doStartSpeaking();
+                        unifiedRenderLoop = requestAnimationFrame(renderFrame);
+                        return;
+                    }
+                    // Calcular frame atual do vídeo idle
+                    const currentIdleFrame = Math.floor(idleVideo.currentTime * IDLE_VIDEO_FPS) % idleVideoTotalFrames;
+                    // Verificar se chegou no frame alvo (com tolerância de ±2 frames)
+                    const frameDiff = Math.abs(currentIdleFrame - startFrameIdx);
+                    const isCloseEnough = frameDiff <= 2 || frameDiff >= (idleVideoTotalFrames - 2);
+                    if (isCloseEnough) {
+                        console.log(`Frame sync: frame atual=${currentIdleFrame}, alvo=${startFrameIdx} - INICIANDO! (${waitingTime.toFixed(0)}ms)`);
+                        waitingForFrameSync = false;
+                        doStartSpeaking();
+                    }
+                    unifiedRenderLoop = requestAnimationFrame(renderFrame);
+                    return;
+                }
                 if (renderSource === 'idle') {
+                    // Transição direta - sem fade, apenas troca para o vídeo idle
+                    // O vídeo idle já está sincronizado no ponto correto (endVideoTimeMs)
+                    if (idleVideo.readyState >= 2) {
+                        ctx.drawImage(idleVideo, 0, 0, canvas.width, canvas.height);
                     } else {
+                        // Video não está pronto - tentar recarregar
+                        console.log(`[IDLE] Video não pronto: readyState=${idleVideo.readyState}, tentando play...`);
+                        idleVideo.play().catch(e => {});
                     }
                 } else if (renderSource === 'speaking') {
                     // Desenha frames do servidor com timing controlado
                     const now = performance.now();
                     const elapsed = now - lastSpeakingRenderTime;
+                    // === VERIFICAR SE DEVE FINALIZAR ===
+                    // Debug: mostrar estado a cada segundo
+                    if (Math.floor(now / 1000) !== Math.floor(lastSpeakingRenderTime / 1000)) {
+                        console.log(`[DEBUG] streamDone=${streamDone}, queue=${frameQueue.length}, rendered=${renderedFrames}, audioEnd=${audioExpectedEndTime.toFixed(0)}, now=${now.toFixed(0)}`);
+                    }
+                    // 1. Áudio terminou (tempo esperado passou + offset do usuário)
+                    const transitionOffset = parseInt(document.getElementById("offset-slider").value) || 0;
+                    const adjustedEndTime = audioExpectedEndTime + transitionOffset;
+                    if (audioExpectedEndTime > 0 && now >= adjustedEndTime) {
+                        console.log(`[FIM] Audio terminou: now=${now.toFixed(0)} >= end=${adjustedEndTime.toFixed(0)} (offset=${transitionOffset})`);
                         finishPlayback();
+                        // NÃO fazer return aqui - precisa continuar o render loop para o idle
+                    }
+                    // 2. Stream completo e fila de frames vazia
+                    else if (streamDone && frameQueue.length === 0) {
+                        console.log(`[FIM] Stream done + fila vazia: rendered=${renderedFrames}, total=${allFrames.length}`);
+                        finishPlayback();
+                        // NÃO fazer return aqui - precisa continuar o render loop para o idle
                     }
+                    // === RENDERIZAR PRÓXIMO FRAME ===
                     if (elapsed >= syncedFrameInterval && frameQueue.length > 0) {
                         const frameData = frameQueue.shift();
                         const bw = totalElapsed > 0 ? (totalBytes / 1024 / totalElapsed) : 0;
                         updateStatus("bandwidth-status", "", `BW: ${bw.toFixed(0)} KB/s`);
                     }
                 }
             }
         }
         // Start the render loop once the idle video has loaded its first frame data
         idleVideo.addEventListener('loadeddata', () => {
             // Size the canvas to match the video (512 when the dimension is not reported)
             canvas.width = idleVideo.videoWidth || 512;
             canvas.height = idleVideo.videoHeight || 512;
+            // Capture the duration and derive the total frame count
+            // (a 60 s duration is assumed when the browser reports none)
+            idleVideoDurationMs = (idleVideo.duration || 60) * 1000;
+            idleVideoTotalFrames = Math.round((idleVideo.duration || 60) * IDLE_VIDEO_FPS);
+            console.log(`Video idle carregado: ${canvas.width}x${canvas.height}, ${idleVideo.duration?.toFixed(1)}s, ~${idleVideoTotalFrames} frames`);
+            // Make sure the idle video is playing in a loop; it is muted before play() is requested
+            idleVideo.loop = true;
+            idleVideo.muted = true;
+            // The render loop is started whether play() resolves or rejects,
+            // so a blocked autoplay does not leave the canvas without a loop.
+            idleVideo.play().then(() => {
+                console.log("Video idle iniciado com sucesso");
+                startUnifiedRenderLoop();
+            }).catch(e => {
+                console.log("Autoplay bloqueado, iniciando render loop mesmo assim:", e);
+                startUnifiedRenderLoop();
+            });
+        });
+        // Fallback: se o vídeo não carregar em 3 segundos, iniciar mesmo assim
+        setTimeout(() => {
+            if (!unifiedRenderLoop) {
+                console.log("Fallback: iniciando render loop após timeout");
+                canvas.width = 512;
+                canvas.height = 512;
+                startUnifiedRenderLoop();
+            }
+        }, 3000);
+        // Idle video error handling: report the failure to the console and the
+        // on-page log, then ask the element to load again.
+        // NOTE(review): there is no retry limit here; if the error persists this
+        // presumably retries every 2 seconds indefinitely - confirm that is intended.
+        idleVideo.addEventListener('error', (e) => {
+            console.error("Erro carregando vídeo idle:", e);
+            log("Erro carregando vídeo idle", "error");
+            // Try loading again after 2 seconds
+            setTimeout(() => {
+                console.log("Tentando recarregar vídeo idle...");
+                idleVideo.load();
+            }, 2000);
         });
+        // Quando o vídeo idle termina (não deveria acontecer com loop=true, mas por segurança)
+        idleVideo.addEventListener('ended', () => {
+            console.log("Video idle ended - reiniciando");
+            idleVideo.currentTime = 0;
+            idleVideo.play().catch(e => console.log("Erro reiniciando idle:", e));
+        });
+        // Quando o vídeo idle para por algum motivo
+        idleVideo.addEventListener('pause', () => {
+            if (renderSource === 'idle') {
+                console.log("Video idle pausou inesperadamente - retomando");
+                idleVideo.play().catch(e => console.log("Erro retomando idle:", e));
+            }
+        });
+        // Force the idle video to load with a cache-busting query string
+        // (the current timestamp makes the URL unique on every page load).
+        // NOTE(review): the <video id="idle-video"> markup already declares
+        // src="idle.mp4" with preload="auto", so this presumably triggers a
+        // second fetch of the same file - confirm that is intended.
+        idleVideo.src = `idle.mp4?t=${Date.now()}`;
         function log(msg, type = "msg") {
             const logDiv = document.getElementById("log");
             const time = new Date().toLocaleTimeString();
                     break;
                 case "first_frame":
+                    // Capturar start_frame_idx para sincronização no nível de frame
+                    if (data.start_frame_idx !== undefined && data.start_frame_idx !== null) {
+                        startFrameIdx = data.start_frame_idx;
+                        console.log(`Sync: start_frame_idx=${startFrameIdx}`);
+                    }
                     log(`Primeiro frame (server): ${data.latency_ms}ms`, "status");
                     break;
                 case "done":
+                    // Salvar índices de frame para sincronização
+                    if (data.start_frame_idx !== undefined && data.start_frame_idx !== null) {
+                        startFrameIdx = data.start_frame_idx;
+                    }
+                    if (data.end_frame_idx !== undefined && data.end_frame_idx !== null) {
+                        endFrameIdx = data.end_frame_idx;
+                    }
                     if (data.end_video_time_ms !== undefined) {
                         endVideoTimeMs = data.end_video_time_ms;
                     }
+                    console.log(`Sync: start_frame=${startFrameIdx}, end_frame=${endFrameIdx}, end_time_ms=${endVideoTimeMs}`);
                     const framesCount = data.total_frames || data.frames || allFrames.length;
                     const bytesInfo = data.bytes_sent ? `${(data.bytes_sent/1024).toFixed(1)}KB` : '';
             // Acumular frames
             allFrames.push({ url, index: frameIndex });
+            // Se já está reproduzindo (progressivo ou normal), adicionar ao buffer
+            if (progressivePlaybackStarted || playbackStarted) {
                 frameQueue.push({ url, index: frameIndex });
                 document.getElementById("buffer").textContent = frameQueue.length;
             }
             document.getElementById("video-duration").textContent = videoDuration.toFixed(2) + "s";
             updateSyncDiff();
+            if (!progressivePlaybackStarted && !playbackStarted) {
                 document.getElementById("buffer").textContent = received + " (buffering)";
             }
             if (received === 1) {
                 console.log("Primeiro frame recebido - buffering...");
                 updateStatus("stream-status", "streaming", "Stream: Buffering...");
             }
                 initAudioContext();
             }
             if (chunkData.length > 0) {
+                // Acumular chunk
                 audioChunks.push(chunkData);
                 // Acumular samples para calcular duração
                 document.getElementById("audio-duration").textContent = totalAudioDuration.toFixed(2) + "s";
                 updateSyncDiff();
+                // === MODO PROGRESSIVO: Agendar chunk imediatamente ===
+                if (PROGRESSIVE_MODE && progressivePlaybackStarted) {
+                    scheduleAudioChunkProgressive(chunkData);
+                }
+                console.log(`Audio chunk ${chunkIndex}: ${chunkData.length} bytes, scheduled=${audioScheduledChunks}`);
             }
             if (isLast) {
                 console.log(`Todos chunks recebidos: ${audioChunks.length}`);
             }
+            // Tentar iniciar playback
             tryStartStreamingPlayback();
         }
+        function scheduleAudioChunkProgressive(chunkData) {
+            // Agendar este chunk de áudio para tocar em sequência
+            if (!audioContext) return;
+            try {
+                // Converter PCM int16 para float32
+                const alignedBuffer = new ArrayBuffer(chunkData.length);
+                new Uint8Array(alignedBuffer).set(chunkData);
+                const samples = new Int16Array(alignedBuffer);
+                const floatSamples = new Float32Array(samples.length);
+                for (let i = 0; i < samples.length; i++) {
+                    floatSamples[i] = samples[i] / 32768;
+                }
+                // Criar buffer de audio
+                const buffer = audioContext.createBuffer(1, floatSamples.length, 24000);
+                buffer.getChannelData(0).set(floatSamples);
+                // Criar source
+                const source = audioContext.createBufferSource();
+                source.buffer = buffer;
+                source.connect(audioContext.destination);
+                // Garantir que não agendamos no passado
+                const now = audioContext.currentTime;
+                if (nextAudioTime < now) {
+                    nextAudioTime = now + 0.01;
+                }
+                // Agendar para tocar
+                source.start(nextAudioTime);
+                // Calcular duração e atualizar próximo tempo
+                const chunkDuration = floatSamples.length / 24000;
+                nextAudioTime += chunkDuration;
+                audioScheduledChunks++;
+                // Atualizar tempo esperado de fim (em performance.now)
+                const audioEndContextTime = nextAudioTime;
+                const elapsedSinceStart = audioEndContextTime - audioContextStartTime;
+                audioExpectedEndTime = audioPlaybackStartTime + (elapsedSinceStart * 1000);
+            } catch (e) {
+                console.error("Erro agendando audio chunk:", e);
+            }
+        }
         function initAudioContext() {
             if (!audioContext) {
                 audioContext = new (window.AudioContext || window.webkitAudioContext)({
         }
         function tryStartStreamingPlayback() {
+            // Já iniciou playback final? Sair
             if (playbackStarted) return;
+            const framesCount = allFrames.length;
+            const audioChunksCount = audioChunks.length;
+            // === MODO PROGRESSIVO ===
+            if (PROGRESSIVE_MODE && !progressivePlaybackStarted) {
+                // Verificar se temos buffer mínimo para começar
+                const hasMinFrames = framesCount >= MIN_FRAMES_TO_START;
+                const hasMinAudio = audioChunksCount >= MIN_AUDIO_CHUNKS_TO_START;
+                if (hasMinFrames && hasMinAudio) {
+                    console.log(`=== INICIANDO REPRODUÇÃO PROGRESSIVA ===`);
+                    console.log(`Buffer: ${framesCount} frames, ${audioChunksCount} chunks de áudio`);
+                    progressivePlaybackStarted = true;
+                    startProgressivePlayback();
+                    return;
+                } else {
+                    // Ainda buffering
+                    const status = `Buffering: ${framesCount}/${MIN_FRAMES_TO_START} frames, ${audioChunksCount}/${MIN_AUDIO_CHUNKS_TO_START} chunks`;
+                    document.getElementById("buffer").textContent = status;
+                    return;
+                }
             }
+            // === STREAM COMPLETO - Ajustar FPS final ===
+            if (streamDone && progressivePlaybackStarted && !playbackStarted) {
+                playbackStarted = true;
+                // Calcular FPS final baseado na duração real
+                const audioDurationSec = totalAudioSamples / 24000;
+                const calculatedFps = framesCount / audioDurationSec;
+                syncedFrameInterval = 1000 / calculatedFps;
+                console.log(`=== AJUSTE FINAL DE SYNC ===`);
+                console.log(`Frames: ${framesCount}, Audio: ${audioDurationSec.toFixed(2)}s`);
+                console.log(`FPS ajustado: ${calculatedFps.toFixed(2)} (interval: ${syncedFrameInterval.toFixed(1)}ms)`);
                 return;
             }
+            // === MODO TRADICIONAL (sem progressivo) ===
+            if (!PROGRESSIVE_MODE && streamDone) {
+                if (framesCount === 0 || totalAudioSamples === 0) return;
+                playbackStarted = true;
+                const playbackLatency = Date.now() - startTime;
+                document.getElementById("latency").textContent = playbackLatency + "ms";
+                log(`Latencia: ${playbackLatency}ms`, "status");
+                const audioDurationSec = totalAudioSamples / 24000;
+                const calculatedFps = framesCount / audioDurationSec;
+                syncedFrameInterval = 1000 / calculatedFps;
+                frameQueue = [...allFrames];
+                renderSource = 'speaking';
+                lastSpeakingRenderTime = performance.now();
+                restartAudioPlayback();
+                updateStatus("stream-status", "streaming", "Stream: Reproduzindo");
+            }
+        }
+        function startProgressivePlayback() {
+            // Registrar latência
             const playbackLatency = Date.now() - startTime;
             document.getElementById("latency").textContent = playbackLatency + "ms";
             log(`Latencia: ${playbackLatency}ms`, "status");
+            // Usar FPS padrão (25fps)
+            syncedFrameInterval = FRAME_INTERVAL;
+            console.log(`Iniciando progressivo: ${allFrames.length} frames, ${audioChunks.length} chunks`);
+            // Copiar frames disponíveis para a fila
             frameQueue = [...allFrames];
             document.getElementById("buffer").textContent = frameQueue.length;
+            // === SINCRONIZAÇÃO NO NÍVEL DE FRAME ===
+            // Se temos startFrameIdx, esperar o vídeo idle chegar no frame certo
+            if (startFrameIdx !== null && idleVideoTotalFrames > 0) {
+                waitingForFrameSync = true;
+                frameSyncStartTime = performance.now();
+                console.log(`Frame sync: aguardando frame ${startFrameIdx} de ${idleVideoTotalFrames}`);
+                updateStatus("stream-status", "streaming", `Sync: aguardando frame ${startFrameIdx}...`);
+                // O render loop vai detectar waitingForFrameSync e fazer a transição quando chegar o frame certo
+                return;
+            }
+            // Sem frame sync - iniciar imediatamente
+            doStartSpeaking();
+        }
+        function doStartSpeaking() {
+            // Switches the renderer to the speaking source and starts the audio.
+            // Order matters: the render timestamp is taken before audio is scheduled.
             renderSource = 'speaking';
             lastSpeakingRenderTime = performance.now();
+            // Start audio - schedules every chunk received so far
+            startProgressiveAudio();
+            updateStatus("stream-status", "streaming", "Stream: Reproduzindo...");
+        }
+        function startProgressiveAudio() {
+            if (!audioContext) {
+                initAudioContext();
+            }
+            // Registrar tempo de início
+            audioPlaybackStartTime = performance.now();
+            audioContextStartTime = audioContext.currentTime;
+            nextAudioTime = audioContext.currentTime + 0.02; // 20ms de buffer inicial
+            // Agendar todos os chunks que já temos
+            for (const chunk of audioChunks) {
+                scheduleAudioChunkProgressive(chunk);
+            }
+            console.log(`Audio progressivo: ${audioScheduledChunks} chunks agendados`);
         }
         function restartAudioPlayback() {
             currentAudioSource = null;
             audioPlaybackStartTime = 0;
             audioExpectedEndTime = 0;
+            audioContextStartTime = 0;
+            progressivePlaybackStarted = false;
+            // Reset frame sync state
+            startFrameIdx = null;
+            endFrameIdx = null;
+            endVideoTimeMs = null;
+            waitingForFrameSync = false;
             document.getElementById("frames").textContent = "0";
             document.getElementById("rendered").textContent = "0";
             document.getElementById("stop-btn").disabled = false;
             updateStatus("stream-status", "", "Stream: Iniciando...");
+            // Pegar valores dos controles
+            const quality = parseInt(document.getElementById("quality-slider").value);
+            // Calcular start_frame_idx localmente (mesmo cálculo que o Wav2Lip faz)
+            // Isso permite sincronização imediata sem esperar resposta do servidor
+            const localStartFrameIdx = Math.floor((idleVideoTimeMs / 1000) * IDLE_VIDEO_FPS) % idleVideoTotalFrames;
+            startFrameIdx = localStartFrameIdx;
+            console.log(`Frame sync local: start_frame_idx=${startFrameIdx} (de ${idleVideoTotalFrames} frames)`);
             log("Enviando: " + text.substring(0, 30) + "...", "status");
             // Enviar com o timestamp do vídeo idle para sincronização
                 action: "generate",
                 text: text,
                 voice: voice,
+                idle_video_time_ms: idleVideoTimeMs,  // Timestamp exato do vídeo idle
+                jpeg_quality: quality  // Qualidade do JPEG (50-100)
             }));
         }
             log(`Finalizado: ${renderedFrames} frames em ${elapsed.toFixed(1)}s`, "msg");
             console.log(`Playback finalizado: ${renderedFrames} frames renderizados`);
+            // Transição para idle - calcular posição exata considerando tempo de rede
+            // O servidor envia end_video_time_ms = posição do último frame usado
+            // Precisamos compensar pelo tempo que passou desde que o servidor terminou até agora
+            if (endVideoTimeMs !== null && endVideoTimeMs > 0) {
                 const videoDuration = idleVideo.duration || 60;
+                const videoDurationMs = videoDuration * 1000;
+                // Calcular quanto tempo passou desde o início (tempo de processamento + rede)
+                const totalElapsedMs = Date.now() - startTime;
+                // O áudio tem a duração real - usar isso como referência
+                const audioDurationMs = totalAudioSamples / 24000 * 1000;
+                // Tempo extra que passou além da duração do áudio (overhead de rede/processamento)
+                const networkOverheadMs = Math.max(0, totalElapsedMs - audioDurationMs);
+                // Posição compensada: end_video_time_ms + overhead de rede
+                const compensatedTimeMs = endVideoTimeMs + networkOverheadMs;
+                const seekTime = (compensatedTimeMs % videoDurationMs) / 1000;
+                console.log(`Transição: end=${endVideoTimeMs}ms, elapsed=${totalElapsedMs}ms, audio=${audioDurationMs.toFixed(0)}ms, overhead=${networkOverheadMs.toFixed(0)}ms -> seek=${seekTime.toFixed(2)}s`);
                 idleVideo.currentTime = seekTime;
             }
             idleVideo.loop = true;
             idleVideo.play().catch(e => console.log("Erro ao retomar idle video:", e));
+            // Transição direta: speaking → idle (sem fade, mais natural)
             renderSource = 'idle';
+            console.log("Transição direta para idle");
             // Reset estado de audio
             audioExpectedEndTime = 0;
             // Reset estado geral
             isStreaming = false;
             playbackStarted = false;
+            progressivePlaybackStarted = false;
+            // Reset frame sync state (manter endVideoTimeMs até aqui pois é usado acima)
+            startFrameIdx = null;
+            endFrameIdx = null;
+            endVideoTimeMs = null;
+            waitingForFrameSync = false;
             document.getElementById("generate-btn").disabled = false;
             document.getElementById("stop-btn").disabled = true;
             updateStatus("stream-status", "online", "Stream: Concluido");
             audioDuration = 0;
             audioBuffer = null;
             playbackStarted = false;
+            progressivePlaybackStarted = false;
             nextAudioTime = 0;
             audioScheduledChunks = 0;
             firstChunkTime = null;
             totalAudioSamples = 0;
             audioPlaybackStartTime = 0;
             audioExpectedEndTime = 0;
+            audioContextStartTime = 0;
             if (currentAudioSource) {
                 try { currentAudioSource.stop(); } catch (e) {}
                 audioSource = null;
             }
+            // Reset frame sync state
+            startFrameIdx = null;
+            endFrameIdx = null;
+            endVideoTimeMs = null;
+            waitingForFrameSync = false;
             // Voltar ao video idle (apenas muda a fonte do render loop)
             renderSource = 'idle';
             idleVideo.play().catch(e => {});
             }
         }, 30000);
+        // === DEMO MODE ===
+        // demoMode: true between startDemo() and stopDemo().
+        let demoMode = false;
+        // demoTimeout: id of the pending setTimeout armed by demoScheduleNextCycle(), or null.
+        let demoTimeout = null;
+        // lastDemoText: text captured by startDemo(); demoSpeak() falls back to it
+        // when the input field is empty.
+        let lastDemoText = "";
+        function toggleDemo() {
+            if (demoMode) {
+                stopDemo();
+            } else {
+                startDemo();
+            }
+        }
+        function startDemo() {
+            demoMode = true;
+            lastDemoText = document.getElementById("text-input").value.trim();
+            if (!lastDemoText) {
+                lastDemoText = "Hello! I am a real-time streaming avatar optimized for low latency.";
+                document.getElementById("text-input").value = lastDemoText;
+            }
+            document.getElementById("demo-btn").textContent = "Parar Demo";
+            document.getElementById("demo-btn").style.background = "linear-gradient(135deg, #f44, #c00)";
+            document.getElementById("generate-btn").disabled = true;
+            document.getElementById("text-input").disabled = true;
+            log("Modo demo iniciado", "status");
+            console.log("Demo: iniciando ciclo");
+            // Iniciar primeiro ciclo
+            demoSpeak();
+        }
+        function stopDemo() {
+            demoMode = false;
+            if (demoTimeout) {
+                clearTimeout(demoTimeout);
+                demoTimeout = null;
+            }
+            document.getElementById("demo-btn").textContent = "Iniciar Demo";
+            document.getElementById("demo-btn").style.background = "linear-gradient(135deg, #fc0, #f90)";
+            document.getElementById("generate-btn").disabled = false;
+            document.getElementById("text-input").disabled = false;
+            stop();
+            log("Modo demo parado", "status");
+            console.log("Demo: parado");
+        }
+        function demoSpeak() {
+            if (!demoMode) return;
+            console.log("Demo: falando...");
+            // Usar o texto configurado
+            const text = document.getElementById("text-input").value.trim() || lastDemoText;
+            const voice = document.getElementById("voice-select").value;
+            const quality = parseInt(document.getElementById("quality-slider").value);
+            const idleVideoTimeMs = getCompensatedIdleVideoTime();
+            // Reset estado
+            stopStream();
+            frameQueue = [];
+            allFrames = [];
+            renderedFrames = 0;
+            totalBytes = 0;
+            startTime = Date.now();
+            audioChunks = [];
+            audioChunksComplete = false;
+            streamDone = false;
+            audioDuration = 0;
+            audioBuffer = null;
+            syncedFrameInterval = FRAME_INTERVAL;
+            playbackStarted = false;
+            nextAudioTime = 0;
+            audioScheduledChunks = 0;
+            firstChunkTime = null;
+            totalAudioSamples = 0;
+            currentAudioSource = null;
+            audioPlaybackStartTime = 0;
+            audioExpectedEndTime = 0;
+            audioContextStartTime = 0;
+            progressivePlaybackStarted = false;
+            // Reset frame sync state
+            endFrameIdx = null;
+            endVideoTimeMs = null;
+            waitingForFrameSync = false;
+            // Calcular start_frame_idx localmente (mesmo cálculo que o Wav2Lip faz)
+            const localStartFrameIdx = Math.floor((idleVideoTimeMs / 1000) * IDLE_VIDEO_FPS) % idleVideoTotalFrames;
+            startFrameIdx = localStartFrameIdx;
+            console.log(`Demo frame sync: start_frame_idx=${startFrameIdx}`);
+            isStreaming = true;
+            updateStatus("stream-status", "streaming", "Demo: Falando...");
+            ws.send(JSON.stringify({
+                action: "generate",
+                text: text,
+                voice: voice,
+                idle_video_time_ms: idleVideoTimeMs,
+                jpeg_quality: quality
+            }));
+        }
+        function demoScheduleNextCycle() {
+            if (!demoMode) return;
+            const idleTime = parseInt(document.getElementById("demo-idle-slider").value) * 1000;
+            console.log(`Demo: aguardando ${idleTime}ms em idle...`);
+            updateStatus("stream-status", "online", `Demo: Idle (${idleTime/1000}s)`);
+            demoTimeout = setTimeout(() => {
+                if (demoMode) {
+                    demoSpeak();
+                }
+            }, idleTime);
+        }
+        // Wrap finishPlayback so that demo mode can chain the next cycle.
+        // NOTE(review): this rebinds the name finishPlayback, so it assumes a
+        // reassignable binding (function declaration, let or var); a const
+        // binding would throw here — confirm against the definition.
+        const originalFinishPlayback = finishPlayback;
+        finishPlayback = function() {
+            originalFinishPlayback();
+            // In demo mode, schedule the next cycle once playback has finished
+            if (demoMode) {
+                demoScheduleNextCycle();
+            }
+        };
         connectWebSocket();
     </script>
 </body>

interface/index_streaming.html CHANGED Viewed

@@ -275,9 +275,8 @@ async function startSyncedPlayback(base64Audio, durationMs) {
         audioSource.connect(audioContext.destination);
         audioSource.onended = () => {
             audioSource = null;
-            // Transicao imediata quando audio termina
-            stopPlayback();
         };
         // Calcular quantos frames usar baseado na duracao do audio
@@ -304,7 +303,7 @@ async function startSyncedPlayback(base64Audio, durationMs) {
             lastRenderedFrame = 0;
         }
-        // Agora mostrar o canvas (ja com o primeiro frame renderizado)
         talkCanvas.style.display = 'block';
         // INICIAR TUDO SINCRONIZADO: audio + video ao mesmo tempo!
@@ -329,12 +328,18 @@ async function startSyncedPlayback(base64Audio, durationMs) {
 function renderLoop() {
     if (!isPlaying) return;
     const elapsed = performance.now() - playbackStartTime;
     // Usar duracao dinamica para sincronizar com audio
     const targetFrame = Math.floor(elapsed / dynamicFrameDuration);
     const total = totalFrames || frameCount;
-    // So renderizar se for um frame diferente do anterior
     if (targetFrame !== lastRenderedFrame && targetFrame < total) {
         // Acesso O(1) ao frame pelo indice
         let frameToRender = frames[targetFrame];
@@ -362,12 +367,8 @@ function renderLoop() {
         progress.style.width = (displayedFrame / total * 100) + '%';
     }
-    // Continuar enquanto tiver audio ou frames
-    if (audioSource || targetFrame < total) {
-        animationId = requestAnimationFrame(renderLoop);
-    } else {
-        stopPlayback();
-    }
 }
 function stopPlayback() {
@@ -386,21 +387,30 @@ function stopPlayback() {
         audioSource = null;
     }
     // Sincronizar idle video para o tempo correto (onde a fala terminou)
-    if (endVideoTimeMs > 0) {
-        const targetTime = endVideoTimeMs / 1000;
-        // Garantir que o tempo esta dentro da duracao do video
-        if (idleVideo.duration > 0) {
-            idleVideo.currentTime = targetTime % idleVideo.duration;
-            console.log(`Idle video sync: ${targetTime.toFixed(2)}s`);
         }
         endVideoTimeMs = 0;  // Reset para proxima vez
     }
-    // Esconder canvas, mostrar idle
-    talkCanvas.style.display = 'none';
-    ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);
     frames = [];
     lastRenderedFrame = -1;
     setStatus('Pronto', 'ok');

         audioSource.connect(audioContext.destination);
         audioSource.onended = () => {
+            // Apenas marcar como null - o renderLoop vai detectar e parar
             audioSource = null;
         };
         // Calcular quantos frames usar baseado na duracao do audio
             lastRenderedFrame = 0;
         }
+        // Mostrar canvas
         talkCanvas.style.display = 'block';
         // INICIAR TUDO SINCRONIZADO: audio + video ao mesmo tempo!
 function renderLoop() {
     if (!isPlaying) return;
+    // Se audio terminou, parar imediatamente (transicao instantanea)
+    if (!audioSource) {
+        stopPlayback();
+        return;
+    }
     const elapsed = performance.now() - playbackStartTime;
     // Usar duracao dinamica para sincronizar com audio
     const targetFrame = Math.floor(elapsed / dynamicFrameDuration);
     const total = totalFrames || frameCount;
+    // So renderizar se for um frame diferente do anterior e dentro do limite
     if (targetFrame !== lastRenderedFrame && targetFrame < total) {
         // Acesso O(1) ao frame pelo indice
         let frameToRender = frames[targetFrame];
         progress.style.width = (displayedFrame / total * 100) + '%';
     }
+    // Continuar apenas enquanto audio estiver tocando
+    animationId = requestAnimationFrame(renderLoop);
 }
 function stopPlayback() {
         audioSource = null;
     }
+    // Esconder canvas IMEDIATAMENTE para evitar "travadinha"
+    // O video idle ja esta tocando por baixo, entao a transicao sera suave
+    talkCanvas.style.display = 'none';
+    ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);
     // Sincronizar idle video para o tempo correto (onde a fala terminou)
+    // Isso acontece em background, o usuario ja ve o video idle
+    if (endVideoTimeMs > 0 && idleVideo.duration > 0) {
+        const targetTime = (endVideoTimeMs / 1000) % idleVideo.duration;
+        console.log(`Idle video sync: seeking to ${targetTime.toFixed(2)}s (endVideoTimeMs=${endVideoTimeMs})`);
+        // Fazer o seek em background - video ja esta visivel
+        if (idleVideo.fastSeek) {
+            idleVideo.fastSeek(targetTime);
+        } else {
+            idleVideo.currentTime = targetTime;
         }
+        // Garantir que esta tocando
+        idleVideo.play().catch(() => {});
         endVideoTimeMs = 0;  // Reset para proxima vez
     }
     frames = [];
     lastRenderedFrame = -1;
     setStatus('Pronto', 'ok');

interface/server.py CHANGED Viewed

@@ -1,17 +1,14 @@
 """
-Interface Server - Streaming Paralelo
 Porta: 8080
-Arquitetura (ver CLAUDE.md):
-1. Recebe texto do frontend via WebSocket
-2. Conecta ao Orpheus (8081) e Wav2Lip (8082) EM PARALELO
-3. Recebe chunks de audio do Orpheus e frames do Wav2Lip
-4. Monta chunks (audio Orpheus + frames Wav2Lip) conforme chegam
-5. Envia chunks IMEDIATAMENTE para o frontend
-IMPORTANTE:
-- NAO modificar Wav2Lip - ele gera lip sync com eSpeak interno
-- Audio final = Orpheus (descartar audio do Wav2Lip)
 """
 from aiohttp import web
 import aiohttp
@@ -20,368 +17,523 @@ import json
 import base64
 import os
 import time
-import struct
 # Configuracao
-ORPHEUS_WS = os.getenv("ORPHEUS_WS", "ws://localhost:8081/ws")
 WAV2LIP_WS = os.getenv("WAV2LIP_WS", "ws://localhost:8082/ws")
 PORT = int(os.getenv("PORT", "8080"))
 # Constantes
-AUDIO_SAMPLE_RATE = 24000  # Orpheus
-VIDEO_FPS = 25  # Wav2Lip
-BYTES_PER_SAMPLE = 2  # 16-bit
-MS_PER_FRAME = 1000 / VIDEO_FPS  # 40ms
-AUDIO_BYTES_PER_FRAME = int(MS_PER_FRAME * AUDIO_SAMPLE_RATE * BYTES_PER_SAMPLE / 1000)  # 1920 bytes
-routes = web.RouteTableDef()
-def build_chunk(audio_bytes: bytes, frames: list) -> bytes:
-    """Build one binary chunk holding an audio segment and its frames.
-
-    Layout, every integer packed as a 4-byte big-endian unsigned value:
-    [audio_size][audio][num_frames], then [frame_size][frame] for each frame.
-    """
-    data = bytearray()
-    data.extend(struct.pack('>I', len(audio_bytes)))
-    data.extend(audio_bytes)
-    data.extend(struct.pack('>I', len(frames)))
-    for frame in frames:
-        data.extend(struct.pack('>I', len(frame)))
-        data.extend(frame)
-    return bytes(data)
-class ParallelStreamingSession:
-    """Streaming paralelo: Orpheus (audio) + Wav2Lip (frames)"""
-    def __init__(self, client_ws):
-        self.client_ws = client_ws
-        self.is_running = False
-        self.start_time = None
-        # Buffers compartilhados
-        self.audio_buffer = bytearray()
-        self.frame_buffer = []
-        # Locks para acesso thread-safe
-        self.buffer_lock = asyncio.Lock()
-        # Estados
-        self.orpheus_done = False
-        self.wav2lip_done = False
-        self.first_chunk_sent = False
-        self.chunk_index = 0
-        # Estatisticas
-        self.total_audio_bytes = 0
-        self.total_frames = 0
-        self.chunks_sent = 0
-    async def send_status(self, message: str):
-        """Envia status para o cliente."""
-        try:
-            if not self.client_ws.closed:
-                await self.client_ws.send_json({"type": "status", "message": message})
-        except:
-            pass
-    async def send_chunk(self, audio: bytes, frames: list):
-        """Monta e envia chunk para o cliente."""
-        if self.client_ws.closed or not audio or not frames:
-            return
-        try:
-            chunk_data = build_chunk(audio, frames)
-            chunk_b64 = base64.b64encode(chunk_data).decode()
-            audio_ms = len(audio) / BYTES_PER_SAMPLE / AUDIO_SAMPLE_RATE * 1000
-            await self.client_ws.send_json({
-                "type": "chunk",
-                "chunk_index": self.chunk_index,
-                "audio_size": len(audio),
-                "audio_duration_ms": int(audio_ms),
-                "num_frames": len(frames),
-                "data": chunk_b64
-            })
-            self.chunk_index += 1
-            self.chunks_sent += 1
-            if not self.first_chunk_sent:
-                self.first_chunk_sent = True
-                ttfb = int((time.time() - self.start_time) * 1000)
-                print(f"[Stream] Primeiro chunk: TTFB={ttfb}ms")
-                await self.client_ws.send_json({"type": "stream_start", "ttfb_ms": ttfb})
-        except Exception as e:
-            print(f"[Stream] Erro enviando chunk: {e}")
-    async def try_send_chunks(self):
-        """Tenta montar e enviar chunks com dados disponiveis."""
-        async with self.buffer_lock:
-            # Enquanto tiver 1 frame + audio correspondente
-            while len(self.frame_buffer) > 0 and len(self.audio_buffer) >= AUDIO_BYTES_PER_FRAME:
-                # Pega 1 frame
-                frame = self.frame_buffer.pop(0)
-                # Pega audio correspondente (~40ms = 1920 bytes)
-                audio = bytes(self.audio_buffer[:AUDIO_BYTES_PER_FRAME])
-                del self.audio_buffer[:AUDIO_BYTES_PER_FRAME]
-                await self.send_chunk(audio, [frame])
-            # Se ambos terminaram, enviar o que sobrou
-            if self.orpheus_done and self.wav2lip_done:
-                if self.frame_buffer and self.audio_buffer:
-                    # Dividir audio restante pelos frames restantes
-                    audio_per_frame = len(self.audio_buffer) // len(self.frame_buffer) if self.frame_buffer else 0
-                    audio_per_frame = max(audio_per_frame, 2)  # Minimo 2 bytes
-                    audio_per_frame = audio_per_frame - (audio_per_frame % 2)  # Alinhamento 16-bit
-                    while self.frame_buffer and self.audio_buffer:
-                        frame = self.frame_buffer.pop(0)
-                        audio_size = min(audio_per_frame, len(self.audio_buffer))
-                        if not self.frame_buffer:  # Ultimo frame pega todo o resto
-                            audio_size = len(self.audio_buffer)
-                        audio = bytes(self.audio_buffer[:audio_size])
-                        del self.audio_buffer[:audio_size]
-                        await self.send_chunk(audio, [frame])
-                elif self.frame_buffer:
-                    # Frames sem audio - enviar com audio vazio
-                    for frame in self.frame_buffer:
-                        await self.send_chunk(b'', [frame])
-                    self.frame_buffer.clear()
-    async def stream_orpheus(self, text: str, voice: str):
-        """Conecta ao Orpheus e recebe chunks de audio."""
-        try:
-            print(f"[Orpheus] Conectando a {ORPHEUS_WS}...")
-            async with aiohttp.ClientSession() as session:
-                ws = await session.ws_connect(ORPHEUS_WS, timeout=aiohttp.ClientWSTimeout(ws_close=120))
-                # Enviar requisicao
-                await ws.send_json({
-                    "action": "synthesize",
-                    "text": text,
-                    "voice": voice,
-                    "stream": True
-                })
-                print(f"[Orpheus] Requisicao enviada")
-                async for msg in ws:
-                    if not self.is_running:
-                        break
-                    if msg.type == aiohttp.WSMsgType.TEXT:
-                        data = json.loads(msg.data)
-                        msg_type = data.get("type", "")
-                        if msg_type == "audio_chunk":
-                            # Decodificar e adicionar ao buffer
-                            audio_b64 = data.get("audio", "")
-                            audio_bytes = base64.b64decode(audio_b64)
-                            async with self.buffer_lock:
-                                self.audio_buffer.extend(audio_bytes)
-                                self.total_audio_bytes += len(audio_bytes)
-                            # Tentar enviar chunks
-                            await self.try_send_chunks()
-                            chunk_idx = data.get("chunk_index", 0)
-                            if chunk_idx == 1:
-                                print(f"[Orpheus] Primeiro chunk de audio recebido")
-                        elif msg_type == "done":
-                            total = data.get("total_bytes", 0)
-                            print(f"[Orpheus] Concluido: {total} bytes")
-                            break
-                        elif msg_type == "error":
-                            print(f"[Orpheus] Erro: {data.get('message')}")
-                            break
-                    elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
-                        break
-                await ws.close()
-        except Exception as e:
-            print(f"[Orpheus] Erro: {e}")
-            import traceback
-            traceback.print_exc()
-        finally:
-            self.orpheus_done = True
-            await self.try_send_chunks()
-    async def stream_wav2lip(self, text: str, voice: str):
-        """Conecta ao Wav2Lip e recebe frames."""
-        try:
-            print(f"[Wav2Lip] Conectando a {WAV2LIP_WS}...")
-            async with aiohttp.ClientSession() as session:
-                ws = await session.ws_connect(WAV2LIP_WS, timeout=aiohttp.ClientWSTimeout(ws_close=120))
-                # Enviar requisicao
-                await ws.send_json({
-                    "action": "generate",
-                    "text": text,
-                    "voice": voice
-                })
-                print(f"[Wav2Lip] Requisicao enviada")
-                async for msg in ws:
-                    if not self.is_running:
-                        break
-                    if msg.type == aiohttp.WSMsgType.TEXT:
-                        data = json.loads(msg.data)
-                        msg_type = data.get("type", "")
-                        if msg_type == "frame":
-                            # Decodificar e adicionar ao buffer
-                            frame_b64 = data.get("frame", "")
-                            frame_bytes = base64.b64decode(frame_b64)
-                            async with self.buffer_lock:
-                                self.frame_buffer.append(frame_bytes)
-                                self.total_frames += 1
-                            # Tentar enviar chunks
-                            await self.try_send_chunks()
-                            frame_idx = data.get("index", 0)
-                            if frame_idx == 0:
-                                print(f"[Wav2Lip] Primeiro frame recebido")
-                        elif msg_type == "status":
-                            print(f"[Wav2Lip] {data.get('message')}")
-                        elif msg_type == "first_chunk":
-                            print(f"[Wav2Lip] eSpeak latency: {data.get('latency_ms')}ms")
-                        elif msg_type == "full_audio":
-                            # Ignorar - usamos audio do Orpheus
-                            print(f"[Wav2Lip] Audio ignorado (usando Orpheus)")
-                        elif msg_type == "done":
-                            frames = data.get("frames", 0)
-                            print(f"[Wav2Lip] Concluido: {frames} frames")
-                            break
-                        elif msg_type == "error":
-                            print(f"[Wav2Lip] Erro: {data.get('message')}")
-                            await self.send_status(f"Erro: {data.get('message')}")
-                            break
-                    elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
-                        break
-                await ws.close()
-        except Exception as e:
-            print(f"[Wav2Lip] Erro: {e}")
-            import traceback
-            traceback.print_exc()
-        finally:
-            self.wav2lip_done = True
-            await self.try_send_chunks()
-    async def run(self, text: str, voice: str):
-        """Executa streaming paralelo."""
-        self.is_running = True
-        self.start_time = time.time()
-        await self.send_status("Conectando aos servicos...")
-        try:
-            # Conectar Orpheus e Wav2Lip EM PARALELO
-            orpheus_task = asyncio.create_task(self.stream_orpheus(text, voice))
-            wav2lip_task = asyncio.create_task(self.stream_wav2lip(text, voice))
-            # Aguardar ambos terminarem
-            await asyncio.gather(orpheus_task, wav2lip_task)
-            # Enviar chunks restantes
-            await self.try_send_chunks()
-        except Exception as e:
-            print(f"[Stream] Erro: {e}")
-            await self.client_ws.send_json({"type": "error", "message": str(e)})
-            return
-        elapsed = time.time() - self.start_time
-        print(f"[Stream] Concluido: {elapsed:.2f}s, {self.chunks_sent} chunks, {self.total_frames} frames, {self.total_audio_bytes} bytes")
         try:
-            await self.client_ws.send_json({
-                "type": "done",
-                "total_chunks": self.chunks_sent,
-                "total_frames": self.total_frames,
-                "total_audio_bytes": self.total_audio_bytes,
-                "elapsed_ms": int(elapsed * 1000)
             })
-        except:
-            pass
-    def stop(self):
-        self.is_running = False
-@routes.get("/ws")
-async def websocket_handler(request):
-    ws = web.WebSocketResponse()
-    await ws.prepare(request)
-    print("Cliente conectado")
-    session = None
-    try:
-        async for msg in ws:
-            if msg.type == aiohttp.WSMsgType.TEXT:
-                try:
                     data = json.loads(msg.data)
-                    action = data.get("action", "")
-                    if action == "generate":
-                        text = data.get("text", "").strip()
-                        voice = data.get("voice", "tara")
-                        if not text:
-                            await ws.send_json({"type": "error", "message": "Text required"})
-                            continue
-                        print(f"Gerando: '{text[:50]}...' voice={voice}")
-                        if session:
-                            session.stop()
-                        session = ParallelStreamingSession(ws)
-                        await session.run(text, voice)
-                    elif action == "stop":
-                        if session:
-                            session.stop()
-                            await ws.send_json({"type": "stopped"})
-                    elif action == "ping":
-                        await ws.send_json({"type": "pong"})
-                except json.JSONDecodeError:
-                    await ws.send_json({"type": "error", "message": "Invalid JSON"})
-            elif msg.type == aiohttp.WSMsgType.ERROR:
-                print(f"WebSocket error: {ws.exception()}")
-                break
-    except Exception as e:
-        print(f"Erro: {e}")
-    finally:
-        if session:
-            session.stop()
-        print("Cliente desconectado")
-    return ws
 @routes.get("/")
@@ -389,11 +541,6 @@ async def index(request):
     return web.FileResponse(os.path.join(os.path.dirname(__file__), "index.html"))
-@routes.get("/idle.mp4")
-async def idle_video(request):
-    return web.FileResponse(os.path.join(os.path.dirname(__file__), "idle.mp4"))
 @routes.get("/{filename}")
 async def static_file(request):
     filename = request.match_info["filename"]
@@ -405,21 +552,41 @@ async def static_file(request):
 @routes.get("/health")
 async def health(request):
-    return web.json_response({"status": "ok", "mode": "parallel_streaming"})
 app = web.Application()
 app.add_routes(routes)
 if __name__ == "__main__":
     print("=" * 50)
-    print("Interface Server - Streaming Paralelo")
     print("=" * 50)
     print(f"Porta: {PORT}")
-    print(f"Orpheus: {ORPHEUS_WS}")
     print(f"Wav2Lip: {WAV2LIP_WS}")
-    print(f"Audio: {AUDIO_SAMPLE_RATE}Hz, {AUDIO_BYTES_PER_FRAME} bytes/frame")
-    print(f"Video: {VIDEO_FPS}fps, {MS_PER_FRAME}ms/frame")
     print("=" * 50)
     web.run_app(app, host="0.0.0.0", port=PORT)

 """
+Interface Server - WebRTC Streaming com VP9
 Porta: 8080
+Arquitetura:
+1. Cliente conecta via WebRTC (signaling por WebSocket)
+2. Servidor envia stream de video VP9 + audio Opus
+3. Fusao idle/lip-sync feita no backend
+4. Frontend apenas renderiza o <video>
+Framework: aiortc (https://github.com/aiortc/aiortc)
 """
 from aiohttp import web
 import aiohttp
 import base64
 import os
 import time
+import uuid
+import fractions
+import numpy as np
+from av import VideoFrame, AudioFrame
+from aiortc import RTCPeerConnection, RTCSessionDescription, MediaStreamTrack, RTCConfiguration, RTCIceServer
+from aiortc.contrib.media import MediaRelay
+import cv2
+import subprocess
+import tempfile
 # Configuracao
 WAV2LIP_WS = os.getenv("WAV2LIP_WS", "ws://localhost:8082/ws")
 PORT = int(os.getenv("PORT", "8080"))
+IDLE_VIDEO = os.path.join(os.path.dirname(__file__), "idle.mp4")
 # Constantes
+VIDEO_FPS = 25
+AUDIO_SAMPLE_RATE = 24000
+VIDEO_TIME_BASE = fractions.Fraction(1, VIDEO_FPS)
+AUDIO_TIME_BASE = fractions.Fraction(1, AUDIO_SAMPLE_RATE)
+# Cache global
+idle_frames_cache = []
+pcs = set()  # Track peer connections
+relay = MediaRelay()
+routes = web.RouteTableDef()
+def calculate_frame_difference(frame1, frame2):
+    """Return the mean absolute pixel difference between two frames, divided by 255.
+    0 means identical frames; a missing frame (None) is reported as 1.0.
+    Three-dimensional inputs are converted with COLOR_RGB2GRAY first, and the
+    second frame is resized to the first frame's resolution when shapes differ.
+    """
+    if frame1 is None or frame2 is None:
+        return 1.0
+    grays = []
+    for image in (frame1, frame2):
+        # Only 3-D arrays are converted; anything else is compared as-is.
+        if len(image.shape) == 3:
+            grays.append(cv2.cvtColor(image, cv2.COLOR_RGB2GRAY))
+        else:
+            grays.append(image)
+    reference, candidate = grays
+    if reference.shape != candidate.shape:
+        # cv2.resize expects (width, height), i.e. (columns, rows).
+        target_size = (reference.shape[1], reference.shape[0])
+        candidate = cv2.resize(candidate, target_size)
+    return np.mean(cv2.absdiff(reference, candidate)) / 255.0
+def find_best_matching_idle_frame(last_speak_frame, idle_frames, sample_step=10):
+    """
+    Find the idle frame most similar to the last speaking frame.
+    Works in two passes to stay fast: a coarse scan every sample_step frames,
+    then an exhaustive scan of the neighbourhood around the coarse winner.
+    Returns (index, difference); (0, inf) when there is nothing to compare.
+    """
+    if not idle_frames or last_speak_frame is None:
+        return 0, float('inf')
+    total = len(idle_frames)
+    winner = 0
+    lowest = float('inf')
+    def consider(position):
+        # Keep the first strictly-better candidate (ties keep the earlier one).
+        nonlocal winner, lowest
+        score = calculate_frame_difference(last_speak_frame, idle_frames[position])
+        if score < lowest:
+            lowest = score
+            winner = position
+    # Pass 1: coarse sampling
+    for position in range(0, total, sample_step):
+        consider(position)
+    # Pass 2: refine around the coarse winner (window fixed before scanning)
+    window = range(max(0, winner - sample_step), min(total, winner + sample_step))
+    for position in window:
+        consider(position)
+    return winner, lowest
+def trim_high_motion_frames(frames, threshold_multiplier=1.0, max_trim=20):
+    """
+    Drop trailing frames whose frame-to-frame motion is abnormally high (jumps).
+    The difference between consecutive frames is measured over the last (up to)
+    20 transitions. A transition counts as a jump when it exceeds
+    mean + threshold_multiplier * std of those differences, capped at 0.7.
+    Jumps are removed only while they form an unbroken run at the very end of
+    the clip, and at most max_trim frames are removed.
+    Returns (frames, last_frame): the possibly shortened list and its final
+    frame; last_frame is None only when the list is empty.
+    """
+    window = 20
+    if len(frames) < window:
+        # Too short to analyse, but the caller still needs the last frame to
+        # choose a matching idle frame. Returning None here (as before) disabled
+        # the smooth transition for every short clip.
+        return frames, frames[-1] if frames else None
+    # With at least `window` frames the first analysed transition is never
+    # index 0, so every index has a predecessor.
+    first = max(1, len(frames) - window)
+    differences = [
+        (i, calculate_frame_difference(frames[i - 1], frames[i]))
+        for i in range(first, len(frames))
+    ]
+    scores = [score for _, score in differences]
+    # Aggressive threshold, never allowed above the 0.7 cap
+    threshold = min(np.mean(scores) + threshold_multiplier * np.std(scores), 0.7)
+    # Walk backwards from the end while the transitions keep being jumps
+    trim_from = len(frames)
+    for idx, score in reversed(differences):
+        if not score > threshold:
+            break
+        trim_from = idx
+        if len(frames) - trim_from >= max_trim:
+            break
+    frames_to_trim = len(frames) - trim_from
+    if 0 < frames_to_trim <= max_trim:
+        print(f"[Trim] Removendo {frames_to_trim} frames problematicos")
+        frames = frames[:trim_from]
+    return frames, frames[-1] if frames else None
+def load_idle_frames():
+    """Decode idle.mp4 into RGB numpy frames, memoised in idle_frames_cache.
+    Returns the cached list when it is already populated, and a fresh empty
+    list when the video file does not exist.
+    """
+    global idle_frames_cache
+    if idle_frames_cache:
+        return idle_frames_cache
+    if not os.path.exists(IDLE_VIDEO):
+        print(f"[Idle] Arquivo nao encontrado: {IDLE_VIDEO}")
+        return []
+    print(f"[Idle] Carregando frames de {IDLE_VIDEO}...")
+    capture = cv2.VideoCapture(IDLE_VIDEO)
+    ok, bgr = capture.read()
+    while ok:
+        # OpenCV decodes to BGR; the cache stores RGB
+        idle_frames_cache.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
+        ok, bgr = capture.read()
+    capture.release()
+    print(f"[Idle] Carregados {len(idle_frames_cache)} frames")
+    return idle_frames_cache
+class AvatarVideoTrack(MediaStreamTrack):
+    """
+    Video track that streams the avatar frames.
+    Loops over the idle clip and switches to lip-sync frames while speaking.
+    """
+    kind = "video"
+    def __init__(self):
+        super().__init__()
+        self.idle_frames = load_idle_frames()
+        self.current_idx = 0
+        self.frame_count = 0
+        self.start_time = None
+        # Speaking state
+        self.is_speaking = False
+        self.speaking_frames = []
+        self.speaking_idx = 0
+        self.best_idle_idx = None  # Idle frame to resume from for a smooth transition
+        # Video dimensions (also used to size the black fallback frame)
+        if self.idle_frames:
+            self.height, self.width = self.idle_frames[0].shape[:2]
+        else:
+            self.width, self.height = 640, 480
+        print(f"[VideoTrack] Inicializado: {self.width}x{self.height} @ {VIDEO_FPS}fps")
+    def _next_idle_frame(self):
+        """Return the next idle frame (looping), or a black frame when no idle video is loaded."""
+        if not self.idle_frames:
+            return np.zeros((self.height, self.width, 3), dtype=np.uint8)
+        frame_data = self.idle_frames[self.current_idx % len(self.idle_frames)]
+        self.current_idx += 1
+        return frame_data
+    async def recv(self):
+        """Return the next video frame, paced to VIDEO_FPS."""
+        if self.start_time is None:
+            self.start_time = time.time()
+        # VIDEO_TIME_BASE is 1/VIDEO_FPS, so the pts advances one tick per frame.
+        pts = self.frame_count
+        self.frame_count += 1
+        # Pick the frame: speaking or idle
+        frame_data = None
+        if self.is_speaking and self.speaking_frames:
+            if self.speaking_idx < len(self.speaking_frames):
+                frame_data = self.speaking_frames[self.speaking_idx]
+                self.speaking_idx += 1
+            else:
+                # Speech finished: reset the state and resume the idle loop
+                self.is_speaking = False
+                self.speaking_frames = []
+                self.speaking_idx = 0
+                # Resume at the pre-computed idle frame for a smooth transition
+                if self.best_idle_idx is not None and self.idle_frames:
+                    self.current_idx = self.best_idle_idx
+                    self.best_idle_idx = None
+                    print(f"[VideoTrack] Transicao suave -> idle frame {self.current_idx}")
+        if frame_data is None:
+            # Shared idle path. It also covers the end of speech, so a missing
+            # idle video yields a black frame instead of a modulo-by-zero.
+            frame_data = self._next_idle_frame()
+        # Build the VideoFrame
+        frame = VideoFrame.from_ndarray(frame_data, format="rgb24")
+        frame.pts = pts
+        frame.time_base = VIDEO_TIME_BASE
+        # Keep the output at VIDEO_FPS: sleep until this frame is due
+        elapsed = time.time() - self.start_time
+        expected = self.frame_count / VIDEO_FPS
+        if expected > elapsed:
+            await asyncio.sleep(expected - elapsed)
+        return frame
+    def set_speaking_frames(self, frames):
+        """Queue lip-sync frames for playback and prepare a smooth return to idle."""
+        # Drop jumpy frames from the end of the clip
+        trimmed_frames, last_frame = trim_high_motion_frames(frames)
+        # Find the idle frame closest to where the speech ends
+        if last_frame is not None and self.idle_frames:
+            best_idx, best_diff = find_best_matching_idle_frame(
+                last_frame, self.idle_frames, sample_step=10
+            )
+            self.best_idle_idx = best_idx
+            print(f"[VideoTrack] Best match: idle frame {best_idx} (diff: {best_diff:.2f})")
+        else:
+            self.best_idle_idx = None
+        self.speaking_frames = trimmed_frames
+        self.speaking_idx = 0
+        # An empty clip must not leave the track flagged as speaking forever
+        self.is_speaking = bool(trimmed_frames)
+        print(f"[VideoTrack] Speaking: {len(trimmed_frames)} frames (original: {len(frames)})")
+class AvatarAudioTrack(MediaStreamTrack):
+    """
+    Audio track that streams the queued speech audio, or silence when idle.
+    """
+    kind = "audio"
+    def __init__(self):
+        super().__init__()
+        self.sample_rate = AUDIO_SAMPLE_RATE
+        self.samples_per_frame = 960  # 40ms @ 24kHz
+        self.frame_count = 0
+        self.start_time = None
+        # Queued audio, one numpy chunk per output frame
+        self.audio_buffer = []
+        self.buffer_idx = 0
+        print(f"[AudioTrack] Inicializado: {self.sample_rate}Hz")
+    async def recv(self):
+        """Return the next audio frame, paced to real time."""
+        if self.start_time is None:
+            self.start_time = time.time()
+        pts = self.frame_count * self.samples_per_frame
+        self.frame_count += 1
+        # Take the next queued chunk, or emit silence
+        if self.audio_buffer and self.buffer_idx < len(self.audio_buffer):
+            samples = self.audio_buffer[self.buffer_idx]
+            self.buffer_idx += 1
+        else:
+            samples = np.zeros(self.samples_per_frame, dtype=np.int16)
+        # Build the AudioFrame
+        frame = AudioFrame(format="s16", layout="mono", samples=len(samples))
+        frame.sample_rate = self.sample_rate
+        frame.pts = pts
+        frame.time_base = AUDIO_TIME_BASE
+        # Copy the samples into the frame
+        frame.planes[0].update(samples.tobytes())
+        # Pace the output: sleep until this frame is due
+        elapsed = time.time() - self.start_time
+        expected = self.frame_count * self.samples_per_frame / self.sample_rate
+        if expected > elapsed:
+            await asyncio.sleep(expected - elapsed)
+        return frame
+    def set_audio(self, pcm_data):
+        """Queue PCM audio for playback, replacing anything queued before.
+        The bytes are read as int16 samples and split into chunks of
+        samples_per_frame; the last chunk is zero-padded. A trailing odd byte
+        is dropped instead of making np.frombuffer raise ValueError.
+        """
+        sample_count = len(pcm_data) // 2
+        if len(pcm_data) % 2:
+            print(f"[AudioTrack] Byte final incompleto descartado ({len(pcm_data)} bytes)")
+        # count= limits the read to whole int16 samples
+        samples = np.frombuffer(pcm_data, dtype=np.int16, count=sample_count)
+        # Zero-pad to a whole number of frames, then split into 40ms chunks
+        remainder = len(samples) % self.samples_per_frame
+        if remainder:
+            samples = np.pad(samples, (0, self.samples_per_frame - remainder))
+        self.audio_buffer = list(samples.reshape(-1, self.samples_per_frame))
+        self.buffer_idx = 0
+        print(f"[AudioTrack] Audio: {len(self.audio_buffer)} frames ({len(pcm_data)} bytes)")
+class AvatarSession:
+    """Manage one WebRTC session: its peer connection, tracks and Wav2Lip link."""
+    def __init__(self, pc, video_track, audio_track):
+        self.pc = pc
+        self.video_track = video_track
+        self.audio_track = audio_track
+        # Most recent Wav2Lip connection, kept so close() can abort it
+        self.wav2lip_ws = None
+        self.wav2lip_session = None
+    async def generate(self, text: str, voice: str):
+        """Request lip-synced speech from Wav2Lip and hand it to the tracks.
+        All frames and the full audio are collected first, then applied to the
+        video and audio tracks together. Errors are logged, never raised, and
+        the Wav2Lip connection is always closed.
+        """
+        print(f"[Session] Gerando: '{text[:50]}...'")
+        # Local handles, so cleanup closes exactly the connection opened here
+        http = ws = None
         try:
+            # Connect to Wav2Lip
+            http = self.wav2lip_session = aiohttp.ClientSession()
+            ws = self.wav2lip_ws = await http.ws_connect(
+                WAV2LIP_WS,
+                timeout=aiohttp.ClientWSTimeout(ws_close=120)
+            )
+            # Send the request
+            await ws.send_json({
+                "action": "generate",
+                "text": text,
+                "voice": voice
             })
+            speaking_frames = []
+            audio_data = b''
+            # Receive frames and audio until done, error or disconnect
+            async for msg in ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
                     data = json.loads(msg.data)
+                    msg_type = data.get("type", "")
+                    if msg_type == "frame":
+                        frame_b64 = data.get("frame", "")
+                        if frame_b64:
+                            # Decode the JPEG into a numpy image
+                            jpeg_data = base64.b64decode(frame_b64)
+                            nparr = np.frombuffer(jpeg_data, np.uint8)
+                            frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+                            if frame is None:
+                                # imdecode returns None for undecodable data;
+                                # skip the frame instead of losing the utterance
+                                print("[Session] Frame JPEG invalido ignorado")
+                                continue
+                            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                            speaking_frames.append(frame_rgb)
+                    elif msg_type == "full_audio":
+                        audio_b64 = data.get("audio", "")
+                        if audio_b64:
+                            audio_data = base64.b64decode(audio_b64)
+                    elif msg_type == "done":
+                        break
+                    elif msg_type == "error":
+                        print(f"[Session] Erro Wav2Lip: {data.get('message')}")
+                        break
+                elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
+                    break
+            # Hand the frames and the audio to the tracks
+            if speaking_frames:
+                self.video_track.set_speaking_frames(speaking_frames)
+            if audio_data:
+                self.audio_track.set_audio(audio_data)
+            print(f"[Session] Gerado: {len(speaking_frames)} frames, {len(audio_data)} bytes audio")
+        except Exception as e:
+            print(f"[Session] Erro: {e}")
+            import traceback
+            traceback.print_exc()
+        finally:
+            # Always release the connection, including on failure; previously an
+            # exception left the websocket and the HTTP session open.
+            if ws is not None and not ws.closed:
+                await ws.close()
+            if http is not None:
+                await http.close()
+    async def close(self):
+        """Close the Wav2Lip websocket and HTTP session, if any are open."""
+        if self.wav2lip_ws and not self.wav2lip_ws.closed:
+            await self.wav2lip_ws.close()
+        if self.wav2lip_session:
+            await self.wav2lip_session.close()
+# Armazenar sessoes ativas
+sessions = {}
+@routes.post("/offer")
+async def offer(request):
+    """Receive the client's SDP offer and reply with the SDP answer.
+    Creates a peer connection with a video and an audio track, registers it
+    under a new session id and returns {sdp, type, session_id}. A body that is
+    not valid JSON or lacks a usable sdp/type gets a 400 response.
+    """
+    try:
+        params = await request.json()
+        remote_description = RTCSessionDescription(sdp=params["sdp"], type=params["type"])
+    except (ValueError, KeyError, TypeError):
+        # Malformed JSON, missing keys, wrong body shape or invalid SDP type
+        return web.json_response({"error": "Offer SDP invalido"}, status=400)
+    # ICE servers (public STUN + TURN)
+    # NOTE(security): the TURN credentials below are hard-coded in source.
+    # They should come from configuration/environment, not the repository.
+    ice_servers = [
+        RTCIceServer(urls=["stun:stun.l.google.com:19302"]),
+        # TURN servers with multiple URLs
+        RTCIceServer(
+            urls=[
+                "turn:openrelay.metered.ca:80",
+                "turn:openrelay.metered.ca:443",
+                "turn:openrelay.metered.ca:443?transport=tcp"
+            ],
+            username="openrelayproject",
+            credential="openrelayproject"
+        ),
+        # Alternative TURN (Twilio)
+        RTCIceServer(
+            urls=["turn:global.turn.twilio.com:3478?transport=udp"],
+            username="f4b4035eaa76f4a55de5f4351567653ee4ff6fa97b50b6b334fcc1be9c27212d",
+            credential="w1uxM55V9yVoqyVFjt+mxDBV0F87AUCemaYVQGxsPLw="
+        ),
+    ]
+    config = RTCConfiguration(iceServers=ice_servers)
+    pc = RTCPeerConnection(configuration=config)
+    pc_id = str(uuid.uuid4())
+    pcs.add(pc)
+    print(f"[WebRTC] Nova conexao: {pc_id}")
+    # Create the tracks
+    video_track = AvatarVideoTrack()
+    audio_track = AvatarAudioTrack()
+    # Attach the tracks to the peer connection
+    pc.addTrack(video_track)
+    pc.addTrack(audio_track)
+    # Register the session
+    session = AvatarSession(pc, video_track, audio_track)
+    sessions[pc_id] = session
+    @pc.on("iceconnectionstatechange")
+    async def on_ice_state():
+        print(f"[ICE] Estado: {pc.iceConnectionState}")
+    @pc.on("icegatheringstatechange")
+    async def on_ice_gathering():
+        print(f"[ICE] Gathering: {pc.iceGatheringState}")
+    @pc.on("connectionstatechange")
+    async def on_connectionstatechange():
+        print(f"[WebRTC] Estado: {pc.connectionState}")
+        if pc.connectionState in ("failed", "closed"):
+            await pc.close()
+            pcs.discard(pc)
+            # pop() first so a concurrent cleanup cannot trip over a missing key
+            closed_session = sessions.pop(pc_id, None)
+            if closed_session is not None:
+                await closed_session.close()
+    # Apply the offer and create the answer
+    try:
+        await pc.setRemoteDescription(remote_description)
+        answer = await pc.createAnswer()
+        await pc.setLocalDescription(answer)
+    except Exception:
+        # Negotiation failed: do not leave the connection and session registered
+        pcs.discard(pc)
+        sessions.pop(pc_id, None)
+        await session.close()
+        await pc.close()
+        raise
+    return web.json_response({
+        "sdp": pc.localDescription.sdp,
+        "type": pc.localDescription.type,
+        "session_id": pc_id
+    })
+@routes.post("/generate")
+async def generate(request):
+    """Start lip-synced speech generation for an existing session.
+    Expects JSON with session_id, text and an optional voice (default "tara").
+    Responds immediately with {"status": "generating"}; the generation itself
+    runs as a background task. Invalid session or empty text gets a 400.
+    """
+    params = await request.json()
+    session_id = params.get("session_id")
+    raw_text = params.get("text", "")
+    # A non-string text is treated as missing instead of crashing on .strip()
+    text = raw_text.strip() if isinstance(raw_text, str) else ""
+    voice = params.get("voice", "tara")
+    if not isinstance(session_id, str) or not session_id or session_id not in sessions:
+        return web.json_response({"error": "Sessao invalida"}, status=400)
+    if not text:
+        return web.json_response({"error": "Texto obrigatorio"}, status=400)
+    session = sessions[session_id]
+    task = asyncio.create_task(session.generate(text, voice))
+    # The event loop keeps only a weak reference to tasks, so hold a strong one
+    # on the session until the task finishes, or it may be collected mid-run.
+    background = getattr(session, "background_tasks", None)
+    if background is None:
+        background = session.background_tasks = set()
+    background.add(task)
+    task.add_done_callback(background.discard)
+    return web.json_response({"status": "generating"})
 @routes.get("/")
     return web.FileResponse(os.path.join(os.path.dirname(__file__), "index.html"))
 @routes.get("/{filename}")
 async def static_file(request):
     filename = request.match_info["filename"]
 @routes.get("/health")
 async def health(request):
+    """Report service status, streaming mode and the number of tracked peer connections."""
+    payload = {"status": "ok", "mode": "webrtc"}
+    payload["connections"] = len(pcs)
+    return web.json_response(payload)
+async def on_shutdown(app):
+    """Close every peer connection and Wav2Lip session when the server stops.
+    Failures are logged rather than raised, so one broken connection cannot
+    prevent the remaining ones from being closed.
+    """
+    # Snapshot first: the connection-state handlers mutate pcs/sessions
+    closers = [pc.close() for pc in pcs]
+    closers += [session.close() for session in list(sessions.values())]
+    results = await asyncio.gather(*closers, return_exceptions=True)
+    for result in results:
+        if isinstance(result, Exception):
+            print(f"[Shutdown] Erro ao fechar: {result}")
+    pcs.clear()
 app = web.Application()
 app.add_routes(routes)
+# Close all peer connections when the application shuts down
+app.on_shutdown.append(on_shutdown)
 if __name__ == "__main__":
     print("=" * 50)
+    print("Interface Server - WebRTC VP9 Streaming")
     print("=" * 50)
     print(f"Porta: {PORT}")
+    print(f"Idle Video: {IDLE_VIDEO}")
     print(f"Wav2Lip: {WAV2LIP_WS}")
     print("=" * 50)
+    print("Endpoints:")
+    print("  POST /offer - WebRTC signaling")
+    print("  POST /generate - Gerar fala")
+    print("=" * 50)
+    # Pre-load the idle frames at startup so the cache is already filled when
+    # the first video track is created
+    print("Carregando idle frames...")
+    load_idle_frames()
+    print("=" * 50)
     web.run_app(app, host="0.0.0.0", port=PORT)

interface/server_optimized.py CHANGED Viewed

@@ -48,6 +48,8 @@ class OptimizedSession:
         self.total_frames = 0
         self.total_bytes = 0
         self.end_video_time_ms = None  # Tempo do vídeo onde parou (para sync)
     async def send_json(self, msg_type: str, **kwargs):
         """Envia mensagem JSON (para status/controle)."""
@@ -162,7 +164,7 @@ class OptimizedSession:
         return total_chunks > 0
-    async def run(self, text: str, voice: str):
         """Streaming otimizado - áudio e frames em paralelo, enviados conforme chegam."""
         self.is_running = True
         self.start_time = time.time()
@@ -186,17 +188,19 @@ class OptimizedSession:
                 await self.send_json("status", message="Gerando...")
-                # Enviar texto para Wav2Lip
                 await wav2lip_ws.send_json({
-                    "action": "speak",
                     "text": text,
                     "voice": voice,
-                    "parallel": True
                 })
                 print(f"[Optimized] Texto: '{text[:50]}...'")
                 first_frame_time = None
                 # Receber frames do Wav2Lip
                 async for msg in wav2lip_ws:
@@ -217,8 +221,10 @@ class OptimizedSession:
                             if first_frame_time is None:
                                 first_frame_time = time.time() - self.start_time
                                 latency_ms = int(first_frame_time * 1000)
-                                print(f"[Optimized] Primeiro frame: {latency_ms}ms")
-                                await self.send_json("first_frame", latency_ms=latency_ms)
                         elif msg_type == "full_audio":
                             print(f"[Optimized] Ignorando audio Wav2Lip (usando Orpheus streaming)")
@@ -234,10 +240,14 @@ class OptimizedSession:
                             break
                         elif msg_type == "done":
-                            # Capturar end_video_time_ms para sincronização
                             self.end_video_time_ms = data.get("end_video_time_ms")
                             end_frame_idx = data.get("end_frame_idx")
-                            print(f"[Optimized] Wav2Lip done: {self.total_frames} frames, end_video_time_ms={self.end_video_time_ms}, end_frame_idx={end_frame_idx}")
                             break
                     elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
@@ -272,6 +282,8 @@ class OptimizedSession:
             total_frames=self.total_frames,
             elapsed_ms=int(elapsed * 1000),
             bytes_sent=self.total_bytes,
             end_video_time_ms=self.end_video_time_ms
         )
@@ -297,18 +309,20 @@ async def websocket_handler(request):
                     if action == "generate":
                         text = data.get("text", "").strip()
                         voice = data.get("voice", "tara")
                         if not text:
                             await ws.send_json({"type": "error", "message": "Text required"})
                             continue
-                        print(f"Gerando: '{text[:50]}...'")
                         if session:
                             session.stop()
                         session = OptimizedSession(ws)
-                        await session.run(text, voice)
                     elif action == "stop":
                         if session:

         self.total_frames = 0
         self.total_bytes = 0
         self.end_video_time_ms = None  # Tempo do vídeo onde parou (para sync)
+        self.start_frame_idx = None    # Frame inicial usado pelo Wav2Lip
+        self.end_frame_idx = None      # Frame final usado pelo Wav2Lip
     async def send_json(self, msg_type: str, **kwargs):
         """Envia mensagem JSON (para status/controle)."""
         return total_chunks > 0
+    async def run(self, text: str, voice: str, jpeg_quality: int = 95, idle_video_time_ms: int = 0):
         """Streaming otimizado - áudio e frames em paralelo, enviados conforme chegam."""
         self.is_running = True
         self.start_time = time.time()
                 await self.send_json("status", message="Gerando...")
+                # Enviar texto para Wav2Lip com action "generate" para receber end_video_time_ms
                 await wav2lip_ws.send_json({
+                    "action": "generate",
                     "text": text,
                     "voice": voice,
+                    "idle_video_time_ms": idle_video_time_ms,  # Tempo inicial do vídeo idle
+                    "jpeg_quality": jpeg_quality  # Qualidade do JPEG (50-100)
                 })
                 print(f"[Optimized] Texto: '{text[:50]}...'")
                 first_frame_time = None
+                start_frame_idx = None  # Frame inicial usado pelo Wav2Lip
                 # Receber frames do Wav2Lip
                 async for msg in wav2lip_ws:
                             if first_frame_time is None:
                                 first_frame_time = time.time() - self.start_time
                                 latency_ms = int(first_frame_time * 1000)
+                                # Capturar start_frame_idx do primeiro frame
+                                start_frame_idx = data.get("source_frame_idx", None)
+                                print(f"[Optimized] Primeiro frame: {latency_ms}ms, source_frame={start_frame_idx}")
+                                await self.send_json("first_frame", latency_ms=latency_ms, start_frame_idx=start_frame_idx)
                         elif msg_type == "full_audio":
                             print(f"[Optimized] Ignorando audio Wav2Lip (usando Orpheus streaming)")
                             break
                         elif msg_type == "done":
+                            # Capturar índices de frame para sincronização
                             self.end_video_time_ms = data.get("end_video_time_ms")
+                            if start_frame_idx is None:
+                                start_frame_idx = data.get("start_frame_idx")
                             end_frame_idx = data.get("end_frame_idx")
+                            self.start_frame_idx = start_frame_idx
+                            self.end_frame_idx = end_frame_idx
+                            print(f"[Optimized] Wav2Lip done: {self.total_frames} frames, start_frame={start_frame_idx}, end_frame={end_frame_idx}, end_time_ms={self.end_video_time_ms}")
                             break
                     elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.ERROR):
             total_frames=self.total_frames,
             elapsed_ms=int(elapsed * 1000),
             bytes_sent=self.total_bytes,
+            start_frame_idx=self.start_frame_idx,
+            end_frame_idx=self.end_frame_idx,
             end_video_time_ms=self.end_video_time_ms
         )
                     if action == "generate":
                         text = data.get("text", "").strip()
                         voice = data.get("voice", "tara")
+                        jpeg_quality = data.get("jpeg_quality", 95)  # Qualidade JPEG (50-100)
+                        idle_video_time_ms = data.get("idle_video_time_ms", 0)  # Tempo do vídeo idle
                         if not text:
                             await ws.send_json({"type": "error", "message": "Text required"})
                             continue
+                        print(f"Gerando: '{text[:50]}...' (quality={jpeg_quality}, idle_time={idle_video_time_ms}ms)")
                         if session:
                             session.stop()
                         session = OptimizedSession(ws)
+                        await session.run(text, voice, jpeg_quality, idle_video_time_ms)
                     elif action == "stop":
                         if session:

interface/server_streaming.py CHANGED Viewed

@@ -24,11 +24,22 @@ routes = web.RouteTableDef()
 # Cache de frames idle
 idle_frames = []
 idle_frame_count = 0
 def load_idle_frames():
-    """Carrega frames do idle.mp4"""
-    global idle_frames, idle_frame_count
     if idle_frames:
         return
@@ -40,16 +51,21 @@ def load_idle_frames():
     print(f"Carregando idle frames de {IDLE_VIDEO}...")
     cap = cv2.VideoCapture(IDLE_VIDEO)
     while True:
         ret, frame = cap.read()
         if not ret:
             break
-        # Manter em BGR para processamento, converter para JPEG depois
         idle_frames.append(frame)
     cap.release()
     idle_frame_count = len(idle_frames)
-    print(f"Carregados {idle_frame_count} frames idle")
 def frame_to_jpeg_base64(frame, quality=85):
@@ -66,6 +82,403 @@ def jpeg_base64_to_frame(b64_data):
     return cv2.imdecode(nparr, cv2.IMREAD_COLOR)
 def blend_frames(frame1, frame2, alpha):
     """Blend entre dois frames. alpha=0 -> frame1, alpha=1 -> frame2"""
     # Garantir que ambos frames tem o mesmo tamanho
@@ -132,6 +545,13 @@ async def websocket_handler(request):
                             audio_duration = 0
                             end_video_time_ms = 0
                             async for w2l_msg in wav2lip_ws:
                                 if w2l_msg.type == aiohttp.WSMsgType.TEXT:
                                     w2l_data = json.loads(w2l_msg.data)
@@ -144,6 +564,20 @@ async def websocket_handler(request):
                                         frame_b64 = w2l_data.get("frame", "")
                                         if frame_b64:
                                             frame = jpeg_base64_to_frame(frame_b64)
                                             speaking_frames.append(frame)
                                     elif msg_type == "full_audio":
@@ -166,17 +600,45 @@ async def websocket_handler(request):
                             # Enviar frames SEM crossfade - transicao e feita no cliente
                             if speaking_frames:
                                 # Atualizar posicao do idle para continuidade apos fala
                                 if idle_frames:
-                                    idle_position = (idle_position + len(speaking_frames)) % idle_frame_count
                                 # Enviar stream_start
                                 ttfb = int((time.time() - start_time) * 1000)
                                 await ws.send_json({"type": "stream_start", "ttfb_ms": ttfb})
                                 # Enviar apenas os frames de fala (sem crossfade)
                                 for idx, frame in enumerate(speaking_frames):
-                                    frame_b64 = frame_to_jpeg_base64(frame)
                                     await ws.send_json({
                                         "type": "frame",
                                         "frame": frame_b64,
@@ -200,7 +662,7 @@ async def websocket_handler(request):
                                     "end_video_time_ms": end_video_time_ms
                                 })
-                                print(f"Enviados {len(speaking_frames)} frames de fala (sem crossfade)")
                     except Exception as e:
                         print(f"Erro: {e}")
@@ -286,14 +748,16 @@ app.add_routes(routes)
 if __name__ == "__main__":
     print("=" * 50)
-    print("Streaming Server com Crossfade - Porta", PORT)
     print("Wav2Lip:", WAV2LIP_WS)
     print("Idle Video:", IDLE_VIDEO)
-    print("Crossfade: DESABILITADO (transicao no cliente)")
     print("=" * 50)
     # Carregar idle frames
     load_idle_frames()
     print("=" * 50)
     web.run_app(app, host="0.0.0.0", port=PORT)

 # Cache de frames idle
 idle_frames = []
 idle_frame_count = 0
+idle_resolution = (1920, 1080)  # Resolucao do idle video (width, height)
+# Regiao da boca/queixo (em ratio do frame)
+# Regiao mais focada para evitar "pulos" na transicao
+# Apenas boca e queixo, sem incluir muito do rosto
+MOUTH_REGION = {
+    'top': 0.50,      # 50% do topo (comeca abaixo do nariz)
+    'bottom': 0.80,   # ate 80% (apenas queixo)
+    'left': 0.32,     # 32% da esquerda
+    'right': 0.68     # ate 68% (mais estreito)
+}
 def load_idle_frames():
+    """Carrega frames do idle.mp4 e obtem resolucao"""
+    global idle_frames, idle_frame_count, idle_resolution
     if idle_frames:
         return
     print(f"Carregando idle frames de {IDLE_VIDEO}...")
     cap = cv2.VideoCapture(IDLE_VIDEO)
+    # Obter resolucao do video
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    idle_resolution = (width, height)
+    print(f"Resolucao idle: {width}x{height}")
     while True:
         ret, frame = cap.read()
         if not ret:
             break
         idle_frames.append(frame)
     cap.release()
     idle_frame_count = len(idle_frames)
+    print(f"Carregados {idle_frame_count} frames idle em full resolution")
 def frame_to_jpeg_base64(frame, quality=85):
     return cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+def upscale_frame(frame, target_size):
+    """
+    Upscale frame para a resolucao alvo usando LANCZOS4 (alta qualidade).
+    target_size: (width, height)
+    """
+    if frame is None:
+        return frame
+    current_h, current_w = frame.shape[:2]
+    target_w, target_h = target_size
+    # Se ja esta na resolucao correta, retornar
+    if current_w == target_w and current_h == target_h:
+        return frame
+    # Upscale usando LANCZOS4 (melhor qualidade para upscaling)
+    upscaled = cv2.resize(frame, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
+    return upscaled
+def match_histogram(source, reference):
+    """
+    Ajusta o histograma da source para corresponder ao da reference.
+    Isso corrige diferencas de brilho/cor entre Wav2Lip e idle frames.
+    Usa o espaco de cor LAB para melhor correspondencia perceptual.
+    """
+    # Converter para LAB (melhor para correspondencia de cor)
+    source_lab = cv2.cvtColor(source, cv2.COLOR_BGR2LAB).astype(np.float32)
+    reference_lab = cv2.cvtColor(reference, cv2.COLOR_BGR2LAB).astype(np.float32)
+    # Para cada canal, ajustar media e desvio padrao
+    for i in range(3):
+        src_mean, src_std = source_lab[:, :, i].mean(), source_lab[:, :, i].std()
+        ref_mean, ref_std = reference_lab[:, :, i].mean(), reference_lab[:, :, i].std()
+        # Evitar divisao por zero
+        if src_std < 1e-6:
+            src_std = 1e-6
+        # Normalizar e reescalar
+        source_lab[:, :, i] = (source_lab[:, :, i] - src_mean) * (ref_std / src_std) + ref_mean
+    # Clipar valores validos e converter de volta
+    source_lab = np.clip(source_lab, 0, 255).astype(np.uint8)
+    result = cv2.cvtColor(source_lab, cv2.COLOR_LAB2BGR)
+    return result
+def extract_mouth_region(frame, region=MOUTH_REGION):
+    """
+    Extrai apenas a regiao da boca/queixo do frame.
+    Retorna (regiao_cortada, coordenadas) para posterior blending.
+    """
+    h, w = frame.shape[:2]
+    y1 = int(h * region['top'])
+    y2 = int(h * region['bottom'])
+    x1 = int(w * region['left'])
+    x2 = int(w * region['right'])
+    mouth_crop = frame[y1:y2, x1:x2].copy()
+    return mouth_crop, (x1, y1, x2, y2)
+def create_feathered_mask(shape, feather_pixels=15):
+    """
+    Cria mascara com bordas suavizadas (feathered) para blending seamless.
+    Usa gradiente suave (ease-in-out) para transicao mais natural.
+    """
+    h, w = shape[:2]
+    mask = np.ones((h, w), dtype=np.float32)
+    # Criar gradiente nas bordas usando curva suave (ease-in-out)
+    for i in range(feather_pixels):
+        # Curva suave: smoothstep para transicao mais natural
+        t = i / feather_pixels
+        alpha = t * t * (3 - 2 * t)  # smoothstep
+        # Top
+        mask[i, :] = np.minimum(mask[i, :], alpha)
+        # Bottom
+        mask[h - 1 - i, :] = np.minimum(mask[h - 1 - i, :], alpha)
+        # Left
+        mask[:, i] = np.minimum(mask[:, i], alpha)
+        # Right
+        mask[:, w - 1 - i] = np.minimum(mask[:, w - 1 - i], alpha)
+    return mask
+def blend_mouth_region_only(wav2lip_frame, idle_frame):
+    """
+    Nova estrategia: Manter idle em full resolution, substituir APENAS a boca.
+    1. Extrai regiao da boca do frame Wav2Lip (853x480)
+    2. Upscala APENAS essa regiao para a escala do idle (1920x1080)
+    3. Aplica Poisson Blending apenas na regiao da boca
+    4. Retorna o frame idle com apenas a boca substituida
+    Isso preserva toda a qualidade do idle (cabelo, fundo, roupa) e
+    so substitui a pequena regiao da boca.
+    """
+    if wav2lip_frame is None or idle_frame is None:
+        return wav2lip_frame if wav2lip_frame is not None else idle_frame
+    # Dimensoes
+    idle_h, idle_w = idle_frame.shape[:2]
+    w2l_h, w2l_w = wav2lip_frame.shape[:2]
+    # Calcular escala entre frames
+    scale_x = idle_w / w2l_w
+    scale_y = idle_h / w2l_h
+    # 1. Extrair regiao da boca do Wav2Lip
+    mouth_crop, (x1_w2l, y1_w2l, x2_w2l, y2_w2l) = extract_mouth_region(wav2lip_frame)
+    # 2. Calcular coordenadas equivalentes no idle (full res)
+    x1_idle = int(x1_w2l * scale_x)
+    y1_idle = int(y1_w2l * scale_y)
+    x2_idle = int(x2_w2l * scale_x)
+    y2_idle = int(y2_w2l * scale_y)
+    # Dimensao da regiao no idle
+    region_w = x2_idle - x1_idle
+    region_h = y2_idle - y1_idle
+    # 3. Upscale apenas a regiao da boca para a resolucao do idle
+    mouth_upscaled = cv2.resize(mouth_crop, (region_w, region_h), interpolation=cv2.INTER_LANCZOS4)
+    # 3.5 Histogram matching: ajustar cor/brilho do mouth para corresponder ao idle
+    idle_region = idle_frame[y1_idle:y2_idle, x1_idle:x2_idle]
+    mouth_upscaled = match_histogram(mouth_upscaled, idle_region)
+    # 4. Criar mascara com bordas suavizadas
+    # Usar 25% da menor dimensao para feathering bem suave
+    feather = max(30, min(region_w, region_h) // 4)  # ~25% da menor dimensao
+    mask = create_feathered_mask((region_h, region_w), feather_pixels=feather)
+    mask_3ch = np.dstack([mask, mask, mask])
+    # 5. Fazer copia do idle e aplicar blending na regiao
+    result = idle_frame.copy()
+    # Regiao do idle onde vai o mouth
+    idle_region = result[y1_idle:y2_idle, x1_idle:x2_idle]
+    # Blending com mascara feathered
+    blended_region = (mouth_upscaled * mask_3ch + idle_region * (1 - mask_3ch)).astype(np.uint8)
+    # Substituir regiao
+    result[y1_idle:y2_idle, x1_idle:x2_idle] = blended_region
+    return result
+def blend_with_poisson(wav2lip_frame, idle_frame):
+    """
+    Estrategia alternativa: Poisson Blending apenas na regiao da boca.
+    Mais lento mas com transicao mais suave nos bordos.
+    """
+    if wav2lip_frame is None or idle_frame is None:
+        return wav2lip_frame if wav2lip_frame is not None else idle_frame
+    idle_h, idle_w = idle_frame.shape[:2]
+    w2l_h, w2l_w = wav2lip_frame.shape[:2]
+    scale_x = idle_w / w2l_w
+    scale_y = idle_h / w2l_h
+    # Extrair e upscalar boca
+    mouth_crop, (x1_w2l, y1_w2l, x2_w2l, y2_w2l) = extract_mouth_region(wav2lip_frame)
+    x1_idle = int(x1_w2l * scale_x)
+    y1_idle = int(y1_w2l * scale_y)
+    x2_idle = int(x2_w2l * scale_x)
+    y2_idle = int(y2_w2l * scale_y)
+    region_w = x2_idle - x1_idle
+    region_h = y2_idle - y1_idle
+    mouth_upscaled = cv2.resize(mouth_crop, (region_w, region_h), interpolation=cv2.INTER_LANCZOS4)
+    # Criar imagem source do tamanho do idle (preta com boca no lugar certo)
+    source = np.zeros_like(idle_frame)
+    source[y1_idle:y2_idle, x1_idle:x2_idle] = mouth_upscaled
+    # Criar mascara eliptica para a regiao
+    mask = np.zeros((idle_h, idle_w), dtype=np.uint8)
+    center_x = (x1_idle + x2_idle) // 2
+    center_y = (y1_idle + y2_idle) // 2
+    axes_x = region_w // 2 - 10  # Um pouco menor para evitar bordas
+    axes_y = region_h // 2 - 10
+    cv2.ellipse(mask, (center_x, center_y), (axes_x, axes_y), 0, 0, 360, 255, -1)
+    try:
+        result = cv2.seamlessClone(
+            source,
+            idle_frame,
+            mask,
+            (center_x, center_y),
+            cv2.NORMAL_CLONE
+        )
+        return result
+    except Exception as e:
+        print(f"[Poisson] Erro: {e}, usando feathered blend")
+        return blend_mouth_region_only(wav2lip_frame, idle_frame)
+def calculate_frame_difference(frame1, frame2):
+    """
+    Calcula a diferenca entre dois frames.
+    Retorna um valor de 0-100 indicando quanta diferenca ha.
+    """
+    if frame1 is None or frame2 is None:
+        return 0
+    # Converter para grayscale
+    gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
+    gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
+    # Calcular diferenca absoluta
+    diff = cv2.absdiff(gray1, gray2)
+    # Valor medio da diferenca (0-255)
+    mean_diff = np.mean(diff)
+    # Normalizar para 0-100
+    return (mean_diff / 255.0) * 100
+def calculate_sharpness(frame):
+    """
+    Calcula a nitidez de um frame usando variância do Laplaciano.
+    Quanto maior o valor, mais nítido o frame.
+    """
+    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) if len(frame.shape) == 3 else frame
+    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
+    return laplacian.var()
+def find_best_matching_idle_frame(target_frame, idle_frames, sample_step=5, sharpness_weight=0.3):
+    """
+    Encontra o frame do idle mais similar ao target_frame.
+    Considera tanto similaridade quanto nitidez para evitar frames desfocados.
+    Args:
+        target_frame: Frame para comparar (último frame da fala)
+        idle_frames: Lista de frames idle
+        sample_step: Passo de amostragem (5 = compara 1 a cada 5 frames)
+        sharpness_weight: Peso da nitidez no score (0-1)
+    Returns:
+        Índice do frame idle mais similar e nítido
+    """
+    if not idle_frames or target_frame is None:
+        return 0, 0
+    # Converter target para grayscale uma vez
+    target_gray = cv2.cvtColor(target_frame, cv2.COLOR_BGR2GRAY)
+    # Primeira fase: encontrar os N melhores candidatos por similaridade
+    candidates = []
+    for i in range(0, len(idle_frames), sample_step):
+        idle_gray = cv2.cvtColor(idle_frames[i], cv2.COLOR_BGR2GRAY)
+        diff = np.mean(cv2.absdiff(target_gray, idle_gray))
+        candidates.append((i, diff))
+    # Ordenar por diferença (menor = mais similar)
+    candidates.sort(key=lambda x: x[1])
+    # Pegar os top 20 candidatos mais similares
+    top_candidates = candidates[:20]
+    # Segunda fase: refinar busca na vizinhança dos top candidatos
+    refined_candidates = []
+    for idx, _ in top_candidates:
+        start = max(0, idx - sample_step)
+        end = min(len(idle_frames), idx + sample_step + 1)
+        for i in range(start, end):
+            idle_frame = idle_frames[i]
+            idle_gray = cv2.cvtColor(idle_frame, cv2.COLOR_BGR2GRAY)
+            # Calcular diferença
+            diff = np.mean(cv2.absdiff(target_gray, idle_gray))
+            # Calcular nitidez
+            sharpness = calculate_sharpness(idle_frame)
+            refined_candidates.append((i, diff, sharpness))
+    if not refined_candidates:
+        return 0, 0
+    # Normalizar valores para scoring
+    diffs = [c[1] for c in refined_candidates]
+    sharpnesses = [c[2] for c in refined_candidates]
+    min_diff, max_diff = min(diffs), max(diffs)
+    min_sharp, max_sharp = min(sharpnesses), max(sharpnesses)
+    # Evitar divisão por zero
+    diff_range = max_diff - min_diff if max_diff > min_diff else 1
+    sharp_range = max_sharp - min_sharp if max_sharp > min_sharp else 1
+    # Calcular score combinado (menor = melhor)
+    # diff_score: 0 = mais similar, 1 = menos similar
+    # sharp_score: 0 = mais nítido, 1 = menos nítido (invertido)
+    best_idx = 0
+    best_score = float('inf')
+    best_diff = 0
+    for i, diff, sharpness in refined_candidates:
+        diff_score = (diff - min_diff) / diff_range
+        sharp_score = 1 - (sharpness - min_sharp) / sharp_range  # Invertido: maior nitidez = menor score
+        # Score combinado
+        combined_score = (1 - sharpness_weight) * diff_score + sharpness_weight * sharp_score
+        if combined_score < best_score:
+            best_score = combined_score
+            best_idx = i
+            best_diff = diff
+    return best_idx, best_diff
+def trim_high_motion_frames(frames, threshold_multiplier=1.0, max_trim=20):
+    """
+    Remove frames do final que tem movimento muito alto (saltos).
+    Isso elimina os frames problemáticos que causam "travamento".
+    Versão mais agressiva: usa threshold menor e remove mais frames.
+    Args:
+        frames: Lista de frames
+        threshold_multiplier: Multiplicador do threshold (media + multiplier * std)
+        max_trim: Maximo de frames a remover
+    Returns:
+        Lista de frames com os problematicos removidos
+    """
+    if len(frames) < 20:
+        return frames
+    # Calcular diferenças entre frames consecutivos (últimos 20)
+    last_n = min(20, len(frames) - 1)
+    differences = []
+    for i in range(len(frames) - last_n, len(frames)):
+        if i > 0:
+            diff = calculate_frame_difference(frames[i-1], frames[i])
+            differences.append((i, diff))
+    if not differences:
+        return frames
+    # Calcular média e desvio padrão
+    diffs = [d[1] for d in differences]
+    mean_diff = np.mean(diffs)
+    std_diff = np.std(diffs)
+    # Threshold mais agressivo: média + 1.0*std (antes era 1.5)
+    threshold = mean_diff + threshold_multiplier * std_diff
+    # Threshold mínimo absoluto para evitar frames com muito movimento
+    min_threshold = 0.7  # Frames com diff > 0.7 são sempre problemáticos
+    if threshold > min_threshold:
+        threshold = min_threshold
+    # Encontrar onde começam os frames problemáticos (do fim para o início)
+    trim_from = len(frames)
+    frames_removed = 0
+    # Abordagem mais agressiva: remove todos os frames problemáticos do final
+    for i in range(len(differences) - 1, -1, -1):
+        idx, diff = differences[i]
+        if diff > threshold:
+            trim_from = idx
+            frames_removed += 1
+            if frames_removed >= max_trim:
+                break
+        else:
+            # Para no primeiro frame bom encontrado
+            break
+    # Calcular quantos frames remover
+    frames_to_trim = len(frames) - trim_from
+    if frames_to_trim > 0 and frames_to_trim <= max_trim:
+        print(f"[Trim] Removendo {frames_to_trim} frames problemáticos (threshold: {threshold:.2f}, mean: {mean_diff:.2f})")
+        return frames[:trim_from]
+    return frames
 def blend_frames(frame1, frame2, alpha):
     """Blend entre dois frames. alpha=0 -> frame1, alpha=1 -> frame2"""
     # Garantir que ambos frames tem o mesmo tamanho
                             audio_duration = 0
                             end_video_time_ms = 0
+                            # Calcular posicao inicial no idle baseado no tempo
+                            # idle_video_time_ms em ms, video @ 25fps = 40ms/frame
+                            fps = 25
+                            frame_duration_ms = 1000 / fps
+                            start_idle_idx = int(idle_video_time_ms / frame_duration_ms) % idle_frame_count if idle_frame_count > 0 else 0
+                            current_idle_idx = start_idle_idx
                             async for w2l_msg in wav2lip_ws:
                                 if w2l_msg.type == aiohttp.WSMsgType.TEXT:
                                     w2l_data = json.loads(w2l_msg.data)
                                         frame_b64 = w2l_data.get("frame", "")
                                         if frame_b64:
                                             frame = jpeg_base64_to_frame(frame_b64)
+                                            # Pegar frame idle full-res correspondente para histogram matching
+                                            idle_ref = None
+                                            if idle_frames and idle_frame_count > 0:
+                                                idle_ref = idle_frames[current_idle_idx]
+                                                current_idle_idx = (current_idle_idx + 1) % idle_frame_count
+                                            # Upscale frame inteiro do Wav2Lip
+                                            frame = upscale_frame(frame, idle_resolution)
+                                            # Histogram matching para consistencia de cor
+                                            if idle_ref is not None:
+                                                frame = match_histogram(frame, idle_ref)
                                             speaking_frames.append(frame)
                                     elif msg_type == "full_audio":
                             # Enviar frames SEM crossfade - transicao e feita no cliente
                             if speaking_frames:
+                                # 1. Primeiro, remover frames problemáticos do final (alto movimento)
+                                original_count = len(speaking_frames)
+                                speaking_frames = trim_high_motion_frames(speaking_frames)
+                                if len(speaking_frames) < original_count:
+                                    print(f"[Motion Trim] {original_count} -> {len(speaking_frames)} frames")
+                                # 2. Depois, trim para match audio duration (se ainda houver excesso)
+                                fps = 25
+                                if audio_duration > 0:
+                                    expected_frames = int(audio_duration / 1000 * fps)
+                                    if len(speaking_frames) > expected_frames:
+                                        trimmed = len(speaking_frames) - expected_frames
+                                        print(f"[Duration Trim] {trimmed} extra frames ({len(speaking_frames)} -> {expected_frames})")
+                                        speaking_frames = speaking_frames[:expected_frames]
+                                # 3. Encontrar o frame idle mais similar ao último frame de fala
+                                # Isso minimiza o "salto" visual na transição speak->idle
+                                best_idle_idx = 0
+                                if idle_frames and speaking_frames:
+                                    last_speak_frame = speaking_frames[-1]
+                                    best_idle_idx, best_diff = find_best_matching_idle_frame(
+                                        last_speak_frame, idle_frames, sample_step=10
+                                    )
+                                    # Converter índice para tempo em ms (25fps = 40ms/frame)
+                                    end_video_time_ms = int(best_idle_idx * 40)
+                                    print(f"[Best Match] Idle frame {best_idle_idx} (diff: {best_diff:.2f}) -> {end_video_time_ms}ms")
                                 # Atualizar posicao do idle para continuidade apos fala
                                 if idle_frames:
+                                    idle_position = best_idle_idx
                                 # Enviar stream_start
                                 ttfb = int((time.time() - start_time) * 1000)
                                 await ws.send_json({"type": "stream_start", "ttfb_ms": ttfb})
                                 # Enviar apenas os frames de fala (sem crossfade)
+                                # Usar qualidade JPEG alta (95) para minimizar artefatos
                                 for idx, frame in enumerate(speaking_frames):
+                                    frame_b64 = frame_to_jpeg_base64(frame, quality=95)
                                     await ws.send_json({
                                         "type": "frame",
                                         "frame": frame_b64,
                                     "end_video_time_ms": end_video_time_ms
                                 })
+                                print(f"Enviados {len(speaking_frames)} frames (Poisson Blending)")
                     except Exception as e:
                         print(f"Erro: {e}")
 if __name__ == "__main__":
     print("=" * 50)
+    print("Streaming Server - Porta", PORT)
     print("Wav2Lip:", WAV2LIP_WS)
     print("Idle Video:", IDLE_VIDEO)
     print("=" * 50)
     # Load the idle frames before the lines below, which print idle_resolution
     load_idle_frames()
+    print(f"Upscaling: ENABLED (target {idle_resolution[0]}x{idle_resolution[1]})")
+    print("Interpolacao: LANCZOS4 (alta qualidade)")
+    print("Color: HISTOGRAM MATCHING (LAB color space)")
     print("=" * 50)
     web.run_app(app, host="0.0.0.0", port=PORT)  # bind on all interfaces

interface/test_webrtc_client.py ADDED Viewed

	@@ -0,0 +1,281 @@

+#!/usr/bin/env python3
+"""
+Cliente WebRTC automatizado para testar conexão com servidor
+"""
+import asyncio
+import aiohttp
+import sys
+from aiortc import RTCPeerConnection, RTCConfiguration, RTCIceServer, RTCSessionDescription
+from aiortc.contrib.media import MediaRecorder
+import time
+# Configuration: base URL of the server under test
+SERVER_URL = "http://62.107.25.198:47898"
+# ANSI escape sequences used to colour terminal output
+GREEN = '\033[92m'
+RED = '\033[91m'
+YELLOW = '\033[93m'
+BLUE = '\033[94m'
+RESET = '\033[0m'
+class WebRTCTester:
+    def __init__(self):
+        self.pc = None
+        self.session_id = None
+        self.video_frames = 0
+        self.audio_frames = 0
+        self.ice_candidates = []
+        self.connected = False
+    async def test_connection(self):
+        """Testa conexão WebRTC completa"""
+        print(f"\n{'='*60}")
+        print(f"{BLUE}WebRTC Connection Test{RESET}")
+        print(f"{'='*60}\n")
+        try:
+            # 1. Configurar ICE servers (STUN + TURN)
+            print(f"{YELLOW}[1/5]{RESET} Configurando ICE servers...")
+            ice_servers = [
+                RTCIceServer(urls=["stun:stun.l.google.com:19302"]),
+                RTCIceServer(urls=["stun:stun1.l.google.com:19302"]),
+                RTCIceServer(
+                    urls=["turn:openrelay.metered.ca:80"],
+                    username="openrelayproject",
+                    credential="openrelayproject"
+                ),
+                RTCIceServer(
+                    urls=["turn:openrelay.metered.ca:443"],
+                    username="openrelayproject",
+                    credential="openrelayproject"
+                ),
+            ]
+            config = RTCConfiguration(iceServers=ice_servers)
+            self.pc = RTCPeerConnection(configuration=config)
+            print(f"{GREEN}✓{RESET} ICE servers configurados\n")
+            # 2. Configurar handlers
+            print(f"{YELLOW}[2/5]{RESET} Configurando event handlers...")
+            @self.pc.on("icecandidate")
+            async def on_ice_candidate(candidate):
+                if candidate:
+                    self.ice_candidates.append({
+                        'type': candidate.candidate.type if hasattr(candidate.candidate, 'type') else 'unknown',
+                        'protocol': candidate.candidate.protocol if hasattr(candidate.candidate, 'protocol') else 'unknown'
+                    })
+            @self.pc.on("connectionstatechange")
+            async def on_connection_state():
+                state = self.pc.connectionState
+                if state == "connected":
+                    print(f"{GREEN}✓{RESET} WebRTC Estado: {GREEN}CONNECTED{RESET}")
+                    self.connected = True
+                elif state == "failed":
+                    print(f"{RED}✗{RESET} WebRTC Estado: {RED}FAILED{RESET}")
+                elif state == "connecting":
+                    print(f"{YELLOW}⟳{RESET} WebRTC Estado: CONNECTING...")
+                else:
+                    print(f"  WebRTC Estado: {state}")
+            @self.pc.on("iceconnectionstatechange")
+            async def on_ice_state():
+                state = self.pc.iceConnectionState
+                if state == "connected":
+                    print(f"{GREEN}✓{RESET} ICE Estado: {GREEN}CONNECTED{RESET}")
+                elif state == "failed":
+                    print(f"{RED}✗{RESET} ICE Estado: {RED}FAILED{RESET}")
+                elif state == "checking":
+                    print(f"{YELLOW}⟳{RESET} ICE Estado: CHECKING...")
+                else:
+                    print(f"  ICE Estado: {state}")
+            @self.pc.on("track")
+            async def on_track(track):
+                print(f"{GREEN}✓{RESET} Track recebido: {BLUE}{track.kind}{RESET}")
+                if track.kind == "video":
+                    while True:
+                        try:
+                            frame = await track.recv()
+                            self.video_frames += 1
+                            if self.video_frames % 25 == 0:  # A cada segundo (25fps)
+                                print(f"  {BLUE}Video:{RESET} {self.video_frames} frames recebidos")
+                        except Exception as e:
+                            break
+                elif track.kind == "audio":
+                    while True:
+                        try:
+                            frame = await track.recv()
+                            self.audio_frames += 1
+                            if self.audio_frames % 50 == 0:  # A cada ~1 segundo
+                                print(f"  {BLUE}Audio:{RESET} {self.audio_frames} frames recebidos")
+                        except Exception as e:
+                            break
+            print(f"{GREEN}✓{RESET} Handlers configurados\n")
+            # 3. Criar transceivers
+            print(f"{YELLOW}[3/5]{RESET} Criando transceivers...")
+            self.pc.addTransceiver("video", direction="recvonly")
+            self.pc.addTransceiver("audio", direction="recvonly")
+            print(f"{GREEN}✓{RESET} Transceivers criados\n")
+            # 4. Criar offer e enviar para servidor
+            print(f"{YELLOW}[4/5]{RESET} Criando offer SDP...")
+            offer = await self.pc.createOffer()
+            await self.pc.setLocalDescription(offer)
+            # Aguardar ICE gathering
+            print(f"{YELLOW}⟳{RESET} Aguardando ICE gathering...")
+            await self.wait_for_ice_gathering()
+            print(f"{GREEN}✓{RESET} Offer criado\n")
+            # 5. Enviar offer para servidor
+            print(f"{YELLOW}[5/5]{RESET} Enviando offer para servidor...")
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{SERVER_URL}/offer",
+                    json={
+                        "sdp": self.pc.localDescription.sdp,
+                        "type": self.pc.localDescription.type
+                    }
+                ) as resp:
+                    if resp.status != 200:
+                        raise Exception(f"Erro HTTP {resp.status}")
+                    answer = await resp.json()
+                    self.session_id = answer.get('session_id')
+                    # Aplicar answer
+                    await self.pc.setRemoteDescription(
+                        RTCSessionDescription(
+                            sdp=answer["sdp"],
+                            type=answer["type"]
+                        )
+                    )
+            print(f"{GREEN}✓{RESET} Answer recebido")
+            print(f"{GREEN}✓{RESET} Session ID: {BLUE}{self.session_id}{RESET}\n")
+            # 6. Aguardar conexão
+            print(f"{YELLOW}⟳{RESET} Aguardando conexão WebRTC...\n")
+            # Aguardar até 15 segundos
+            for i in range(15):
+                await asyncio.sleep(1)
+                if self.connected:
+                    break
+            # 7. Mostrar resultados
+            await self.show_results()
+            # 8. Manter vivo por 10 segundos para receber frames
+            if self.connected:
+                print(f"\n{YELLOW}⟳{RESET} Monitorando recepção de frames por 10 segundos...\n")
+                await asyncio.sleep(10)
+            # 9. Resultados finais
+            await self.show_final_results()
+        except Exception as e:
+            print(f"\n{RED}✗ ERRO:{RESET} {e}\n")
+            import traceback
+            traceback.print_exc()
+            return False
+        finally:
+            if self.pc:
+                await self.pc.close()
+        return self.connected
+    async def wait_for_ice_gathering(self):
+        """Aguarda ICE gathering completar"""
+        max_wait = 5
+        for _ in range(max_wait * 10):
+            if self.pc.iceGatheringState == "complete":
+                return
+            await asyncio.sleep(0.1)
+    async def show_results(self):
+        """Mostra resultados da conexão"""
+        print(f"\n{'='*60}")
+        print(f"{BLUE}Resultados da Conexão{RESET}")
+        print(f"{'='*60}\n")
+        # Estado da conexão
+        conn_state = self.pc.connectionState
+        if conn_state == "connected":
+            print(f"WebRTC State:     {GREEN}✓ CONNECTED{RESET}")
+        elif conn_state == "failed":
+            print(f"WebRTC State:     {RED}✗ FAILED{RESET}")
+        else:
+            print(f"WebRTC State:     {YELLOW}{conn_state.upper()}{RESET}")
+        # Estado ICE
+        ice_state = self.pc.iceConnectionState
+        if ice_state == "connected":
+            print(f"ICE State:        {GREEN}✓ CONNECTED{RESET}")
+        elif ice_state == "failed":
+            print(f"ICE State:        {RED}✗ FAILED{RESET}")
+        else:
+            print(f"ICE State:        {YELLOW}{ice_state.upper()}{RESET}")
+        # Candidatos ICE
+        print(f"\nICE Candidates:   {len(self.ice_candidates)} gerados")
+        candidate_types = {}
+        for c in self.ice_candidates:
+            ctype = c['type']
+            candidate_types[ctype] = candidate_types.get(ctype, 0) + 1
+        for ctype, count in candidate_types.items():
+            icon = "✓" if ctype == "relay" else "•"
+            color = GREEN if ctype == "relay" else BLUE
+            print(f"  {color}{icon}{RESET} {ctype}: {count}")
+        # Session ID
+        if self.session_id:
+            print(f"\nSession ID:       {BLUE}{self.session_id}{RESET}")
+        print()
+    async def show_final_results(self):
+        """Mostra resultados finais"""
+        print(f"\n{'='*60}")
+        print(f"{BLUE}Resultados Finais{RESET}")
+        print(f"{'='*60}\n")
+        if self.connected:
+            print(f"Status:           {GREEN}✓ SUCESSO{RESET}")
+        else:
+            print(f"Status:           {RED}✗ FALHA{RESET}")
+        print(f"Video Frames:     {BLUE}{self.video_frames}{RESET}")
+        print(f"Audio Frames:     {BLUE}{self.audio_frames}{RESET}")
+        if self.video_frames > 0 and self.audio_frames > 0:
+            print(f"\n{GREEN}✓ WebRTC funcionando perfeitamente!{RESET}")
+        elif self.connected:
+            print(f"\n{YELLOW}⚠ Conectado mas sem receber frames{RESET}")
+        else:
+            print(f"\n{RED}✗ Falha na conexão WebRTC{RESET}")
+        print()
+async def main():
+    tester = WebRTCTester()
+    success = await tester.test_connection()
+    sys.exit(0 if success else 1)
+if __name__ == "__main__":  # run the test only when executed as a script
+    asyncio.run(main())

interface/test_webrtc_playwright.py ADDED Viewed

	@@ -0,0 +1,218 @@

+#!/usr/bin/env python3
+"""
+Teste WebRTC automatizado com Playwright headless
+"""
+import asyncio
+from playwright.async_api import async_playwright
+import sys
+SERVER_URL = "http://62.107.25.198:47898"  # hard-coded address of the server under test; the page is loaded from here
+async def test_webrtc():
+    print("="*60)
+    print("Teste WebRTC com Playwright (Headless)")
+    print("="*60)
+    async with async_playwright() as p:
+        # Iniciar browser headless
+        print("\n[1/6] Iniciando browser headless...")
+        browser = await p.chromium.launch(
+            headless=True,
+            args=[
+                '--use-fake-ui-for-media-stream',
+                '--use-fake-device-for-media-stream',
+                '--no-sandbox'
+            ]
+        )
+        context = await browser.new_context(
+            permissions=['camera', 'microphone']
+        )
+        page = await context.new_page()
+        # Coletar logs do console e erros
+        console_logs = []
+        page_errors = []
+        page.on("console", lambda msg: console_logs.append(f"[{msg.type}] {msg.text}"))
+        page.on("pageerror", lambda exc: page_errors.append(f"ERROR: {exc}"))
+        # Capturar requests/responses
+        network_logs = []
+        def handle_response(response):
+            if '/offer' in response.url:
+                network_logs.append(f"POST /offer → Status: {response.status}")
+                if response.status != 200:
+                    network_logs.append(f"  Error: {response.status_text}")
+        page.on("response", handle_response)
+        # Navegar para a página
+        print(f"[2/6] Navegando para {SERVER_URL}...")
+        try:
+            await page.goto(SERVER_URL, timeout=10000)
+            print("✓ Página carregada")
+        except Exception as e:
+            print(f"✗ Erro ao carregar página: {e}")
+            await browser.close()
+            return False
+        # Verificar se a página carregou
+        print("[3/6] Verificando elementos...")
+        try:
+            await page.wait_for_selector('#btnConnect', timeout=5000)
+            print("✓ Botão 'Conectar' encontrado")
+        except Exception as e:
+            print(f"✗ Erro: {e}")
+            await browser.close()
+            return False
+        # Injetar código para capturar eventos ICE
+        print("[4/6] Configurando captura de eventos ICE...")
+        await page.evaluate("""
+            window.iceInfo = {
+                candidates: [],
+                states: [],
+                connectionStates: []
+            };
+            // Interceptar console.log de ICE candidates
+            const originalLog = console.log;
+            console.log = function(...args) {
+                if (args[0] === 'ICE Candidate:') {
+                    window.iceInfo.candidates.push(args[1]);
+                } else if (args[0] === 'ICE State:') {
+                    window.iceInfo.states.push(args[1]);
+                } else if (args[0] === 'Estado WebRTC:') {
+                    window.iceInfo.connectionStates.push(args[1]);
+                }
+                originalLog.apply(console, args);
+            };
+        """)
+        # Clicar em conectar
+        print("[5/6] Clicando em 'Conectar'...")
+        await page.click('#btnConnect')
+        # Aguardar tentativa de conexão
+        print("[6/6] Aguardando conexão (15s)...")
+        await asyncio.sleep(15)
+        # Capturar informações ICE
+        ice_info = await page.evaluate("window.iceInfo")
+        # Capturar estado da conexão
+        webrtc_state = await page.evaluate("""
+            (() => {
+                const pc = window.pc;
+                if (!pc) return null;
+                return {
+                    connectionState: pc.connectionState,
+                    iceConnectionState: pc.iceConnectionState,
+                    iceGatheringState: pc.iceGatheringState,
+                    signalingState: pc.signalingState
+                };
+            })()
+        """)
+        # Resultados
+        print("\n" + "="*60)
+        print("RESULTADOS")
+        print("="*60)
+        if webrtc_state:
+            print(f"\nEstados WebRTC:")
+            print(f"  Connection:     {webrtc_state['connectionState']}")
+            print(f"  ICE Connection: {webrtc_state['iceConnectionState']}")
+            print(f"  ICE Gathering:  {webrtc_state['iceGatheringState']}")
+            print(f"  Signaling:      {webrtc_state['signalingState']}")
+        else:
+            print("\n✗ Peer connection não foi criada!")
+        # Análise de candidatos ICE
+        candidates = ice_info.get('candidates', [])
+        print(f"\nCandidatos ICE: {len(candidates)}")
+        candidate_types = {}
+        for c in candidates:
+            ctype = c.get('type', 'unknown')
+            candidate_types[ctype] = candidate_types.get(ctype, 0) + 1
+        for ctype, count in candidate_types.items():
+            icon = "✓" if ctype == "relay" else "•"
+            print(f"  {icon} {ctype}: {count}")
+        # Diagnóstico
+        print("\n" + "="*60)
+        print("DIAGNÓSTICO")
+        print("="*60)
+        success = False
+        if not webrtc_state:
+            print("\n✗ FALHA: Peer connection não foi criada")
+            print("  → Verificar JavaScript do frontend")
+        elif webrtc_state['connectionState'] == 'connected':
+            print("\n✓ SUCESSO: WebRTC conectado!")
+            success = True
+        elif webrtc_state['iceConnectionState'] == 'failed':
+            print("\n✗ FALHA: ICE connection failed")
+            if len(candidates) == 0:
+                print("  → Problema: Nenhum candidato ICE gerado")
+                print("  → Solução: Verificar configuração STUN/TURN")
+            elif 'relay' not in candidate_types:
+                print("  → Problema: Nenhum candidato TURN (relay)")
+                print("  → Solução: Servidores TURN não estão funcionando")
+                print("  → Tentar outros servidores TURN")
+            else:
+                print("  → Problema: Candidatos gerados mas não conectam")
+                print("  → Solução: Problema de firewall/NAT no servidor")
+                print("  → Verificar port mapping UDP no vast.ai")
+        elif webrtc_state['connectionState'] == 'connecting':
+            print("\n⚠ TIMEOUT: Conexão travada em 'connecting'")
+            print("  → Problema: ICE negotiation timeout")
+            print("  → Causa provável: Port mapping UDP não configurado")
+            print("  → Solução: Expor portas UDP no vast.ai ou usar ngrok")
+        # Erros de página
+        if page_errors:
+            print("\n" + "="*60)
+            print("ERROS JAVASCRIPT")
+            print("="*60)
+            for err in page_errors:
+                print(err)
+        # Logs de rede
+        if network_logs:
+            print("\n" + "="*60)
+            print("LOGS DE REDE")
+            print("="*60)
+            for log in network_logs:
+                print(log)
+        # Logs do console
+        if console_logs:
+            print("\n" + "="*60)
+            print("LOGS DO CONSOLE (últimos 20)")
+            print("="*60)
+            for log in console_logs[-20:]:
+                print(log)
+        await browser.close()
+        return success
+async def main():
+    success = await test_webrtc()
+    sys.exit(0 if success else 1)
+if __name__ == "__main__":  # run the check only when executed as a script
+    asyncio.run(main())

interface/webrtc_skypilot.yaml ADDED Viewed

	@@ -0,0 +1,19 @@

+name: webrtc-avatar
+resources:
+  cloud: vast
+  accelerators: RTX_5090
+  disk_size: 200  # presumably in GB — confirm against the SkyPilot task spec
+  ports:
+    - 8080  # HTTP + WebRTC; NOTE(review): confirm server.py listens on 8080 and whether media needs extra UDP ports
+workdir: .
+setup: |  # setup commands; `set -e` stops at the first failing command
+  set -e
+  sudo apt-get update -qq
+  sudo apt-get install -y ffmpeg libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev
+  pip install aiohttp aiortc opencv-python numpy av websockets -q
+run: |  # command that starts the server
+  python3 server.py