speech2speech-interface / interface /index_streaming.html
marcosremar2's picture
Add WebRTC streaming interface with vast.ai deployment
e62aafd
Raw
History Blame Contribute Delete
16.7 kB
<!DOCTYPE html>
<html lang="pt-BR">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Avatar - Streaming Progressivo</title>
<style>
/* Global reset and dark page theme. */
* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: system-ui, sans-serif; background: #0a0a1a; color: #fff; min-height: 100vh; padding: 20px; }
.container { max-width: 900px; margin: 0 auto; }
/* Status banner; the script appends "ok" or "busy" as a modifier class. */
.status { text-align: center; padding: 8px; margin-bottom: 15px; border-radius: 8px; font-size: 13px; background: rgba(255,255,255,0.1); }
.status.ok { background: rgba(0,255,100,0.2); color: #0f0; }
.status.busy { background: rgba(255,200,0,0.2); color: #fc0; }
/* 16:9 stage: the idle video and the talking canvas are stacked in the same box. */
.video-box { background: #000; border-radius: 10px; overflow: hidden; margin-bottom: 20px; aspect-ratio: 16/9; position: relative; }
video, canvas { width: 100%; height: 100%; object-fit: contain; position: absolute; top: 0; left: 0; }
/* The canvas sits above the idle video and starts hidden; the script toggles its display. */
#idleVideo { z-index: 1; }
#talkCanvas { z-index: 2; display: none; }
/* Input row: text area, voice picker and the two action buttons. */
.controls { display: flex; gap: 10px; margin-bottom: 20px; flex-wrap: wrap; }
textarea { flex: 1; min-width: 200px; padding: 12px; border: 1px solid #333; border-radius: 8px; background: #1a1a2e; color: #fff; font-size: 14px; resize: none; height: 50px; }
select { padding: 12px; border: 1px solid #333; border-radius: 8px; background: #1a1a2e; color: #fff; font-size: 14px; }
button { padding: 12px 24px; border: none; border-radius: 8px; font-size: 14px; font-weight: bold; cursor: pointer; }
button:disabled { opacity: 0.5; cursor: not-allowed; }
.btn-go { background: #00ff88; color: #000; }
.btn-stop { background: #ff4444; color: #fff; }
/* Metrics grid (label/value pairs) shown under the progress bar. */
.metrics { display: grid; grid-template-columns: repeat(auto-fit, minmax(120px, 1fr)); gap: 8px; padding: 12px; background: #1a1a2e; border-radius: 8px; font-size: 12px; }
.metric { display: flex; justify-content: space-between; }
.val { color: #00ff88; font-family: monospace; }
/* Thin playback progress bar; the script sets the inner bar's width as a percentage. */
.progress { height: 4px; background: #333; border-radius: 2px; margin-top: 10px; overflow: hidden; }
.progress-bar { height: 100%; background: #00ff88; width: 0%; transition: width 0.1s; }
</style>
</head>
<body>
<div class="container">
  <!-- Status line; role="status" exposes script-driven text changes to assistive technology. -->
  <div class="status" id="status" role="status">Carregando...</div>
  <!-- Stage: the looping idle video with the talking-frame canvas stacked on top of it. -->
  <div class="video-box">
    <video id="idleVideo" playsinline muted loop></video>
    <canvas id="talkCanvas"></canvas>
  </div>
  <!-- Controls have no visible labels, so each one carries an accessible name. -->
  <div class="controls">
    <textarea id="text" aria-label="Texto para o avatar falar">Hello! I am testing the avatar streaming.</textarea>
    <select id="voice" aria-label="Voz">
      <option value="tara">Tara</option>
      <option value="leah">Leah</option>
      <option value="jess">Jess</option>
      <option value="leo">Leo</option>
      <option value="dan">Dan</option>
    </select>
    <!-- Explicit type="button" so these never act as submit buttons if wrapped in a form. -->
    <button class="btn-go" id="btnGo" type="button">Gerar</button>
    <button class="btn-stop" id="btnStop" type="button" disabled>Parar</button>
  </div>
  <div class="progress"><div class="progress-bar" id="progress"></div></div>
  <div class="metrics">
    <div class="metric"><span>TTFB:</span><span class="val" id="mTtfb">--</span></div>
    <div class="metric"><span>Frames:</span><span class="val" id="mFrames">--</span></div>
    <div class="metric"><span>FPS:</span><span class="val" id="mFps">--</span></div>
    <div class="metric"><span>Audio:</span><span class="val" id="mAudio">--</span></div>
  </div>
</div>
<script>
// DOM handles used throughout the script.
const idleVideo = document.getElementById('idleVideo');
const talkCanvas = document.getElementById('talkCanvas');
const ctx = talkCanvas.getContext('2d');
const status = document.getElementById('status');
const progress = document.getElementById('progress');
const btnGo = document.getElementById('btnGo');
const btnStop = document.getElementById('btnStop');
// Shared player state, mutated by the functions below.
let ws = null; // WebSocket to the server; created by connect()
let frames = []; // Decoded frame images stored at their frame index for O(1) lookup
let isPlaying = false; // true while renderLoop() should keep drawing
let isBuffering = false; // true between 'stream_start' and the start of playback
let audioContext = null; // created lazily on first playback
let audioBuffer = null;
let audioSource = null; // current buffer source; set to null when the audio ends
let startTime = 0; // Date.now() captured when a generation request is sent
let frameCount = 0; // number of frames decoded so far for the current stream
let totalFrames = 0; // frame total from the 'done' message, later capped to the audio length
let playbackStartTime = 0; // performance.now() captured when playback starts
let animationId = null; // pending requestAnimationFrame handle, if any
let lastRenderedFrame = -1; // Avoids re-drawing the same frame index
let endVideoTimeMs = 0; // Idle-video time (ms) at which the speech ends; used to re-seek the idle clip
// Configuration
const TARGET_FPS = 25;
const FRAME_DURATION = 1000 / TARGET_FPS; // 40ms
// Load the idle clip (the <video> element already carries the loop attribute).
idleVideo.src = 'idle.mp4';

// Try to start the idle loop as soon as it can play. If play() is rejected,
// ask the user for a click and retry once from that click.
idleVideo.oncanplay = () => {
  const retryOnClick = () => {
    idleVideo.play();
    document.body.onclick = null;
  };
  const onPlayRejected = () => {
    setStatus('Clique na tela para iniciar', 'busy');
    document.body.onclick = retryOnClick;
  };
  idleVideo.play().catch(onPlayRejected);
};

// While the idle clip plays without an open socket, show that we are still connecting.
idleVideo.onplay = () => {
  const socketOpen = ws && ws.readyState === WebSocket.OPEN;
  if (socketOpen) return;
  setStatus('Conectando...', 'busy');
};
/**
 * Updates the status banner.
 * @param {string} txt - Text to display.
 * @param {string} [cls] - Optional modifier class (the stylesheet defines "ok" and "busy").
 */
function setStatus(txt, cls) {
  const modifier = cls || '';
  status.className = 'status ' + modifier;
  status.textContent = txt;
}
/**
 * Writes a value into one of the metric cells.
 * @param {string} id - Element id of the metric cell.
 * @param {*} val - Value assigned to the cell's text content.
 */
function setMetric(id, val) {
  const cell = document.getElementById(id);
  cell.textContent = val;
}
/** Resets every metric cell to the "--" placeholder and empties the progress bar. */
function resetMetrics() {
  for (const metricId of ['mTtfb', 'mFrames', 'mFps', 'mAudio']) {
    setMetric(metricId, '--');
  }
  progress.style.width = '0%';
}
/**
 * Opens the WebSocket to `/ws` on the current host and installs its handlers.
 *
 * Incoming messages are JSON objects dispatched on `type`:
 * 'status', 'stream_start', 'frame', 'audio', 'done' and 'error'.
 * When the socket closes, a reconnect is attempted after 3 seconds.
 */
function connect() {
  // Nothing to do while a socket is open or still handshaking; replacing it
  // would orphan the first socket with its handlers still attached.
  if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return;

  // A page served over HTTPS may only open secure sockets (ws:// is blocked
  // as mixed content), so mirror the page's own scheme.
  const scheme = location.protocol === 'https:' ? 'wss://' : 'ws://';
  ws = new WebSocket(scheme + location.host + '/ws');

  ws.onopen = () => setStatus('Pronto', 'ok');
  ws.onclose = () => { setStatus('Desconectado'); setTimeout(connect, 3000); };
  ws.onerror = () => setStatus('Erro de conexao');
  ws.onmessage = (e) => {
    // One malformed (or binary) message must not throw out of the handler.
    let msg;
    try {
      msg = JSON.parse(e.data);
    } catch (err) {
      console.warn('Ignoring non-JSON message:', err);
      return;
    }
    if (!msg || typeof msg !== 'object') return;

    console.log('MSG:', msg.type);
    switch (msg.type) {
      case 'status':
        setStatus(msg.message, 'busy');
        break;
      case 'stream_start':
        setMetric('mTtfb', msg.ttfb_ms + 'ms');
        setStatus('Recebendo frames...', 'busy');
        // Only start buffering here; playback waits for the audio message.
        startBuffering();
        break;
      case 'frame':
        // Queue the frame; nothing is drawn yet.
        addFrame(msg.frame, msg.index);
        break;
      case 'audio': {
        // Audio arrived: this is the trigger for synchronized playback.
        setMetric('mAudio', (msg.duration_ms / 1000).toFixed(2) + 's');
        // Effective FPS = frames known so far divided by the audio duration.
        const realFps = (totalFrames || frameCount) / (msg.duration_ms / 1000);
        setMetric('mFps', realFps.toFixed(1));
        startSyncedPlayback(msg.audio, msg.duration_ms);
        break;
      }
      case 'done':
        totalFrames = msg.frames;
        setMetric('mFrames', msg.frames);
        // Remember where the speech ends so the idle clip can be re-synced on stop.
        endVideoTimeMs = msg.end_video_time_ms || 0;
        console.log(`Done: ${msg.frames} frames, end_video_time: ${endVideoTimeMs}ms`);
        break;
      case 'error':
        // stopPlayback() resets the banner to 'Pronto', so it must run BEFORE
        // the error text is shown or the message would be overwritten at once.
        stopPlayback();
        setButtons(false);
        setStatus('Erro: ' + msg.message);
        break;
    }
  };
}
/**
 * Decodes one base64 JPEG frame and stores it at its index in `frames`.
 *
 * Decoding is asynchronous, so the frame only becomes available once the
 * image's load event fires.
 *
 * @param {string} base64Frame - Base64-encoded JPEG data (no data-URL prefix).
 * @param {number} index - Position of the frame in the stream.
 */
function addFrame(base64Frame, index) {
  // Remember which frame array was current when this frame arrived. Both
  // startBuffering() and stopPlayback() replace `frames`; a frame that finishes
  // decoding after such a reset belongs to the previous stream and is dropped.
  const streamFrames = frames;
  const img = new Image();
  img.onload = () => {
    if (streamFrames !== frames) return;
    // Store at the frame's own index so the render loop can look it up in O(1).
    frames[index] = img;
    frameCount++;
    // The first frame defines the canvas resolution.
    if (index === 0) {
      talkCanvas.width = img.width;
      talkCanvas.height = img.height;
    }
    // Show buffering progress until playback takes over the status line.
    if (isBuffering && !isPlaying) {
      setStatus(`Buffering: ${frameCount} frames...`, 'busy');
    }
  };
  img.onerror = () => {
    // A corrupt frame is skipped; the render loop falls back to an earlier frame.
    console.warn('Failed to decode frame', index);
  };
  img.src = 'data:image/jpeg;base64,' + base64Frame;
}
/**
 * Resets per-stream state so incoming frames can be collected.
 * Playback is NOT started here and the canvas stays hidden.
 */
function startBuffering() {
  // Fresh frame store and counters for the new stream.
  frames = [];
  frameCount = 0;
  totalFrames = 0;
  lastRenderedFrame = -1;
  // Buffering, not yet playing.
  isPlaying = false;
  isBuffering = true;
}
// Duration of the current audio clip in ms, as passed to startSyncedPlayback().
let audioDurationMs = 0;
// Milliseconds per video frame used by renderLoop() to map elapsed time to a frame index.
let dynamicFrameDuration = FRAME_DURATION;
/**
 * Decodes the audio payload and starts audio and frame rendering together.
 *
 * The payload is treated as 16-bit mono PCM played at 24 kHz. If it starts
 * with "RIFF", the first 44 bytes are skipped as a WAV header.
 *
 * @param {string} base64Audio - Base64-encoded audio bytes.
 * @param {number} durationMs - Audio duration in milliseconds, used to cap the frame count.
 * @returns {Promise<void>} Resolves once playback has started or the failure was reported.
 */
async function startSyncedPlayback(base64Audio, durationMs) {
  setStatus('Reproduzindo...', 'ok');
  try {
    // Create the AudioContext lazily and wake it if the browser suspended it.
    if (!audioContext) {
      audioContext = new (window.AudioContext || window.webkitAudioContext)();
    }
    if (audioContext.state === 'suspended') {
      await audioContext.resume();
    }

    // base64 -> raw bytes
    const binaryString = atob(base64Audio);
    const bytes = new Uint8Array(binaryString.length);
    for (let i = 0; i < binaryString.length; i++) {
      bytes[i] = binaryString.charCodeAt(i);
    }

    // Skip a WAV header when the payload starts with "RIFF".
    let pcmOffset = 0;
    if (bytes.length > 44 &&
        bytes[0] === 0x52 && bytes[1] === 0x49 &&
        bytes[2] === 0x46 && bytes[3] === 0x46) {
      console.log('WAV header detected, skipping 44 bytes');
      pcmOffset = 44;
    }

    // Pass an explicit sample count: without it, Int16Array throws a RangeError
    // when the remaining byte length is odd. A trailing odd byte is ignored.
    const sampleCount = Math.floor((bytes.length - pcmOffset) / 2);
    if (sampleCount === 0) {
      // createBuffer() rejects a zero-length buffer; fail with a clear message instead.
      throw new Error('Audio vazio');
    }
    const pcmData = new Int16Array(bytes.buffer, pcmOffset, sampleCount);
    const floatData = new Float32Array(sampleCount);
    for (let i = 0; i < sampleCount; i++) {
      floatData[i] = pcmData[i] / 32768.0;
    }

    // Quadratic fade-in over 1200 samples (50 ms at 24 kHz) to avoid a click at the start.
    const fadeInSamples = 1200;
    const fadeInEnd = Math.min(fadeInSamples, sampleCount);
    for (let i = 0; i < fadeInEnd; i++) {
      const t = i / fadeInSamples;
      floatData[i] *= t * t;
    }

    // Linear fade-out over the last 720 samples (30 ms at 24 kHz). The start is
    // clamped so clips shorter than the fade never produce negative indices;
    // the ramp still reaches zero at the final sample.
    const fadeOutSamples = 720;
    for (let j = Math.max(0, sampleCount - fadeOutSamples); j < sampleCount; j++) {
      floatData[j] *= (sampleCount - j) / fadeOutSamples;
    }

    audioBuffer = audioContext.createBuffer(1, sampleCount, 24000);
    audioBuffer.getChannelData(0).set(floatData);

    const source = audioContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(audioContext.destination);
    source.onended = () => {
      // Clearing the handle is the signal renderLoop() uses to stop. Only clear
      // it if it still points at THIS source, so a late 'ended' event from an
      // older source cannot cut off a newer playback.
      if (audioSource === source) {
        audioSource = null;
      }
    };
    audioSource = source;

    // Keep a fixed 25 fps and use only as many frames as fit in the audio.
    audioDurationMs = durationMs;
    dynamicFrameDuration = FRAME_DURATION;
    const framesNeeded = Math.floor(durationMs / FRAME_DURATION);
    const numFrames = totalFrames || frameCount;
    const framesToUse = Math.min(framesNeeded, numFrames);
    console.log(`Audio: ${durationMs}ms, Frames disponiveis: ${numFrames}, Frames a usar: ${framesToUse} (${(1000/dynamicFrameDuration).toFixed(1)}fps)`);
    totalFrames = framesToUse;

    // Draw the first frame BEFORE revealing the canvas to avoid a blank flash.
    if (frames[0]) {
      ctx.drawImage(frames[0], 0, 0);
      lastRenderedFrame = 0;
    }
    talkCanvas.style.display = 'block';

    // Start audio and video from the same instant.
    isPlaying = true;
    isBuffering = false;
    playbackStartTime = performance.now();
    source.start(0);
    console.log('Playback sincronizado iniciado:', frameCount, 'frames,', durationMs, 'ms audio');
    renderLoop();
  } catch (err) {
    console.error('Erro ao iniciar playback:', err);
    // stopPlayback() resets the banner to 'Pronto', so report the error AFTER
    // it; otherwise the message is overwritten immediately.
    stopPlayback();
    setStatus('Erro: ' + err.message);
  }
}
/**
 * Per-animation-frame tick: draws the frame that matches the elapsed playback
 * time, updates the progress bar and reschedules itself.
 * Ends playback as soon as the audio source handle has been cleared.
 */
function renderLoop() {
  if (!isPlaying) return;

  // Audio finished: switch back to the idle video right away.
  if (!audioSource) {
    stopPlayback();
    return;
  }

  const elapsedMs = performance.now() - playbackStartTime;
  const wantedIndex = Math.floor(elapsedMs / dynamicFrameDuration);
  const frameLimit = totalFrames || frameCount;

  // Draw only when the index changed and is still inside the clip.
  const needsDraw = wantedIndex !== lastRenderedFrame && wantedIndex < frameLimit;
  if (needsDraw) {
    let image = frames[wantedIndex];
    // Frame not decoded yet: walk back to the nearest earlier frame that is.
    for (let back = wantedIndex - 1; !image && back >= 0; back--) {
      image = frames[back];
    }
    if (image) {
      ctx.drawImage(image, 0, 0);
      lastRenderedFrame = wantedIndex;
    }
  }

  // Progress bar follows the target index, clamped to the clip length.
  if (frameLimit > 0) {
    const shown = Math.min(wantedIndex, frameLimit);
    progress.style.width = (shown / frameLimit * 100) + '%';
  }

  animationId = requestAnimationFrame(renderLoop);
}
/**
 * Stops audio and frame rendering, hides the talking canvas, optionally
 * re-seeks the idle video to where the speech ended, and restores the
 * ready state of the status line and buttons.
 */
function stopPlayback() {
  isBuffering = false;
  isPlaying = false;

  // Cancel the pending animation frame, if one is scheduled.
  if (animationId) {
    cancelAnimationFrame(animationId);
    animationId = null;
  }

  // Stop the audio source; any exception from stop() is ignored on purpose.
  if (audioSource) {
    try {
      audioSource.stop();
    } catch (e) {}
    audioSource = null;
  }

  // Hide the canvas first so the idle video underneath shows immediately.
  talkCanvas.style.display = 'none';
  ctx.clearRect(0, 0, talkCanvas.width, talkCanvas.height);

  // Re-sync the idle clip to the point where the speech ended (wrapped to the
  // clip length), then forget that point so it is applied only once.
  const canResync = endVideoTimeMs > 0 && idleVideo.duration > 0;
  if (canResync) {
    const targetTime = (endVideoTimeMs / 1000) % idleVideo.duration;
    console.log(`Idle video sync: seeking to ${targetTime.toFixed(2)}s (endVideoTimeMs=${endVideoTimeMs})`);
    if (idleVideo.fastSeek) {
      idleVideo.fastSeek(targetTime);
    } else {
      idleVideo.currentTime = targetTime;
    }
    // Make sure the idle clip is actually running.
    idleVideo.play().catch(() => {});
    endVideoTimeMs = 0;
  }

  frames = [];
  lastRenderedFrame = -1;
  setStatus('Pronto', 'ok');
  setButtons(false);
}
/**
 * Toggles the two action buttons.
 * @param {boolean} generating - true disables "Gerar" and enables "Parar"; false does the opposite.
 */
function setButtons(generating) {
  btnStop.disabled = !generating;
  btnGo.disabled = generating;
}
// "Gerar": validate the input, then send a generation request over the socket.
btnGo.onclick = () => {
  const utterance = document.getElementById('text').value.trim();
  if (!utterance) return;

  const socketReady = ws && ws.readyState === WebSocket.OPEN;
  if (!socketReady) {
    setStatus('Nao conectado');
    return;
  }

  resetMetrics();
  setButtons(true);
  setStatus('Gerando...', 'busy');
  startTime = Date.now();

  // Send the idle clip's current position along with the request.
  const idleVideoTimeMs = Math.floor(idleVideo.currentTime * 1000);
  console.log(`Idle video time: ${idleVideoTimeMs}ms`);

  const request = {
    action: 'generate',
    text: utterance,
    voice: document.getElementById('voice').value,
    idle_video_time_ms: idleVideoTimeMs
  };
  ws.send(JSON.stringify(request));
};
// "Parar": stop whatever is currently playing.
btnStop.onclick = () => stopPlayback();

// Open the WebSocket as soon as the script runs.
connect();
</script>
</body>
</html>