/**
 * Avatar Stream SDK with Smart Synchronization
 *
 * Extends the base SDK with:
 * 1. Speech start/end detection in the audio (VAD)
 * 2. Motion start/end detection in the video
 * 3. Automatic audio/video alignment
 * 4. Dynamic FPS adjustment
 */

import { SyncEngine, SyncConfig, SyncResult, SyncDebugInfo } from './SyncEngine';

// ============================================================================
// Types
// ============================================================================

export interface SyncedStreamConfig {
  /** WebSocket URL of the server. */
  wsUrl: string;

  /** Canvas the frames are rendered onto. */
  canvas: HTMLCanvasElement;

  /** Original FPS of the video (default: 25). */
  targetFps?: number;

  /** Audio sample rate (default: 24000). */
  audioSampleRate?: number;

  /** Enable smart synchronization (default: true). */
  enableSmartSync?: boolean;

  /** Wait for the whole content before synchronizing (default: false, i.e. streaming). */
  waitForComplete?: boolean;

  /** Threshold for speech detection (0-1, default: 0.02). */
  audioThreshold?: number;

  /** Threshold for motion detection (0-1, default: 0.05). */
  videoThreshold?: number;

  /** Callbacks. */
  onStatusChange?: (status: SyncedStreamStatus) => void;
  onMetricsUpdate?: (metrics: SyncedStreamMetrics) => void;
  onSyncUpdate?: (sync: SyncDebugInfo) => void;
  onError?: (error: Error) => void;
  onComplete?: (summary: SyncedStreamSummary) => void;
}

export type SyncedStreamStatus =
  | 'disconnected'
  | 'connecting'
  | 'connected'
  | 'receiving'
  | 'analyzing'
  | 'syncing'
  | 'playing'
  | 'completed'
  | 'error';

export interface SyncedStreamMetrics {
  framesReceived: number;
  framesRendered: number;
  audioChunksReceived: number;
  totalAudioSamples: number;
  videoDurationMs: number;
  audioDurationMs: number;
  originalFps: number;
  adjustedFps: number;
  syncDiffMs: number;
  audioSpeechStartMs: number | null;
  audioSpeechEndMs: number | null;
  videoSpeechStartFrame: number | null;
  videoSpeechEndFrame: number | null;
}

export interface SyncedStreamSummary {
  totalFrames: number;
  totalDurationMs: number;
  originalFps: number;
  adjustedFps: number;
  syncStrategy: string;
  audioSpeechDurationMs: number;
  videoSpeechDurationMs: number;
  finalSyncDiffMs: number;
}

// ============================================================================
// Constants
// ============================================================================

const MSG_FRAME = 0x01;
const MSG_AUDIO = 0x02;
const MSG_AUDIO_CHUNK = 0x03;

/** Frame message layout: type(1) + frameIndex(4) + dataSize(4) + payload. */
const FRAME_HEADER_BYTES = 9;
/** Audio-chunk message layout: type(1) + chunkIndex(4) + isLast(1) + dataSize(4) + payload. */
const AUDIO_CHUNK_HEADER_BYTES = 10;
/** Full-audio message layout: type(1) + sampleRate(4) + dataSize(4) + payload. */
const AUDIO_HEADER_BYTES = 9;

/** Streaming playback starts once this many frames have been buffered... */
const MIN_STREAMING_FRAMES = 10;
/** ...and this many non-empty audio chunks have been buffered. */
const MIN_STREAMING_AUDIO_CHUNKS = 2;

const HEARTBEAT_INTERVAL_MS = 30000;

/**
 * Converts 16-bit signed PCM bytes into normalized float samples in [-1, 1).
 *
 * A trailing odd byte is ignored instead of throwing, and the input does not
 * need to be 2-byte aligned. Samples are read as little-endian, matching the
 * byte order used for the message header fields.
 * NOTE(review): assumes the server sends little-endian PCM — confirm.
 */
function pcm16ToFloat32(bytes: Uint8Array): Float32Array {
  const sampleCount = Math.floor(bytes.byteLength / 2);
  const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
  const samples = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    samples[i] = view.getInt16(i * 2, true) / 32768;
  }
  return samples;
}

// ============================================================================
// Main Class
// ============================================================================

/**
 * Receives JPEG frames and PCM16 audio over a WebSocket, renders the frames on
 * a canvas and plays the audio through the Web Audio API.
 *
 * Two playback modes exist:
 * - streaming (`waitForComplete: false`): playback begins as soon as a small
 *   buffer of frames and audio chunks is available;
 * - complete (`waitForComplete: true`, or the stream ended before the
 *   streaming buffer filled): everything is collected first, `SyncEngine`
 *   computes the alignment, then synchronized playback starts.
 */
export class AvatarStreamSDKSync {
  private readonly config: Required<SyncedStreamConfig>;
  private readonly ctx: CanvasRenderingContext2D;
  private readonly syncEngine: SyncEngine;

  // WebSocket
  private ws: WebSocket | null = null;

  // State
  private status: SyncedStreamStatus = 'disconnected';
  private startTime = 0;
  /** Bumped by cleanup(); async work captured under an older value is stale. */
  private session = 0;

  // Collected data
  private frameBlobs: Blob[] = [];
  private frameImages: HTMLImageElement[] = [];
  private audioChunks: Uint8Array[] = [];
  private allAudioSamples: Float32Array | null = null;
  private totalAudioSamples = 0;

  // Playback state
  private isPlaying = false;
  private currentFrameIndex = 0;
  private renderInterval: number | null = null;
  private videoStartTimer: number | null = null;
  private audioContext: AudioContext | null = null;
  private audioSource: AudioBufferSourceNode | null = null;

  // Sync results
  private syncResult: SyncResult | null = null;

  // Streaming mode (before all data arrives)
  private streamingStarted = false;
  /** True once the server reported 'done' for the current generation. */
  private streamEnded = false;
  private nextAudioTime = 0;
  /** Chunks received before streaming playback started; scheduled at start. */
  private pendingStreamChunks: Uint8Array[] = [];
  /** Sources already scheduled in streaming mode, kept so stop() can silence them. */
  private readonly streamingSources = new Set<AudioBufferSourceNode>();
  /** Highest frame index drawn in streaming mode; guards against late decodes. */
  private lastDrawnStreamingFrame = -1;

  // Heartbeat
  private heartbeatInterval: number | null = null;

  /**
   * @param config Stream configuration; optional fields fall back to the
   *   defaults documented on {@link SyncedStreamConfig}.
   * @throws Error if `wsUrl` or `canvas` is missing, or the canvas cannot
   *   provide a 2D context.
   */
  constructor(config: SyncedStreamConfig) {
    if (!config.wsUrl) throw new Error('wsUrl is required');
    if (!config.canvas) throw new Error('canvas is required');

    const ctx = config.canvas.getContext('2d');
    if (!ctx) throw new Error('Could not get 2D context from canvas');
    this.ctx = ctx;

    this.config = {
      wsUrl: config.wsUrl,
      canvas: config.canvas,
      targetFps: config.targetFps ?? 25,
      audioSampleRate: config.audioSampleRate ?? 24000,
      enableSmartSync: config.enableSmartSync ?? true,
      waitForComplete: config.waitForComplete ?? false,
      audioThreshold: config.audioThreshold ?? 0.02,
      videoThreshold: config.videoThreshold ?? 0.05,
      onStatusChange: config.onStatusChange ?? (() => {}),
      onMetricsUpdate: config.onMetricsUpdate ?? (() => {}),
      onSyncUpdate: config.onSyncUpdate ?? (() => {}),
      onError: config.onError ?? (() => {}),
      onComplete: config.onComplete ?? (() => {}),
    };

    this.syncEngine = new SyncEngine({
      audioSampleRate: this.config.audioSampleRate,
      videoFps: this.config.targetFps,
      audioThreshold: this.config.audioThreshold,
      videoThreshold: this.config.videoThreshold,
      onDebug: (info) => this.config.onSyncUpdate(info),
    });
  }

  // ==========================================================================
  // Public API
  // ==========================================================================

  /**
   * Opens the WebSocket connection. Resolves once the socket is open; a no-op
   * if it is already open.
   *
   * Rejects when the connection errors or closes before it was established.
   */
  async connect(): Promise<void> {
    if (this.ws?.readyState === WebSocket.OPEN) return;

    return new Promise<void>((resolve, reject) => {
      this.setStatus('connecting');

      const ws = new WebSocket(this.config.wsUrl);
      ws.binaryType = 'arraybuffer';
      this.ws = ws;
      let opened = false;

      ws.onopen = () => {
        opened = true;
        this.setStatus('connected');
        this.startHeartbeat();
        resolve();
      };

      ws.onmessage = (event: MessageEvent) => {
        // Ignore traffic from a socket that was replaced or explicitly closed.
        if (this.ws !== ws) return;

        if (event.data instanceof ArrayBuffer) {
          this.handleBinaryMessage(event.data);
          return;
        }

        let parsed: unknown;
        try {
          parsed = JSON.parse(event.data);
        } catch (e) {
          this.config.onError(new Error(`Invalid JSON: ${e}`));
          return;
        }
        // Handled outside the parse try-block so that handler failures are
        // not misreported as JSON errors.
        try {
          this.handleJsonMessage(parsed);
        } catch (e) {
          this.config.onError(e instanceof Error ? e : new Error(String(e)));
        }
      };

      ws.onclose = () => {
        // Settle the connect() promise if the socket never opened (no-op when
        // onerror already rejected it).
        if (!opened) {
          reject(new Error('WebSocket closed before the connection was established'));
        }
        // disconnect() already reported the final status for this socket.
        if (this.ws !== ws) return;
        this.setStatus('disconnected');
        this.stopHeartbeat();
      };

      ws.onerror = () => {
        const error = new Error('WebSocket connection failed');
        this.config.onError(error);
        reject(error);
      };
    });
  }

  /**
   * Stops playback, asks the server to stop, closes the socket and releases
   * the audio context. Emits a single 'disconnected' status change.
   */
  disconnect(): void {
    this.sendStopRequest();
    this.cleanup();
    this.stopHeartbeat();

    const ws = this.ws;
    if (ws) {
      // Clear the reference first so the socket's own onclose handler knows
      // it has been superseded and does not emit a duplicate status.
      this.ws = null;
      ws.close();
    }

    if (this.audioContext) {
      // Best effort: close() rejects only when the context is already closed.
      void this.audioContext.close().catch(() => undefined);
      this.audioContext = null;
    }

    this.setStatus('disconnected');
  }

  /**
   * Requests generation of a new avatar stream. Any previous playback state is
   * discarded first.
   *
   * @param text Text to speak; surrounding whitespace is trimmed.
   * @param voice Voice identifier sent to the server (default: 'tara').
   * @throws Error if the WebSocket is not open.
   */
  async generate(text: string, voice: string = 'tara'): Promise<void> {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      throw new Error('WebSocket not connected');
    }

    this.resetState();
    this.startTime = Date.now();
    this.setStatus('receiving');

    this.ws.send(JSON.stringify({
      action: 'generate',
      text: text.trim(),
      voice,
    }));
  }

  /**
   * Asks the server to stop (when connected) and halts local audio and video
   * playback. The reported status reflects the actual socket state.
   */
  stop(): void {
    this.sendStopRequest();
    this.cleanup();
    this.setStatus(this.ws?.readyState === WebSocket.OPEN ? 'connected' : 'disconnected');
  }

  /** Returns a snapshot of the reception, playback and sync metrics. */
  getMetrics(): SyncedStreamMetrics {
    const videoDurationMs = (this.frameBlobs.length / this.config.targetFps) * 1000;
    const audioDurationMs = (this.totalAudioSamples / this.config.audioSampleRate) * 1000;

    return {
      framesReceived: this.frameBlobs.length,
      framesRendered: this.currentFrameIndex,
      audioChunksReceived: this.audioChunks.length,
      totalAudioSamples: this.totalAudioSamples,
      videoDurationMs,
      audioDurationMs,
      originalFps: this.config.targetFps,
      adjustedFps: this.syncResult?.adjustedFps ?? this.config.targetFps,
      syncDiffMs: audioDurationMs - videoDurationMs,
      audioSpeechStartMs: this.syncResult?.debug.audioStartMs ?? null,
      audioSpeechEndMs: this.syncResult?.debug.audioEndMs ?? null,
      videoSpeechStartFrame: this.syncResult?.debug.videoStartFrame ?? null,
      videoSpeechEndFrame: this.syncResult?.debug.videoEndFrame ?? null,
    };
  }

  // ==========================================================================
  // Message Handling
  // ==========================================================================

  /**
   * Decodes one binary protocol message and dispatches it by type. Malformed
   * (empty or truncated) messages are reported through `onError` and dropped
   * instead of throwing inside the socket handler. Unknown message types are
   * ignored.
   */
  private handleBinaryMessage(buffer: ArrayBuffer): void {
    if (buffer.byteLength < 1) {
      this.config.onError(new Error('Malformed binary message: empty payload'));
      return;
    }

    const view = new DataView(buffer);
    const msgType = view.getUint8(0);

    switch (msgType) {
      case MSG_FRAME: {
        const frameData = this.readPayload(buffer, view, FRAME_HEADER_BYTES, 5, 'frame');
        if (!frameData) return;
        this.handleFrame(frameData, view.getUint32(1, true));
        break;
      }
      case MSG_AUDIO_CHUNK: {
        const chunkData = this.readPayload(buffer, view, AUDIO_CHUNK_HEADER_BYTES, 6, 'audio chunk');
        if (!chunkData) return;
        const chunkIndex = view.getUint32(1, true);
        const isLast = view.getUint8(5) === 1;
        this.handleAudioChunk(chunkData, chunkIndex, isLast);
        break;
      }
      case MSG_AUDIO: {
        const audioData = this.readPayload(buffer, view, AUDIO_HEADER_BYTES, 5, 'audio');
        if (!audioData) return;
        this.handleFullAudio(audioData, view.getUint32(1, true));
        break;
      }
      default:
        break;
    }

    this.emitMetrics();
  }

  /**
   * Validates a message's header/payload lengths and returns a view over the
   * payload, or null (after reporting via `onError`) when the message is
   * shorter than its header or than its declared payload size.
   *
   * @param headerBytes Total header length for this message type.
   * @param sizeOffset Byte offset of the little-endian uint32 payload size.
   * @param label Human-readable message kind used in the error text.
   */
  private readPayload(
    buffer: ArrayBuffer,
    view: DataView,
    headerBytes: number,
    sizeOffset: number,
    label: string,
  ): Uint8Array | null {
    if (buffer.byteLength < headerBytes) {
      this.config.onError(new Error(`Malformed ${label} message: truncated header`));
      return null;
    }
    const dataSize = view.getUint32(sizeOffset, true);
    if (headerBytes + dataSize > buffer.byteLength) {
      this.config.onError(new Error(`Malformed ${label} message: truncated payload`));
      return null;
    }
    return new Uint8Array(buffer, headerBytes, dataSize);
  }

  /** Handles a parsed JSON control message ('done' or 'error'). */
  private handleJsonMessage(data: unknown): void {
    if (typeof data !== 'object' || data === null) {
      this.config.onError(new Error('Invalid JSON: expected an object message'));
      return;
    }

    const message = data as { type?: unknown; message?: unknown };
    if (message.type === 'done') {
      // onStreamComplete reports its own failures; nothing to await here.
      void this.onStreamComplete();
    } else if (message.type === 'error') {
      const text = typeof message.message === 'string' ? message.message : 'Unknown server error';
      this.config.onError(new Error(text));
      this.setStatus('error');
    }
  }

  /** Stores one JPEG frame and, in streaming mode, tries to start playback. */
  private handleFrame(frameData: Uint8Array, _frameIndex: number): void {
    const blob = new Blob([frameData], { type: 'image/jpeg' });
    this.frameBlobs.push(blob);

    // If we are not waiting for everything, start streaming.
    if (!this.config.waitForComplete && !this.streamingStarted) {
      this.tryStartStreaming();
    }
  }

  /**
   * Stores one PCM16 audio chunk. In streaming mode the chunk is scheduled
   * right away once playback has started; before that it is queued so audio
   * and video begin together.
   */
  private handleAudioChunk(chunkData: Uint8Array, _chunkIndex: number, _isLast: boolean): void {
    if (chunkData.length === 0) return;

    // Copy so the chunk does not keep the whole message buffer alive.
    const chunk = new Uint8Array(chunkData);
    this.audioChunks.push(chunk);
    this.totalAudioSamples += Math.floor(chunk.length / 2);

    if (this.config.waitForComplete) return;

    if (this.streamingStarted) {
      this.scheduleAudioChunk(chunk);
    } else {
      this.pendingStreamChunks.push(chunk);
      this.tryStartStreaming();
    }
  }

  /**
   * Replaces any collected audio with a single full-audio payload.
   * NOTE(review): the sample rate sent by the server is ignored; playback
   * always uses `config.audioSampleRate` — confirm they are guaranteed equal.
   */
  private handleFullAudio(audioData: Uint8Array, _sampleRate: number): void {
    this.audioChunks = [audioData];
    this.totalAudioSamples = Math.floor(audioData.length / 2);
  }

  // ==========================================================================
  // Streaming Mode (before everything has arrived)
  // ==========================================================================

  /**
   * Starts streaming playback once enough frames and audio chunks are
   * buffered. Queued audio is scheduled at that moment so that audio and
   * video start together.
   */
  private tryStartStreaming(): void {
    if (this.streamingStarted || this.streamEnded) return;
    if (
      this.frameBlobs.length < MIN_STREAMING_FRAMES ||
      this.pendingStreamChunks.length < MIN_STREAMING_AUDIO_CHUNKS
    ) {
      return;
    }

    this.streamingStarted = true;
    this.setStatus('playing');

    const queued = this.pendingStreamChunks;
    this.pendingStreamChunks = [];
    for (const chunk of queued) {
      this.scheduleAudioChunk(chunk);
    }

    this.startStreamingPlayback();
  }

  /**
   * Schedules one PCM16 chunk back-to-back after the previously scheduled
   * audio. Failures are logged and the chunk is skipped.
   */
  private scheduleAudioChunk(chunkData: Uint8Array): void {
    try {
      const isNewContext = this.audioContext === null;
      const audioContext = this.ensureAudioContext();
      if (isNewContext) {
        // Small lead time so the first chunk is not scheduled in the past.
        this.nextAudioTime = audioContext.currentTime + 0.05;
      }

      const floatSamples = pcm16ToFloat32(chunkData);
      // createBuffer rejects zero-length buffers.
      if (floatSamples.length === 0) return;

      const chunkBuffer = audioContext.createBuffer(
        1,
        floatSamples.length,
        this.config.audioSampleRate,
      );
      chunkBuffer.getChannelData(0).set(floatSamples);

      const source = audioContext.createBufferSource();
      source.buffer = chunkBuffer;
      source.connect(audioContext.destination);

      // Track the source so stop() can silence audio that is already queued.
      this.streamingSources.add(source);
      source.onended = () => {
        this.streamingSources.delete(source);
        source.disconnect();
      };

      // After an underrun, restart slightly ahead of the current time.
      if (this.nextAudioTime < audioContext.currentTime) {
        this.nextAudioTime = audioContext.currentTime + 0.01;
      }

      source.start(this.nextAudioTime);
      this.nextAudioTime += floatSamples.length / this.config.audioSampleRate;
    } catch (e) {
      console.error('Error scheduling audio:', e);
    }
  }

  /** Starts the fixed-rate render loop used in streaming mode. */
  private startStreamingPlayback(): void {
    // Use the plain target FPS; no sync analysis is available while streaming.
    const frameInterval = 1000 / this.config.targetFps;
    this.isPlaying = true;

    this.renderInterval = window.setInterval(() => {
      this.renderNextStreamingFrame();
    }, frameInterval);
  }

  /**
   * Decodes and draws the next buffered frame. When the buffer is exhausted
   * it either waits for more frames or, once the server has signalled the
   * end of the stream, finishes playback.
   */
  private renderNextStreamingFrame(): void {
    if (this.currentFrameIndex >= this.frameBlobs.length) {
      if (this.streamEnded) {
        this.finishPlayback();
      }
      // Otherwise wait for more frames.
      return;
    }

    const frameIndex = this.currentFrameIndex;
    const session = this.session;
    const url = URL.createObjectURL(this.frameBlobs[frameIndex]);
    const img = new Image();

    img.onload = () => {
      URL.revokeObjectURL(url);
      // Skip frames decoded after stop()/generate(), and frames that finished
      // decoding after a newer frame was already drawn.
      if (session !== this.session || frameIndex < this.lastDrawnStreamingFrame) return;
      this.lastDrawnStreamingFrame = frameIndex;
      this.drawFrame(img);
    };
    img.onerror = () => {
      // Undecodable frame: drop it, but do not leak the object URL.
      URL.revokeObjectURL(url);
    };
    img.src = url;

    this.currentFrameIndex++;
    this.emitMetrics();
  }

  // ==========================================================================
  // Complete Mode (after everything has arrived - precise synchronization)
  // ==========================================================================

  /**
   * Called when the server reports 'done'. In streaming mode the render loop
   * is left running until every buffered frame has been shown. Otherwise the
   * collected audio and frames are analyzed and synchronized playback starts.
   */
  private async onStreamComplete(): Promise<void> {
    if (this.streamEnded) return;
    this.streamEnded = true;

    // Streaming already running: renderNextStreamingFrame finishes playback
    // after the last buffered frame.
    if (this.streamingStarted) return;

    const session = this.session;
    // Streaming never started, so none of the queued audio was played; it is
    // played once by the synchronized path below.
    this.pendingStreamChunks = [];

    this.setStatus('analyzing');

    try {
      // Prepare the complete audio.
      await this.prepareFullAudio();

      // Prepare images for analysis.
      await this.prepareFrameImages();
      if (session !== this.session) return;

      if (this.config.enableSmartSync && this.allAudioSamples && this.frameImages.length > 0) {
        this.setStatus('syncing');

        // Compute the smart synchronization.
        const result = await this.syncEngine.calculateSync(
          this.allAudioSamples,
          this.frameImages,
          (this.totalAudioSamples / this.config.audioSampleRate) * 1000
        );
        if (session !== this.session) return;

        this.syncResult = result;
        console.log('Sync result:', this.syncResult.debug);
      }

      // Start synchronized playback.
      this.setStatus('playing');
      await this.startSyncedPlayback();
    } catch (e) {
      // A stop()/generate() during analysis makes this failure irrelevant.
      if (session !== this.session) return;
      console.error('Sync error:', e);
      this.config.onError(e instanceof Error ? e : new Error(String(e)));
      this.setStatus('error');
    }
  }

  /** Concatenates all collected audio chunks and converts them to float samples. */
  private async prepareFullAudio(): Promise<void> {
    if (this.audioChunks.length === 0) return;

    // Combine all chunks.
    const totalLength = this.audioChunks.reduce((sum, c) => sum + c.length, 0);
    const combined = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of this.audioChunks) {
      combined.set(chunk, offset);
      offset += chunk.length;
    }

    // Convert to Float32Array.
    this.allAudioSamples = pcm16ToFloat32(combined);
  }

  /**
   * Decodes every collected frame into an image, in order.
   *
   * @throws Error (rejection) if a frame cannot be decoded.
   */
  private async prepareFrameImages(): Promise<void> {
    const session = this.session;
    const images: HTMLImageElement[] = [];
    this.frameImages = [];

    for (let i = 0; i < this.frameBlobs.length; i++) {
      // Abort quietly when playback was stopped or restarted meanwhile.
      if (session !== this.session) return;
      images.push(await this.decodeFrame(this.frameBlobs[i], i));
    }

    if (session === this.session) {
      this.frameImages = images;
    }
  }

  /**
   * Decodes one frame blob. The temporary object URL is revoked as soon as
   * decoding settles; a loaded image keeps its pixels after that.
   */
  private decodeFrame(blob: Blob, index: number): Promise<HTMLImageElement> {
    return new Promise<HTMLImageElement>((resolve, reject) => {
      const url = URL.createObjectURL(blob);
      const img = new Image();
      img.onload = () => {
        URL.revokeObjectURL(url);
        resolve(img);
      };
      img.onerror = () => {
        URL.revokeObjectURL(url);
        reject(new Error(`Failed to decode frame ${index}`));
      };
      img.src = url;
    });
  }

  /**
   * Starts audio and video with the delays computed by the sync engine.
   * Video still plays when no audio was received.
   */
  private async startSyncedPlayback(): Promise<void> {
    // Apply delays if needed; negative values are treated as "no delay".
    const audioDelay = Math.max(0, this.syncResult?.audioDelayMs ?? 0);
    const videoDelay = Math.max(0, this.syncResult?.videoDelayMs ?? 0);

    if (this.allAudioSamples && this.allAudioSamples.length > 0) {
      // Create the AudioContext and buffer.
      const audioContext = this.ensureAudioContext();
      const audioBuffer = audioContext.createBuffer(
        1,
        this.allAudioSamples.length,
        this.config.audioSampleRate
      );
      audioBuffer.getChannelData(0).set(this.allAudioSamples);

      // Start the audio.
      const source = audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(audioContext.destination);
      source.start(audioContext.currentTime + audioDelay / 1000);
      this.audioSource = source;
    }

    // Start the video after its delay; tracked so stop() can cancel it.
    this.videoStartTimer = window.setTimeout(() => {
      this.videoStartTimer = null;
      this.startSyncedVideoPlayback();
    }, videoDelay);
  }

  /**
   * Runs the render loop for synchronized playback at the adjusted FPS.
   *
   * When the sync engine supplies a non-empty `frameMap`, it is treated as
   * the full playback timeline: tick i shows frame `frameMap[i]` and playback
   * ends when the map ends. Without a map, frames are shown in order.
   * NOTE(review): assumes `frameMap` covers the entire playback — confirm
   * against SyncEngine.
   */
  private startSyncedVideoPlayback(): void {
    const adjustedFps = this.syncResult?.adjustedFps ?? this.config.targetFps;
    // Guard against a zero/negative/NaN FPS producing a degenerate interval.
    const fps = adjustedFps > 0 ? adjustedFps : this.config.targetFps;
    const frameInterval = 1000 / fps;

    const frameMap = this.syncResult?.frameMap;
    const mappedFrames = frameMap && frameMap.length > 0 ? frameMap : null;
    const totalTicks = mappedFrames ? mappedFrames.length : this.frameImages.length;

    this.currentFrameIndex = 0;
    this.isPlaying = true;

    this.renderInterval = window.setInterval(() => {
      if (!this.isPlaying || this.currentFrameIndex >= totalTicks) {
        this.finishPlayback();
        return;
      }

      // Use the frame map when available.
      const frameToRender = mappedFrames
        ? mappedFrames[this.currentFrameIndex]
        : this.currentFrameIndex;

      // Render the frame; out-of-range map entries are skipped.
      const img = this.frameImages[frameToRender];
      if (img) {
        this.drawFrame(img);
      }

      this.currentFrameIndex++;
      this.emitMetrics();
    }, frameInterval);
  }

  /** Draws an image at the canvas origin, resizing the canvas to fit it first. */
  private drawFrame(img: HTMLImageElement): void {
    const { canvas } = this.config;
    if (canvas.width !== img.width || canvas.height !== img.height) {
      canvas.width = img.width;
      canvas.height = img.height;
    }
    this.ctx.drawImage(img, 0, 0);
  }

  /**
   * Stops the render loop, reports the summary via `onComplete` and moves to
   * the 'completed' status. Audio that is still playing is left to finish.
   */
  private finishPlayback(): void {
    this.isPlaying = false;

    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
      this.renderInterval = null;
    }

    const metrics = this.getMetrics();

    const summary: SyncedStreamSummary = {
      totalFrames: this.currentFrameIndex,
      totalDurationMs: Date.now() - this.startTime,
      originalFps: this.config.targetFps,
      adjustedFps: this.syncResult?.adjustedFps ?? this.config.targetFps,
      syncStrategy: this.syncResult?.debug.syncStrategy ?? 'normal',
      audioSpeechDurationMs: this.syncResult?.debug.audioSpeechDuration ?? metrics.audioDurationMs,
      videoSpeechDurationMs:
        ((this.syncResult?.debug.videoSpeechFrames ?? this.frameBlobs.length) /
          this.config.targetFps) *
        1000,
      finalSyncDiffMs: metrics.syncDiffMs,
    };

    this.config.onComplete(summary);
    this.setStatus('completed');
  }

  // ==========================================================================
  // Utilities
  // ==========================================================================

  /** Records the new status and notifies `onStatusChange`. */
  private setStatus(status: SyncedStreamStatus): void {
    this.status = status;
    this.config.onStatusChange(status);
  }

  /** Pushes the current metrics snapshot to `onMetricsUpdate`. */
  private emitMetrics(): void {
    this.config.onMetricsUpdate(this.getMetrics());
  }

  /** Sends the 'stop' action to the server when the socket is open. */
  private sendStopRequest(): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ action: 'stop' }));
    }
  }

  /**
   * Returns the shared AudioContext, creating it on first use, and asks a
   * suspended context to resume.
   *
   * @throws Error if the browser exposes no AudioContext implementation.
   */
  private ensureAudioContext(): AudioContext {
    if (!this.audioContext) {
      const legacyWindow = window as Window & { webkitAudioContext?: typeof AudioContext };
      const AudioContextCtor = window.AudioContext ?? legacyWindow.webkitAudioContext;
      if (!AudioContextCtor) {
        throw new Error('Web Audio API is not supported in this browser');
      }
      this.audioContext = new AudioContextCtor({
        sampleRate: this.config.audioSampleRate,
      });
    }

    if (this.audioContext.state === 'suspended') {
      // Best effort: autoplay policies may refuse until a user gesture occurs.
      void this.audioContext.resume().catch(() => undefined);
    }

    return this.audioContext;
  }

  /** Stops playback and clears everything collected for the previous generation. */
  private resetState(): void {
    this.cleanup();

    this.frameBlobs = [];
    this.frameImages = [];
    this.audioChunks = [];
    this.allAudioSamples = null;
    this.totalAudioSamples = 0;
    this.currentFrameIndex = 0;
    this.syncResult = null;
    this.streamingStarted = false;
    this.streamEnded = false;
    this.nextAudioTime = 0;
    this.lastDrawnStreamingFrame = -1;
  }

  /**
   * Halts all local playback: render loop, pending video start, the
   * synchronized audio source and any streaming audio already scheduled.
   * Also invalidates in-flight async work from the current session.
   */
  private cleanup(): void {
    this.session++;
    this.isPlaying = false;
    this.pendingStreamChunks = [];

    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
      this.renderInterval = null;
    }

    if (this.videoStartTimer !== null) {
      clearTimeout(this.videoStartTimer);
      this.videoStartTimer = null;
    }

    if (this.audioSource) {
      this.stopSource(this.audioSource);
      this.audioSource = null;
    }

    for (const source of this.streamingSources) {
      this.stopSource(source);
    }
    this.streamingSources.clear();
  }

  /** Stops and detaches an audio source, tolerating sources that already ended. */
  private stopSource(source: AudioBufferSourceNode): void {
    source.onended = null;
    try {
      source.stop();
    } catch {
      // stop() throws if the source is not in a stoppable state; nothing to do.
    }
    source.disconnect();
  }

  /** Starts the periodic 'ping' that keeps the WebSocket alive. */
  private startHeartbeat(): void {
    // Never run two heartbeat timers at once.
    this.stopHeartbeat();
    this.heartbeatInterval = window.setInterval(() => {
      if (this.ws?.readyState === WebSocket.OPEN) {
        this.ws.send(JSON.stringify({ action: 'ping' }));
      }
    }, HEARTBEAT_INTERVAL_MS);
  }

  /** Stops the heartbeat timer, if any. */
  private stopHeartbeat(): void {
    if (this.heartbeatInterval !== null) {
      clearInterval(this.heartbeatInterval);
      this.heartbeatInterval = null;
    }
  }
}

export default AvatarStreamSDKSync;