speech2speech-interface / interface /sdk /src /AvatarStreamSDKSync.ts
marcosremar2's picture
Add SDK and improve video transition synchronization
3acaae2
Raw
History Blame Contribute Delete
20.8 kB
/**
* Avatar Stream SDK com Sincronização Inteligente
*
* Estende o SDK base com:
* 1. Detecção de início/fim de fala no áudio (VAD)
* 2. Detecção de início/fim de movimento no vídeo
* 3. Alinhamento automático áudio/vídeo
* 4. Ajuste dinâmico de FPS
*/
import { SyncEngine, SyncConfig, SyncResult, SyncDebugInfo } from './SyncEngine';
// ============================================================================
// Types
// ============================================================================
/**
 * Options accepted by the `AvatarStreamSDKSync` constructor.
 *
 * Only `wsUrl` and `canvas` are mandatory; every other field is optional and
 * falls back to the default noted on it.
 */
export interface SyncedStreamConfig {
  /** WebSocket URL of the server. */
  wsUrl: string;

  /** Canvas the video frames are rendered onto. */
  canvas: HTMLCanvasElement;

  /** Original FPS of the video (default: 25). */
  targetFps?: number;

  /** Sample rate of the audio, in Hz (default: 24000). */
  audioSampleRate?: number;

  /** Enable smart synchronization (default: true). */
  enableSmartSync?: boolean;

  /** Wait for all content before synchronizing (default: false, i.e. streaming). */
  waitForComplete?: boolean;

  /** Threshold for speech detection in the audio (0-1, default: 0.02). */
  audioThreshold?: number;

  /** Threshold for motion detection in the video (0-1, default: 0.05). */
  videoThreshold?: number;

  // --- Callbacks ------------------------------------------------------------

  /** Invoked whenever the stream status changes. */
  onStatusChange?: (status: SyncedStreamStatus) => void;

  /** Invoked with a fresh metrics snapshot. */
  onMetricsUpdate?: (metrics: SyncedStreamMetrics) => void;

  /** Invoked with debug information forwarded from the sync engine. */
  onSyncUpdate?: (sync: SyncDebugInfo) => void;

  /** Invoked when an error is reported. */
  onError?: (error: Error) => void;

  /** Invoked with the final summary once playback finishes. */
  onComplete?: (summary: SyncedStreamSummary) => void;
}
/**
 * Lifecycle states reported through `onStatusChange`.
 *
 * The literals are listed in the order a successful run typically visits them;
 * `error` can be entered from any state.
 */
export type SyncedStreamStatus =
  // Connection
  | 'disconnected'
  | 'connecting'
  | 'connected'
  // Data reception and analysis
  | 'receiving'
  | 'analyzing'
  | 'syncing'
  // Playback
  | 'playing'
  | 'completed'
  // Failure
  | 'error';
/**
 * Snapshot of reception/playback counters delivered to `onMetricsUpdate`
 * and returned by `getMetrics()`.
 */
export interface SyncedStreamMetrics {
  // --- Counters -------------------------------------------------------------

  /** Number of video frames received so far. */
  framesReceived: number;
  /** Number of video frames rendered so far. */
  framesRendered: number;
  /** Number of audio chunks received so far. */
  audioChunksReceived: number;
  /** Total number of audio samples received so far. */
  totalAudioSamples: number;

  // --- Durations and rates ----------------------------------------------------

  /** Video duration in milliseconds, derived from the frame count and the original FPS. */
  videoDurationMs: number;
  /** Audio duration in milliseconds, derived from the sample count and the sample rate. */
  audioDurationMs: number;
  /** FPS the video was configured with. */
  originalFps: number;
  /** FPS after synchronization; equals `originalFps` when no sync result exists. */
  adjustedFps: number;
  /** `audioDurationMs - videoDurationMs`. */
  syncDiffMs: number;

  // --- Detection results (null until a sync result is available) -------------

  /** Detected speech start in the audio, in milliseconds. */
  audioSpeechStartMs: number | null;
  /** Detected speech end in the audio, in milliseconds. */
  audioSpeechEndMs: number | null;
  /** Frame index where speech motion starts in the video. */
  videoSpeechStartFrame: number | null;
  /** Frame index where speech motion ends in the video. */
  videoSpeechEndFrame: number | null;
}
/**
 * Final report handed to `onComplete` when playback finishes.
 */
export interface SyncedStreamSummary {
  /** Number of frames that were played. */
  totalFrames: number;
  /** Wall-clock time, in milliseconds, from the generate request to the end of playback. */
  totalDurationMs: number;

  /** FPS the video was configured with. */
  originalFps: number;
  /** FPS actually used after synchronization. */
  adjustedFps: number;
  /** Name of the synchronization strategy that was applied. */
  syncStrategy: string;

  /** Duration of the speech portion of the audio, in milliseconds. */
  audioSpeechDurationMs: number;
  /** Duration of the speech portion of the video, in milliseconds. */
  videoSpeechDurationMs: number;
  /** Audio duration minus video duration, in milliseconds, at the end of playback. */
  finalSyncDiffMs: number;
}
// ============================================================================
// Constants
// ============================================================================
// Binary message type tags: the first byte of every binary WebSocket message.
// Layouts below are the ones decoded by handleBinaryMessage; multi-byte
// integers are read as little-endian.

// Video frame: [type:u8][frameIndex:u32][dataSize:u32][payload]
const MSG_FRAME = 0x01;
// Complete audio: [type:u8][sampleRate:u32][dataSize:u32][payload]
const MSG_AUDIO = 0x02;
// Audio chunk: [type:u8][chunkIndex:u32][isLast:u8][dataSize:u32][payload]
const MSG_AUDIO_CHUNK = 0x03;
// ============================================================================
// Main Class
// ============================================================================
/**
 * Client that receives avatar video frames and 16-bit PCM audio over a
 * WebSocket and plays them back on a canvas / through the Web Audio API.
 *
 * Two playback modes exist:
 * - Streaming (`waitForComplete: false`): audio chunks are scheduled as they
 *   arrive and frames are drawn at `targetFps` once a small buffer exists.
 * - Complete: everything is collected first, the `SyncEngine` computes an
 *   alignment (when `enableSmartSync` is on), and audio/video are then started
 *   with the delays, FPS and frame map it returned.
 *
 * If the server sends `done` before the streaming buffer threshold was reached,
 * playback falls back to the complete path.
 */
export class AvatarStreamSDKSync {
  private config: Required<SyncedStreamConfig>;
  private ctx: CanvasRenderingContext2D;
  private syncEngine: SyncEngine;

  // WebSocket
  private ws: WebSocket | null = null;
  /** In-flight connect() attempt, shared by concurrent callers. */
  private pendingConnect: Promise<void> | null = null;

  // State
  private status: SyncedStreamStatus = 'disconnected';
  private startTime = 0;
  /** Bumped by cleanup(); async work captured under an older value is discarded. */
  private playbackEpoch = 0;

  // Collected data
  private frameBlobs: Blob[] = [];
  private frameImages: HTMLImageElement[] = [];
  /** Object URLs backing `frameImages`; revoked in cleanup(). */
  private frameImageUrls: string[] = [];
  private audioChunks: Uint8Array[] = [];
  private allAudioSamples: Float32Array | null = null;
  private totalAudioBytes = 0;
  private totalAudioSamples = 0;

  // Playback state
  private isPlaying = false;
  private currentFrameIndex = 0;
  private renderInterval: number | null = null;
  /** Timer that defers the start of synced video playback. */
  private videoStartTimer: number | null = null;
  private audioContext: AudioContext | null = null;
  private audioSource: AudioBufferSourceNode | null = null;

  // Sync results
  private syncResult: SyncResult | null = null;

  // Streaming mode (before all data has arrived)
  private streamingStarted = false;
  /** True once the server said `done` while streaming playback is still draining frames. */
  private streamEnded = false;
  private nextAudioTime = 0;
  private scheduledAudioChunks = 0;
  /** Streamed audio sources that have not finished yet, so stop() can silence them. */
  private scheduledSources = new Set<AudioBufferSourceNode>();
  /** Highest streaming frame index drawn so far; guards against out-of-order decodes. */
  private lastDrawnStreamingFrame = -1;

  // Heartbeat
  private heartbeatInterval: number | null = null;

  /**
   * @param config Stream configuration; optional fields are filled with defaults.
   * @throws Error when `wsUrl` or `canvas` is missing, or the canvas has no 2D context.
   */
  constructor(config: SyncedStreamConfig) {
    if (!config.wsUrl) throw new Error('wsUrl is required');
    if (!config.canvas) throw new Error('canvas is required');

    const ctx = config.canvas.getContext('2d');
    if (!ctx) throw new Error('Could not get 2D context from canvas');
    this.ctx = ctx;

    this.config = {
      wsUrl: config.wsUrl,
      canvas: config.canvas,
      targetFps: config.targetFps ?? 25,
      audioSampleRate: config.audioSampleRate ?? 24000,
      enableSmartSync: config.enableSmartSync ?? true,
      waitForComplete: config.waitForComplete ?? false,
      audioThreshold: config.audioThreshold ?? 0.02,
      videoThreshold: config.videoThreshold ?? 0.05,
      onStatusChange: config.onStatusChange ?? (() => {}),
      onMetricsUpdate: config.onMetricsUpdate ?? (() => {}),
      onSyncUpdate: config.onSyncUpdate ?? (() => {}),
      onError: config.onError ?? (() => {}),
      onComplete: config.onComplete ?? (() => {}),
    };

    this.syncEngine = new SyncEngine({
      audioSampleRate: this.config.audioSampleRate,
      videoFps: this.config.targetFps,
      audioThreshold: this.config.audioThreshold,
      videoThreshold: this.config.videoThreshold,
      onDebug: (info) => this.config.onSyncUpdate(info),
    });
  }

  // ==========================================================================
  // Public API
  // ==========================================================================

  /**
   * Opens the WebSocket. Resolves once it is open; rejects if the connection
   * fails or closes first. Returns immediately when already open, and shares
   * the in-flight attempt when called again while connecting.
   */
  async connect(): Promise<void> {
    if (this.ws?.readyState === WebSocket.OPEN) return;
    if (this.pendingConnect) return this.pendingConnect;

    const attempt = new Promise<void>((resolve, reject) => {
      this.setStatus('connecting');

      const socket = new WebSocket(this.config.wsUrl);
      socket.binaryType = 'arraybuffer';
      this.ws = socket;

      // Every handler checks `this.ws === socket`: after disconnect() or a
      // reconnect, late events from the old socket must not touch the
      // status or heartbeat of the current connection.
      socket.onopen = () => {
        if (this.ws !== socket) {
          socket.close();
          reject(new Error('WebSocket connection superseded'));
          return;
        }
        this.setStatus('connected');
        this.startHeartbeat();
        resolve();
      };

      socket.onmessage = (event: MessageEvent) => {
        if (this.ws !== socket) return;

        if (event.data instanceof ArrayBuffer) {
          this.handleBinaryMessage(event.data);
          return;
        }

        let parsed: unknown;
        try {
          parsed = JSON.parse(event.data);
        } catch (e) {
          this.config.onError(new Error(`Invalid JSON: ${e}`));
          return;
        }
        this.handleJsonMessage(parsed);
      };

      socket.onclose = () => {
        // No-op when the promise already settled; otherwise guarantees that
        // connect() never hangs on a socket that closed before opening.
        reject(new Error('WebSocket closed before opening'));
        if (this.ws !== socket) return;
        this.setStatus('disconnected');
        this.stopHeartbeat();
      };

      socket.onerror = () => {
        const error = new Error('WebSocket connection failed');
        if (this.ws === socket) this.config.onError(error);
        reject(error);
      };
    });

    this.pendingConnect = attempt;
    const clearPending = (): void => {
      if (this.pendingConnect === attempt) this.pendingConnect = null;
    };
    attempt.then(clearPending, clearPending);

    return attempt;
  }

  /**
   * Stops playback, closes the socket and releases the audio context.
   * Emits a single `'disconnected'` status.
   */
  disconnect(): void {
    const socket = this.ws;
    // Detach first so the socket's own close/error events are treated as stale.
    this.ws = null;

    if (socket?.readyState === WebSocket.OPEN) {
      socket.send(JSON.stringify({ action: 'stop' }));
    }

    this.cleanup();
    this.stopHeartbeat();
    socket?.close();

    if (this.audioContext) {
      const audioContext = this.audioContext;
      this.audioContext = null;
      // Best effort: the context may already be closed.
      void audioContext.close().catch(() => {});
    }

    this.setStatus('disconnected');
  }

  /**
   * Asks the server to generate speech for `text` and starts receiving.
   *
   * @param text Text to speak; sent trimmed.
   * @param voice Voice identifier (default: `'tara'`).
   * @throws Error when the WebSocket is not open.
   */
  async generate(text: string, voice: string = 'tara'): Promise<void> {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      throw new Error('WebSocket not connected');
    }

    this.resetState();
    this.startTime = Date.now();
    this.setStatus('receiving');

    this.ws.send(JSON.stringify({
      action: 'generate',
      text: text.trim(),
      voice,
    }));
  }

  /**
   * Tells the server to stop (when connected) and halts local playback,
   * including audio that was already scheduled.
   */
  stop(): void {
    const isOpen = this.ws?.readyState === WebSocket.OPEN;
    if (isOpen) {
      this.ws?.send(JSON.stringify({ action: 'stop' }));
    }
    this.cleanup();
    // Report the real connection state instead of always claiming 'connected'.
    this.setStatus(isOpen ? 'connected' : 'disconnected');
  }

  /** Returns a snapshot of the current reception/playback counters. */
  getMetrics(): SyncedStreamMetrics {
    const videoDurationMs = (this.frameBlobs.length / this.config.targetFps) * 1000;
    const audioDurationMs = (this.totalAudioSamples / this.config.audioSampleRate) * 1000;

    return {
      framesReceived: this.frameBlobs.length,
      framesRendered: this.currentFrameIndex,
      audioChunksReceived: this.audioChunks.length,
      totalAudioSamples: this.totalAudioSamples,
      videoDurationMs,
      audioDurationMs,
      originalFps: this.config.targetFps,
      adjustedFps: this.syncResult?.adjustedFps ?? this.config.targetFps,
      syncDiffMs: audioDurationMs - videoDurationMs,
      audioSpeechStartMs: this.syncResult?.debug.audioStartMs ?? null,
      audioSpeechEndMs: this.syncResult?.debug.audioEndMs ?? null,
      videoSpeechStartFrame: this.syncResult?.debug.videoStartFrame ?? null,
      videoSpeechEndFrame: this.syncResult?.debug.videoEndFrame ?? null,
    };
  }

  // ==========================================================================
  // Message Handling
  // ==========================================================================

  /**
   * Decodes one binary message and dispatches it by its type byte.
   * Malformed messages are reported through `onError` and dropped instead of
   * throwing out of the socket handler. Unknown types are ignored.
   */
  private handleBinaryMessage(buffer: ArrayBuffer): void {
    if (buffer.byteLength < 1) {
      this.config.onError(new Error('Malformed binary message: empty payload'));
      return;
    }

    const view = new DataView(buffer);
    const msgType = view.getUint8(0);

    if (msgType === MSG_FRAME) {
      // [type:u8][frameIndex:u32][dataSize:u32][payload]
      const frameData = this.readPayload(buffer, view, 5, 9);
      if (!frameData) return;
      this.handleFrame(frameData, view.getUint32(1, true));
    } else if (msgType === MSG_AUDIO_CHUNK) {
      // [type:u8][chunkIndex:u32][isLast:u8][dataSize:u32][payload]
      const chunkData = this.readPayload(buffer, view, 6, 10);
      if (!chunkData) return;
      this.handleAudioChunk(chunkData, view.getUint32(1, true), view.getUint8(5) === 1);
    } else if (msgType === MSG_AUDIO) {
      // [type:u8][sampleRate:u32][dataSize:u32][payload]
      const audioData = this.readPayload(buffer, view, 5, 9);
      if (!audioData) return;
      this.handleFullAudio(audioData, view.getUint32(1, true));
    }

    this.emitMetrics();
  }

  /**
   * Validates the header and declared payload size of a binary message and
   * returns a view over the payload, or `null` (after reporting via `onError`)
   * when the message is shorter than it claims.
   *
   * @param sizeOffset Byte offset of the little-endian u32 payload size.
   * @param headerSize Total header length; the payload starts right after it.
   */
  private readPayload(
    buffer: ArrayBuffer,
    view: DataView,
    sizeOffset: number,
    headerSize: number
  ): Uint8Array | null {
    if (view.byteLength < headerSize) {
      this.config.onError(
        new Error(`Malformed binary message: header needs ${headerSize} bytes, got ${view.byteLength}`)
      );
      return null;
    }

    const dataSize = view.getUint32(sizeOffset, true);
    const available = view.byteLength - headerSize;
    if (dataSize > available) {
      this.config.onError(
        new Error(`Malformed binary message: declares ${dataSize} payload bytes, only ${available} present`)
      );
      return null;
    }

    return new Uint8Array(buffer, headerSize, dataSize);
  }

  /** Handles a parsed JSON control message (`done` or `error`); anything else is ignored. */
  private handleJsonMessage(data: unknown): void {
    if (typeof data !== 'object' || data === null) return;
    const message = data as { type?: unknown; message?: unknown };

    if (message.type === 'done') {
      // onStreamComplete handles its own failures; nothing to await here.
      void this.onStreamComplete();
    } else if (message.type === 'error') {
      const text = typeof message.message === 'string' ? message.message : 'Unknown server error';
      this.config.onError(new Error(text));
      this.setStatus('error');
    }
  }

  /** Stores a received JPEG frame and, in streaming mode, checks whether playback can start. */
  private handleFrame(frameData: Uint8Array, _frameIndex: number): void {
    const blob = new Blob([frameData], { type: 'image/jpeg' });
    this.frameBlobs.push(blob);

    // When not waiting for the full stream, start streaming playback.
    if (!this.config.waitForComplete && !this.streamingStarted) {
      this.tryStartStreaming();
    }
  }

  /** Stores a received PCM chunk and, in streaming mode, schedules it for playback. */
  private handleAudioChunk(chunkData: Uint8Array, _chunkIndex: number, _isLast: boolean): void {
    if (chunkData.length === 0) return;

    // Copy: the incoming view aliases the WebSocket message buffer.
    const chunk = new Uint8Array(chunkData);
    this.audioChunks.push(chunk);

    // Count samples from the running byte total so a 16-bit sample split
    // across two chunks is still counted once both halves have arrived.
    this.totalAudioBytes += chunk.length;
    this.totalAudioSamples = Math.floor(this.totalAudioBytes / 2);

    // Streaming mode: schedule the audio immediately.
    if (!this.config.waitForComplete) {
      this.scheduleAudioChunk(chunk);
    }
  }

  /**
   * Replaces any collected chunks with a complete audio payload.
   * NOTE(review): the sample rate sent by the server is ignored; playback
   * always uses `config.audioSampleRate` — confirm the two always match.
   */
  private handleFullAudio(audioData: Uint8Array, _sampleRate: number): void {
    this.audioChunks = [new Uint8Array(audioData)];
    this.totalAudioBytes = audioData.length;
    this.totalAudioSamples = Math.floor(audioData.length / 2);
  }

  /**
   * Converts 16-bit PCM bytes (platform byte order, as `Int16Array` reads
   * them) to floats in [-1, 1). A trailing odd byte is dropped, because half
   * a sample cannot be decoded.
   */
  private static pcm16ToFloat32(bytes: Uint8Array): Float32Array {
    const sampleCount = Math.floor(bytes.length / 2);

    // Copy into a fresh buffer: Int16Array requires an even byte offset and
    // length, which an arbitrary view does not guarantee.
    const aligned = new Uint8Array(sampleCount * 2);
    aligned.set(bytes.subarray(0, sampleCount * 2));
    const pcm = new Int16Array(aligned.buffer);

    const floats = new Float32Array(sampleCount);
    for (let i = 0; i < sampleCount; i++) {
      floats[i] = pcm[i] / 32768;
    }
    return floats;
  }

  // ==========================================================================
  // Streaming Mode (before everything has arrived)
  // ==========================================================================

  /** Starts streaming playback once at least 10 frames and 2 audio chunks are buffered. */
  private tryStartStreaming(): void {
    if (this.frameBlobs.length >= 10 && this.scheduledAudioChunks >= 2 && !this.streamingStarted) {
      this.streamingStarted = true;
      this.setStatus('playing');
      this.startStreamingPlayback();
    }
  }

  /** Returns the shared AudioContext, creating it at the configured sample rate on first use. */
  private ensureAudioContext(): AudioContext {
    if (!this.audioContext) {
      this.audioContext = new (window.AudioContext || (window as any).webkitAudioContext)({
        sampleRate: this.config.audioSampleRate,
      });
    }
    return this.audioContext;
  }

  /**
   * Schedules one PCM chunk back-to-back after the previously scheduled one.
   * NOTE(review): audio starts as soon as chunks arrive, while video waits for
   * the tryStartStreaming() threshold, so streaming mode can begin with audio
   * ahead of video.
   */
  private scheduleAudioChunk(chunkData: Uint8Array): void {
    const contextWasMissing = this.audioContext === null;
    const audioContext = this.ensureAudioContext();
    if (contextWasMissing) {
      this.nextAudioTime = audioContext.currentTime + 0.05;
    }

    try {
      const floatSamples = AvatarStreamSDKSync.pcm16ToFloat32(chunkData);
      // createBuffer rejects zero-length buffers (e.g. a 1-byte chunk).
      if (floatSamples.length === 0) return;

      const chunkBuffer = audioContext.createBuffer(1, floatSamples.length, this.config.audioSampleRate);
      chunkBuffer.getChannelData(0).set(floatSamples);

      const source = audioContext.createBufferSource();
      source.buffer = chunkBuffer;
      source.connect(audioContext.destination);

      // Track the source until it ends so stop() can silence pending audio.
      this.scheduledSources.add(source);
      source.onended = () => {
        this.scheduledSources.delete(source);
      };

      // If scheduling fell behind the clock, restart slightly in the future.
      if (this.nextAudioTime < audioContext.currentTime) {
        this.nextAudioTime = audioContext.currentTime + 0.01;
      }

      source.start(this.nextAudioTime);
      this.nextAudioTime += floatSamples.length / this.config.audioSampleRate;
      this.scheduledAudioChunks++;

      this.tryStartStreaming();
    } catch (e) {
      console.error('Error scheduling audio:', e);
    }
  }

  /** Stops and forgets every streamed audio source that has not finished playing. */
  private stopScheduledAudio(): void {
    for (const source of this.scheduledSources) {
      source.onended = null;
      try {
        source.stop();
      } catch {
        // Already stopped or never started; nothing to do.
      }
      source.disconnect();
    }
    this.scheduledSources.clear();
    this.nextAudioTime = 0;
  }

  /** Starts drawing buffered frames at the plain target FPS. */
  private startStreamingPlayback(): void {
    const frameInterval = 1000 / this.config.targetFps;

    this.renderInterval = window.setInterval(() => {
      this.renderNextStreamingFrame();
    }, frameInterval);
  }

  /**
   * Draws the next buffered frame. When no frame is buffered it waits for
   * more, unless the server already signalled `done`, in which case playback
   * is finished.
   */
  private renderNextStreamingFrame(): void {
    if (this.currentFrameIndex >= this.frameBlobs.length) {
      if (this.streamEnded) {
        this.finishPlayback();
      }
      return;
    }

    const frameIndex = this.currentFrameIndex;
    const epoch = this.playbackEpoch;
    const url = URL.createObjectURL(this.frameBlobs[frameIndex]);
    const img = new Image();

    img.onload = () => {
      URL.revokeObjectURL(url);
      // Skip frames decoded after a stop/reset, or after a newer frame was drawn.
      if (epoch !== this.playbackEpoch || frameIndex <= this.lastDrawnStreamingFrame) return;
      this.lastDrawnStreamingFrame = frameIndex;
      this.drawFrame(img);
    };
    img.onerror = () => {
      // Undecodable frame: skip it, but do not leak its object URL.
      URL.revokeObjectURL(url);
    };
    img.src = url;

    this.currentFrameIndex++;
    this.emitMetrics();
  }

  /** Draws `img` at the canvas origin, resizing the canvas when either dimension differs. */
  private drawFrame(img: HTMLImageElement): void {
    const canvas = this.config.canvas;
    if (canvas.width !== img.width || canvas.height !== img.height) {
      canvas.width = img.width;
      canvas.height = img.height;
    }
    this.ctx.drawImage(img, 0, 0);
  }

  // ==========================================================================
  // Complete Mode (after everything has arrived - precise synchronization)
  // ==========================================================================

  /**
   * Handles the server's `done` message.
   *
   * While streaming, it only marks the stream as ended so the render loop can
   * drain the frames still buffered. Otherwise it runs the complete path:
   * decode audio and frames, compute sync, then start synced playback.
   */
  private async onStreamComplete(): Promise<void> {
    if (this.streamingStarted) {
      this.streamEnded = true;
      if (this.currentFrameIndex >= this.frameBlobs.length) {
        this.finishPlayback();
      }
      return;
    }

    const epoch = this.playbackEpoch;

    // Chunks may already be playing if the streaming threshold was never
    // reached; silence them so they do not overlap the full-audio playback.
    this.stopScheduledAudio();

    this.setStatus('analyzing');

    try {
      this.prepareFullAudio();
      const images = await this.prepareFrameImages();
      // stop()/generate() ran while decoding: abandon this run.
      if (epoch !== this.playbackEpoch) return;
      this.frameImages = images;

      if (this.config.enableSmartSync && this.allAudioSamples && this.frameImages.length > 0) {
        this.setStatus('syncing');

        const result = await this.syncEngine.calculateSync(
          this.allAudioSamples,
          this.frameImages,
          (this.totalAudioSamples / this.config.audioSampleRate) * 1000
        );
        if (epoch !== this.playbackEpoch) return;
        this.syncResult = result;

        console.log('Sync result:', this.syncResult.debug);
      }

      this.setStatus('playing');
      this.startSyncedPlayback();
    } catch (e) {
      // Failures caused by an intervening stop/reset are expected; stay quiet.
      if (epoch !== this.playbackEpoch) return;
      console.error('Sync error:', e);
      this.config.onError(e instanceof Error ? e : new Error(String(e)));
      this.setStatus('error');
    }
  }

  /** Concatenates all received chunks and decodes them into `allAudioSamples`. */
  private prepareFullAudio(): void {
    if (this.audioChunks.length === 0) return;

    const totalLength = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const combined = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of this.audioChunks) {
      combined.set(chunk, offset);
      offset += chunk.length;
    }

    this.allAudioSamples = AvatarStreamSDKSync.pcm16ToFloat32(combined);
  }

  /**
   * Decodes every received frame into an image, preserving frame order.
   * Rejects if any frame cannot be decoded. The object URLs are kept until
   * cleanup() so the images stay valid during analysis and playback.
   */
  private async prepareFrameImages(): Promise<HTMLImageElement[]> {
    this.releaseFrameImageUrls();
    return Promise.all(this.frameBlobs.map((blob, index) => this.loadFrameImage(blob, index)));
  }

  /** Loads one frame blob into an image element via a tracked object URL. */
  private loadFrameImage(blob: Blob, index: number): Promise<HTMLImageElement> {
    const url = URL.createObjectURL(blob);
    this.frameImageUrls.push(url);

    return new Promise<HTMLImageElement>((resolve, reject) => {
      const img = new Image();
      img.onload = () => resolve(img);
      img.onerror = () => reject(new Error(`Failed to decode frame ${index}`));
      img.src = url;
    });
  }

  /** Revokes the object URLs created for the decoded frame images. */
  private releaseFrameImageUrls(): void {
    for (const url of this.frameImageUrls) {
      URL.revokeObjectURL(url);
    }
    this.frameImageUrls = [];
  }

  /**
   * Starts the full audio buffer and, after the computed video delay, the
   * video loop. Video still plays when no audio was received.
   */
  private startSyncedPlayback(): void {
    // Negative delays are clamped: start times in the past are invalid.
    const audioDelayMs = Math.max(0, this.syncResult?.audioDelayMs ?? 0);
    const videoDelayMs = Math.max(0, this.syncResult?.videoDelayMs ?? 0);

    if (this.allAudioSamples && this.allAudioSamples.length > 0) {
      const audioContext = this.ensureAudioContext();

      const audioBuffer = audioContext.createBuffer(
        1,
        this.allAudioSamples.length,
        this.config.audioSampleRate
      );
      audioBuffer.getChannelData(0).set(this.allAudioSamples);

      this.audioSource = audioContext.createBufferSource();
      this.audioSource.buffer = audioBuffer;
      this.audioSource.connect(audioContext.destination);
      this.audioSource.start(audioContext.currentTime + audioDelayMs / 1000);
    }

    // Tracked so that stop() during the delay cancels the pending video start.
    this.videoStartTimer = window.setTimeout(() => {
      this.videoStartTimer = null;
      this.startSyncedVideoPlayback();
    }, videoDelayMs);
  }

  /** Runs the render loop for synced playback, following the frame map when one exists. */
  private startSyncedVideoPlayback(): void {
    const adjustedFps = this.syncResult?.adjustedFps;
    // Fall back to the target FPS when the sync result is missing or unusable.
    const fps =
      typeof adjustedFps === 'number' && Number.isFinite(adjustedFps) && adjustedFps > 0
        ? adjustedFps
        : this.config.targetFps;
    const frameInterval = 1000 / fps;
    const frameMap = this.syncResult?.frameMap;

    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
    }

    this.currentFrameIndex = 0;
    this.isPlaying = true;

    this.renderInterval = window.setInterval(() => {
      if (!this.isPlaying) {
        this.finishPlayback();
        return;
      }

      // Use the frame map when available.
      let frameToRender: number;
      if (frameMap && this.currentFrameIndex < frameMap.length) {
        frameToRender = frameMap[this.currentFrameIndex];
      } else if (this.currentFrameIndex < this.frameImages.length) {
        frameToRender = this.currentFrameIndex;
      } else {
        this.finishPlayback();
        return;
      }

      // Skip (rather than throw on) indices that point outside the decoded frames.
      const img = this.frameImages[frameToRender];
      if (img) {
        this.drawFrame(img);
      }

      this.currentFrameIndex++;
      this.emitMetrics();
    }, frameInterval);
  }

  /** Stops the render loop, reports the summary via `onComplete` and sets status `'completed'`. */
  private finishPlayback(): void {
    this.isPlaying = false;

    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
      this.renderInterval = null;
    }

    const metrics = this.getMetrics();

    const summary: SyncedStreamSummary = {
      totalFrames: this.currentFrameIndex,
      totalDurationMs: Date.now() - this.startTime,
      originalFps: this.config.targetFps,
      adjustedFps: this.syncResult?.adjustedFps ?? this.config.targetFps,
      syncStrategy: this.syncResult?.debug.syncStrategy ?? 'normal',
      audioSpeechDurationMs: this.syncResult?.debug.audioSpeechDuration ?? metrics.audioDurationMs,
      videoSpeechDurationMs: ((this.syncResult?.debug.videoSpeechFrames ?? this.frameBlobs.length) / this.config.targetFps) * 1000,
      finalSyncDiffMs: metrics.syncDiffMs,
    };

    this.config.onComplete(summary);
    this.setStatus('completed');
  }

  // ==========================================================================
  // Utilities
  // ==========================================================================

  /** Records the new status and notifies `onStatusChange`. */
  private setStatus(status: SyncedStreamStatus): void {
    this.status = status;
    this.config.onStatusChange(status);
  }

  /** Pushes a fresh metrics snapshot to `onMetricsUpdate`. */
  private emitMetrics(): void {
    this.config.onMetricsUpdate(this.getMetrics());
  }

  /** Stops playback and clears everything collected for the previous generation. */
  private resetState(): void {
    this.cleanup();
    this.frameBlobs = [];
    this.frameImages = [];
    this.audioChunks = [];
    this.allAudioSamples = null;
    this.totalAudioBytes = 0;
    this.totalAudioSamples = 0;
    this.currentFrameIndex = 0;
    this.syncResult = null;
    this.streamingStarted = false;
    this.streamEnded = false;
    this.nextAudioTime = 0;
    this.scheduledAudioChunks = 0;
    this.lastDrawnStreamingFrame = -1;
  }

  /**
   * Halts all playback activity: render loop, pending video start, full and
   * streamed audio. Also invalidates in-flight async work and releases frame
   * object URLs. Collected data is left in place.
   */
  private cleanup(): void {
    this.isPlaying = false;
    this.playbackEpoch++;

    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
      this.renderInterval = null;
    }
    if (this.videoStartTimer !== null) {
      clearTimeout(this.videoStartTimer);
      this.videoStartTimer = null;
    }
    if (this.audioSource) {
      try {
        this.audioSource.stop();
      } catch {
        // Source was never started or has already stopped.
      }
      this.audioSource = null;
    }

    this.stopScheduledAudio();
    this.releaseFrameImageUrls();
  }

  /** Sends a `ping` every 30 seconds while the socket is open. */
  private startHeartbeat(): void {
    // Never leave a previous timer running when a new connection opens.
    this.stopHeartbeat();
    this.heartbeatInterval = window.setInterval(() => {
      if (this.ws?.readyState === WebSocket.OPEN) {
        this.ws.send(JSON.stringify({ action: 'ping' }));
      }
    }, 30000);
  }

  /** Cancels the heartbeat timer, if any. */
  private stopHeartbeat(): void {
    if (this.heartbeatInterval !== null) {
      clearInterval(this.heartbeatInterval);
      this.heartbeatInterval = null;
    }
  }
}
// Also expose the class as the module's default export, alongside the named export.
export default AvatarStreamSDKSync;