speech2speech-interface / interface /sdk /src /AvatarStreamSDK.ts
marcosremar2's picture
Add SDK and improve video transition synchronization
3acaae2
Raw
History Blame Contribute Delete
19.6 kB
/**
* Avatar Stream SDK
*
* SDK for synchronized real-time audio and video streaming.
* Receives JPEG frames and PCM audio chunks over a binary WebSocket.
*
* @example
* ```typescript
* const avatar = new AvatarStreamSDK({
* wsUrl: 'ws://localhost:8080/ws',
* canvas: document.getElementById('canvas') as HTMLCanvasElement,
* onStatusChange: (status) => console.log('Status:', status),
* onMetricsUpdate: (metrics) => console.log('Metrics:', metrics),
* });
*
* await avatar.connect();
* await avatar.generate('Hello world!', 'tara');
* ```
*/
// ============================================================================
// Types
// ============================================================================
/**
 * Options accepted by the `AvatarStreamSDK` constructor.
 * `wsUrl` and `canvas` are mandatory; every other field is optional.
 */
export interface AvatarStreamConfig {
/** WebSocket URL of the server */
wsUrl: string;
/** Canvas on which the frames are rendered */
canvas: HTMLCanvasElement;
/** Target FPS for rendering (default: 25) */
targetFps?: number;
/** Audio sample rate (default: 24000) */
audioSampleRate?: number;
/** Minimum number of frames before playback starts (default: 10) */
minFramesBuffer?: number;
/** Minimum number of audio chunks before playback starts (default: 2) */
minAudioChunks?: number;
/** Callback invoked when the status changes */
onStatusChange?: (status: StreamStatus) => void;
/** Callback invoked when the metrics are updated */
onMetricsUpdate?: (metrics: StreamMetrics) => void;
/** Callback invoked when an error occurs */
onError?: (error: Error) => void;
/** Callback invoked when the stream finishes */
onComplete?: (summary: StreamSummary) => void;
/** Callback invoked when the first frame is received */
onFirstFrame?: (latencyMs: number) => void;
/** Callback invoked when playback starts */
onPlaybackStart?: (latencyMs: number) => void;
}
/**
 * Lifecycle states of the SDK, as passed to `onStatusChange` and returned by
 * `getStatus()`.
 */
export type StreamStatus =
| 'disconnected'
| 'connecting'
| 'connected'
| 'generating'
| 'buffering'
| 'playing'
| 'completed'
| 'error';
/** Snapshot of stream statistics, as passed to `onMetricsUpdate` and returned by `getMetrics()`. */
export interface StreamMetrics {
/** Frames received from the server */
framesReceived: number;
/** Frames rendered on the canvas */
framesRendered: number;
/** Frames in the buffer waiting to be rendered */
bufferSize: number;
/** Total bytes received */
bytesReceived: number;
/** Video duration in seconds (frames / fps) */
videoDuration: number;
/** Audio duration in seconds (samples / sampleRate) */
audioDuration: number;
/** Difference between audio and video (audioDuration - videoDuration) */
syncDiff: number;
/** Current rendering FPS */
currentFps: number;
/** Bandwidth in KB/s */
bandwidthKBps: number;
/** Audio chunks received */
audioChunksReceived: number;
/** Latency until the first frame (ms) */
firstFrameLatencyMs: number | null;
/** Latency until playback starts (ms) */
playbackLatencyMs: number | null;
}
/** Final statistics for one stream, as passed to `onComplete`. */
export interface StreamSummary {
/** Total frames rendered */
totalFrames: number;
/** Total duration in seconds */
durationSeconds: number;
/** Total bytes transferred */
totalBytes: number;
/** Video duration */
videoDuration: number;
/** Audio duration */
audioDuration: number;
/** Final synchronization difference */
finalSyncDiff: number;
}
/** One received video frame waiting to be drawn. */
interface FrameData {
/** Object URL created for the frame's JPEG blob */
url: string;
/** Frame index read from the binary message header */
index: number;
}
// ============================================================================
// Constants
// ============================================================================
/** Binary message types (must match the server); the type is the first byte of every binary message */
// Frame: type (u8), frame index (u32 LE), data size (u32 LE), then the JPEG payload.
const MSG_FRAME = 0x01;
// Complete audio: type (u8), sample rate (u32 LE), data size (u32 LE), then the PCM payload.
const MSG_AUDIO = 0x02;
// Audio chunk: type (u8), chunk index (u32 LE), is-last flag (u8), data size (u32 LE), then the PCM payload.
const MSG_AUDIO_CHUNK = 0x03;
// ============================================================================
// SDK Class
// ============================================================================
/** Audio source node as produced by `AudioContext.createBufferSource()`. */
type ScheduledAudioSource = ReturnType<AudioContext['createBufferSource']>;

/**
 * Client for a synchronized avatar stream.
 *
 * Opens a WebSocket to the server, receives JPEG frames and 16-bit PCM audio
 * as binary messages, draws the frames on a canvas at a fixed rate and plays
 * the audio through the Web Audio API.
 *
 * Typical usage: `await connect()`, then `await generate(text, voice)`;
 * `stop()` aborts the current generation and `disconnect()` closes the socket.
 */
export class AvatarStreamSDK {
  // Configuration (every optional field is resolved to a default in the constructor)
  private config: Required<AvatarStreamConfig>;
  private ctx: CanvasRenderingContext2D;

  // WebSocket
  private ws: WebSocket | null = null;
  /** Promise of the connection attempt currently in flight, if any. */
  private pendingConnect: Promise<void> | null = null;

  // State
  private status: StreamStatus = 'disconnected';
  private startTime = 0;
  private streamDone = false;
  private playbackStarted = false;
  /** True from `generate()` until the generation is stopped, reset or fails. */
  private generationActive = false;
  /** Bumped on every cleanup so callbacks from an older generation can detect they are stale. */
  private sessionId = 0;

  // Frame management
  private allFrames: FrameData[] = [];
  private frameQueue: FrameData[] = [];
  private renderedFrames = 0;
  private renderInterval: number | null = null;
  private lastRenderTime = 0;
  /** FPS derived from the spacing of the two most recent rendered frames. */
  private measuredFps = 0;
  private totalBytes = 0;

  // Audio management
  private audioContext: AudioContext | null = null;
  private audioChunkCount = 0;
  private audioChunksComplete = false;
  private nextAudioTime = 0;
  private audioScheduledChunks = 0;
  private totalAudioSamples = 0;
  /** Sample rate used for duration metrics; replaced by the rate announced in a complete-audio message. */
  private streamSampleRate: number;
  /** Sources that have been started and have not ended yet, so `stop()` can silence them. */
  private activeSources = new Set<ScheduledAudioSource>();

  // Metrics
  private firstFrameLatencyMs: number | null = null;
  private playbackLatencyMs: number | null = null;

  // Heartbeat
  private heartbeatInterval: number | null = null;

  /**
   * @param config Connection, rendering and callback options.
   * @throws Error if `wsUrl` or `canvas` is missing, or the canvas has no 2D context.
   */
  constructor(config: AvatarStreamConfig) {
    if (!config.wsUrl) throw new Error('wsUrl is required');
    if (!config.canvas) throw new Error('canvas is required');
    const ctx = config.canvas.getContext('2d');
    if (!ctx) throw new Error('Could not get 2D context from canvas');
    this.ctx = ctx;

    this.config = {
      wsUrl: config.wsUrl,
      canvas: config.canvas,
      targetFps: config.targetFps ?? 25,
      audioSampleRate: config.audioSampleRate ?? 24000,
      minFramesBuffer: config.minFramesBuffer ?? 10,
      minAudioChunks: config.minAudioChunks ?? 2,
      onStatusChange: config.onStatusChange ?? (() => {}),
      onMetricsUpdate: config.onMetricsUpdate ?? (() => {}),
      onError: config.onError ?? (() => {}),
      onComplete: config.onComplete ?? (() => {}),
      onFirstFrame: config.onFirstFrame ?? (() => {}),
      onPlaybackStart: config.onPlaybackStart ?? (() => {}),
    };
    this.streamSampleRate = this.config.audioSampleRate;
  }

  // ==========================================================================
  // Public API
  // ==========================================================================

  /**
   * Connects to the WebSocket server.
   *
   * Resolves once the socket is open. Returns immediately when already
   * connected, and joins the attempt in flight when one is still connecting
   * instead of opening a second socket.
   *
   * @returns A promise that rejects if the connection fails or is closed
   * before it opens.
   */
  async connect(): Promise<void> {
    const existing = this.ws;
    if (existing) {
      if (existing.readyState === WebSocket.OPEN) return;
      if (existing.readyState === WebSocket.CONNECTING && this.pendingConnect) {
        return this.pendingConnect;
      }
    }

    const attempt = new Promise<void>((resolve, reject) => {
      this.setStatus('connecting');

      let socket: WebSocket;
      try {
        socket = new WebSocket(this.config.wsUrl);
      } catch (e) {
        // The constructor throws synchronously, e.g. for a malformed URL.
        const error = e instanceof Error ? e : new Error(String(e));
        this.setStatus('disconnected');
        this.config.onError(error);
        reject(error);
        return;
      }
      socket.binaryType = 'arraybuffer';
      this.ws = socket;

      // Every handler checks `this.ws === socket`: once the socket has been
      // replaced or released by disconnect(), its late events must not touch
      // the state that now belongs to a newer connection.
      socket.onopen = () => {
        if (this.ws !== socket) return;
        this.pendingConnect = null;
        this.setStatus('connected');
        this.startHeartbeat();
        resolve();
      };

      socket.onmessage = (event) => {
        if (this.ws !== socket) return;
        const payload: unknown = event.data;
        if (payload instanceof ArrayBuffer) {
          this.handleBinaryMessage(payload);
          return;
        }
        let parsed: unknown;
        try {
          parsed = JSON.parse(String(payload));
        } catch (e) {
          this.config.onError(new Error(`Invalid JSON: ${e}`));
          return;
        }
        this.handleJsonMessage(parsed);
      };

      socket.onclose = () => {
        // No-op when the promise has already settled.
        reject(new Error('WebSocket closed before the connection was established'));
        if (this.ws !== socket) return;
        this.ws = null;
        this.pendingConnect = null;
        this.stopHeartbeat();
        // No more data can arrive: mark the stream as finished so a running
        // render loop drains its queue and stops instead of ticking forever.
        if (this.generationActive && !this.streamDone) {
          this.streamDone = true;
        }
        this.setStatus('disconnected');
      };

      socket.onerror = () => {
        const error = new Error('WebSocket connection failed');
        if (this.ws === socket) this.config.onError(error);
        reject(error);
      };
    });

    this.pendingConnect = attempt;
    return attempt;
  }

  /**
   * Disconnects from the server: aborts any generation, stops the heartbeat,
   * closes the socket and releases the audio context.
   */
  disconnect(): void {
    const socket = this.ws;
    // Detach first so the socket's own close event is ignored by its handlers.
    this.ws = null;
    this.pendingConnect = null;
    this.stopHeartbeat();

    if (socket) {
      if (socket.readyState === WebSocket.OPEN) {
        socket.send(JSON.stringify({ action: 'stop' }));
      }
      socket.close();
    }

    this.cleanup();
    if (this.audioContext) {
      void this.audioContext.close().catch(() => {});
      this.audioContext = null;
    }
    this.setStatus('disconnected');
  }

  /**
   * Requests an avatar speaking the given text.
   *
   * Resets all stream state and metrics, then sends the request. Frames and
   * audio arrive asynchronously and are reported through the callbacks.
   *
   * @param text Text to speak; surrounding whitespace is trimmed.
   * @param voice Voice identifier sent to the server.
   * @throws Error if the socket is not open or the text is empty.
   */
  async generate(text: string, voice: string = 'tara'): Promise<void> {
    const socket = this.ws;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      throw new Error('WebSocket not connected. Call connect() first.');
    }
    const trimmed = text.trim();
    if (!trimmed) {
      throw new Error('Text cannot be empty');
    }

    this.resetState();
    this.generationActive = true;
    this.startTime = Date.now();
    this.setStatus('generating');

    socket.send(JSON.stringify({
      action: 'generate',
      text: trimmed,
      voice,
    }));
  }

  /**
   * Stops the current generation: tells the server to stop (when connected),
   * halts rendering, silences scheduled audio and releases pending frames.
   * The status becomes `'connected'` only if the socket is actually open.
   */
  stop(): void {
    const socket = this.ws;
    const isOpen = socket !== null && socket.readyState === WebSocket.OPEN;
    if (socket && isOpen) {
      socket.send(JSON.stringify({ action: 'stop' }));
    }
    this.cleanup();
    if (isOpen) {
      this.setStatus('connected');
    } else if (this.status !== 'connecting') {
      this.setStatus('disconnected');
    }
  }

  /** Returns the current status. */
  getStatus(): StreamStatus {
    return this.status;
  }

  /** Returns a snapshot of the current metrics. */
  getMetrics(): StreamMetrics {
    // Before the first generate() there is no start time to measure from.
    const elapsed = this.startTime > 0 ? (Date.now() - this.startTime) / 1000 : 0;
    const videoDuration = this.allFrames.length / this.config.targetFps;
    const audioDuration = this.totalAudioSamples / this.streamSampleRate;
    return {
      framesReceived: this.allFrames.length,
      framesRendered: this.renderedFrames,
      bufferSize: this.frameQueue.length,
      bytesReceived: this.totalBytes,
      videoDuration,
      audioDuration,
      syncDiff: audioDuration - videoDuration,
      currentFps: this.measuredFps,
      bandwidthKBps: elapsed > 0 ? (this.totalBytes / 1024 / elapsed) : 0,
      audioChunksReceived: this.audioChunkCount,
      firstFrameLatencyMs: this.firstFrameLatencyMs,
      playbackLatencyMs: this.playbackLatencyMs,
    };
  }

  /** Returns whether the socket is open. */
  isConnected(): boolean {
    return this.ws !== null && this.ws.readyState === WebSocket.OPEN;
  }

  /**
   * Returns whether frames are currently being rendered, i.e. the render loop
   * is running. This stays true after the server has finished sending, until
   * the buffered frames have been played out.
   */
  isPlaying(): boolean {
    return this.renderInterval !== null;
  }

  // ==========================================================================
  // Private Methods - Message Handling
  // ==========================================================================

  /**
   * Decodes one binary message and dispatches it by its type byte.
   * Messages that arrive while no generation is active are dropped; messages
   * whose header or declared payload size does not fit the buffer are
   * reported through `onError` instead of throwing.
   */
  private handleBinaryMessage(buffer: ArrayBuffer): void {
    // Late data from a stopped or failed generation would otherwise create
    // object URLs that nothing ever renders or releases.
    if (!this.generationActive) return;

    this.totalBytes += buffer.byteLength;
    if (buffer.byteLength < 1) {
      this.reportMalformed('empty message');
      return;
    }

    const view = new DataView(buffer);
    const msgType = view.getUint8(0);

    if (msgType === MSG_FRAME) {
      const frameData = this.readPayload(buffer, view, 5, 9);
      if (frameData) this.handleFrame(frameData, view.getUint32(1, true));
    } else if (msgType === MSG_AUDIO) {
      const audioData = this.readPayload(buffer, view, 5, 9);
      if (audioData) this.handleAudioComplete(audioData, view.getUint32(1, true));
    } else if (msgType === MSG_AUDIO_CHUNK) {
      const chunkData = this.readPayload(buffer, view, 6, 10);
      if (chunkData) {
        this.handleAudioChunk(chunkData, view.getUint32(1, true), view.getUint8(5) === 1);
      }
    }
    this.emitMetrics();
  }

  /**
   * Returns a view of the payload that follows a header of `headerLength`
   * bytes, whose size is the little-endian u32 at `sizeOffset`; returns
   * `null` (after reporting the problem) when the message is too short.
   */
  private readPayload(
    buffer: ArrayBuffer,
    view: DataView,
    sizeOffset: number,
    headerLength: number,
  ): Uint8Array | null {
    if (buffer.byteLength < headerLength) {
      this.reportMalformed(`header needs ${headerLength} bytes, got ${buffer.byteLength}`);
      return null;
    }
    const dataSize = view.getUint32(sizeOffset, true);
    const available = buffer.byteLength - headerLength;
    if (dataSize > available) {
      this.reportMalformed(`payload declares ${dataSize} bytes, only ${available} available`);
      return null;
    }
    return new Uint8Array(buffer, headerLength, dataSize);
  }

  /** Reports an undecodable binary message through `onError`. */
  private reportMalformed(detail: string): void {
    this.config.onError(new Error(`Malformed binary message: ${detail}`));
  }

  /**
   * Handles a parsed JSON message. Only `done` and `error` change state;
   * `first_frame` and `status` are accepted and ignored, as is anything that
   * is not an object.
   */
  private handleJsonMessage(data: unknown): void {
    if (typeof data !== 'object' || data === null) return;
    const message = data as { type?: unknown; message?: unknown };

    switch (message.type) {
      case 'first_frame':
        // Server-side first frame latency
        break;
      case 'done':
        // A `done` that belongs to a generation already stopped must not
        // start playback again.
        if (!this.generationActive) break;
        this.streamDone = true;
        this.tryStartStreamingPlayback();
        break;
      case 'status':
        // Status message from server
        break;
      case 'error': {
        const detail =
          typeof message.message === 'string' ? message.message : 'Unknown server error';
        this.config.onError(new Error(detail));
        this.cleanup();
        this.setStatus('error');
        break;
      }
    }
  }

  // ==========================================================================
  // Private Methods - Frame Handling
  // ==========================================================================

  /**
   * Stores a received JPEG frame as an object URL, queues it for rendering
   * when playback is already running, and reports first-frame latency.
   */
  private handleFrame(frameData: Uint8Array, frameIndex: number): void {
    const blob = new Blob([frameData], { type: 'image/jpeg' });
    const url = URL.createObjectURL(blob);
    this.allFrames.push({ url, index: frameIndex });
    if (this.playbackStarted) {
      this.frameQueue.push({ url, index: frameIndex });
    }
    if (this.allFrames.length === 1) {
      this.firstFrameLatencyMs = Date.now() - this.startTime;
      this.config.onFirstFrame(this.firstFrameLatencyMs);
      // Playback may already run (audio-only start after `done`); do not
      // report buffering over a playing status.
      if (!this.playbackStarted) {
        this.setStatus('buffering');
      }
    }
    this.tryStartStreamingPlayback();
  }

  /** Starts the fixed-rate render loop; does nothing if it is already running. */
  private startRendering(): void {
    if (this.renderInterval !== null) return;
    this.lastRenderTime = performance.now();
    const frameInterval = 1000 / this.config.targetFps;
    this.renderInterval = window.setInterval(() => {
      this.renderNextFrame();
    }, frameInterval);
  }

  /**
   * Draws the next queued frame. When the queue is empty and the stream is
   * done, playback is finished; when it is empty but more data is expected,
   * the tick is skipped.
   */
  private renderNextFrame(): void {
    const frame = this.frameQueue.shift();
    if (!frame) {
      if (this.streamDone) {
        this.finishPlayback();
      }
      return;
    }

    const now = performance.now();
    const sinceLast = now - this.lastRenderTime;
    if (sinceLast > 0) {
      this.measuredFps = 1000 / sinceLast;
    }
    this.lastRenderTime = now;

    const session = this.sessionId;
    const img = new Image();
    img.onload = () => {
      // The generation this frame belongs to was stopped or replaced while
      // the image was decoding: release it without drawing or counting it.
      if (session !== this.sessionId) {
        URL.revokeObjectURL(frame.url);
        return;
      }
      const canvas = this.config.canvas;
      if (canvas.width !== img.width || canvas.height !== img.height) {
        canvas.width = img.width;
        canvas.height = img.height;
      }
      this.ctx.drawImage(img, 0, 0);
      URL.revokeObjectURL(frame.url);
      this.renderedFrames++;
      this.emitMetrics();
    };
    img.onerror = () => {
      // An undecodable frame is skipped, but its object URL must still be released.
      URL.revokeObjectURL(frame.url);
    };
    img.src = frame.url;
  }

  // ==========================================================================
  // Private Methods - Audio Handling
  // ==========================================================================

  /** Creates an audio context at the given sample rate, using the prefixed constructor as a fallback. */
  private createAudioContext(sampleRate: number): AudioContext {
    const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext;
    return new AudioContextCtor({ sampleRate });
  }

  /** Ensures an audio context exists and restarts the chunk schedule shortly after "now". */
  private initAudioContext(): void {
    if (!this.audioContext) {
      this.audioContext = this.createAudioContext(this.config.audioSampleRate);
    }
    this.nextAudioTime = this.audioContext.currentTime + 0.05;
    this.audioScheduledChunks = 0;
  }

  /**
   * Converts 16-bit PCM bytes to float samples in [-1, 1).
   * A trailing odd byte is ignored rather than failing the whole chunk.
   * Samples are read as little-endian, the byte order used for the message
   * header fields (assumed to hold for the payload as well).
   */
  private pcm16ToFloat32(bytes: Uint8Array): Float32Array {
    const sampleCount = bytes.byteLength >> 1;
    const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
    const floatSamples = new Float32Array(sampleCount);
    for (let i = 0; i < sampleCount; i++) {
      floatSamples[i] = view.getInt16(i * 2, true) / 32768;
    }
    return floatSamples;
  }

  /** Remembers a started source until it ends so that it can be stopped early. */
  private trackSource(source: ScheduledAudioSource): void {
    this.activeSources.add(source);
    source.onended = () => {
      this.activeSources.delete(source);
    };
  }

  /** Stops and disconnects every source that is still scheduled or playing. */
  private stopAudio(): void {
    for (const source of this.activeSources) {
      source.onended = null;
      try {
        source.stop();
      } catch {
        // The source has already finished; nothing left to stop.
      }
      source.disconnect();
    }
    this.activeSources.clear();
  }

  /**
   * Handles one streamed audio chunk: schedules it directly after the
   * previous one and records whether it is the last.
   *
   * @param chunkData Raw 16-bit PCM bytes.
   * @param chunkIndex Index from the message header; currently unused, chunks
   * are scheduled in arrival order.
   * @param isLast Whether the server flagged this as the final chunk.
   */
  private handleAudioChunk(chunkData: Uint8Array, chunkIndex: number, isLast: boolean): void {
    if (this.audioChunkCount === 0 && chunkData.length > 0) {
      this.initAudioContext();
    }
    if (chunkData.length > 0) {
      this.audioChunkCount++;
      this.scheduleAudioChunk(chunkData);
    }
    if (isLast) {
      this.audioChunksComplete = true;
    }
    this.tryStartStreamingPlayback();
  }

  /** Converts one PCM chunk and schedules it at the end of the audio timeline. */
  private scheduleAudioChunk(chunkData: Uint8Array): void {
    const audioContext = this.audioContext;
    if (!audioContext) return;
    try {
      const floatSamples = this.pcm16ToFloat32(chunkData);
      // createBuffer rejects a zero-length buffer; a chunk that holds less
      // than one whole sample has nothing to play.
      if (floatSamples.length === 0) return;
      this.totalAudioSamples += floatSamples.length;

      const chunkBuffer = audioContext.createBuffer(
        1,
        floatSamples.length,
        this.config.audioSampleRate,
      );
      chunkBuffer.getChannelData(0).set(floatSamples);

      const source = audioContext.createBufferSource();
      source.buffer = chunkBuffer;
      source.connect(audioContext.destination);

      // If delivery fell behind the playhead, restart just ahead of it
      // instead of scheduling in the past.
      if (this.nextAudioTime < audioContext.currentTime) {
        this.nextAudioTime = audioContext.currentTime + 0.01;
      }
      source.start(this.nextAudioTime);
      this.trackSource(source);

      this.nextAudioTime += floatSamples.length / this.config.audioSampleRate;
      this.audioScheduledChunks++;
    } catch (e) {
      this.config.onError(new Error(`Audio processing error: ${e}`));
    }
  }

  /**
   * Fallback for servers that send the whole audio track in one message:
   * plays it immediately and marks the audio as complete.
   *
   * @param audioData Raw 16-bit PCM bytes of the full track.
   * @param sampleRate Sample rate announced in the message header.
   */
  private handleAudioComplete(audioData: Uint8Array, sampleRate: number): void {
    try {
      if (!(sampleRate > 0)) {
        throw new Error(`invalid sample rate ${sampleRate}`);
      }
      if (!this.audioContext) {
        this.audioContext = this.createAudioContext(sampleRate);
      }
      const audioContext = this.audioContext;
      const floatSamples = this.pcm16ToFloat32(audioData);

      this.totalAudioSamples = floatSamples.length;
      // Duration metrics must use the rate this track was actually encoded at.
      this.streamSampleRate = sampleRate;
      this.audioChunksComplete = true;

      if (floatSamples.length > 0) {
        const audioBuffer = audioContext.createBuffer(1, floatSamples.length, sampleRate);
        audioBuffer.getChannelData(0).set(floatSamples);
        const source = audioContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(audioContext.destination);
        source.start();
        this.trackSource(source);
      }
      this.tryStartStreamingPlayback();
    } catch (e) {
      this.config.onError(new Error(`Audio processing error: ${e}`));
    }
  }

  // ==========================================================================
  // Private Methods - Playback Control
  // ==========================================================================

  /**
   * Starts video playback once enough data is buffered.
   *
   * The frame and audio-chunk minimums only apply while more data may still
   * arrive. Once the server has signalled `done`, or the audio is complete,
   * waiting for a threshold that can no longer be reached would leave a short
   * stream stuck forever, so playback starts with whatever was received.
   */
  private tryStartStreamingPlayback(): void {
    if (this.playbackStarted || !this.generationActive) return;

    const framesReady =
      this.streamDone || this.allFrames.length >= this.config.minFramesBuffer;
    const audioReady =
      this.streamDone ||
      this.audioChunksComplete ||
      this.audioScheduledChunks >= this.config.minAudioChunks;
    if (!framesReady || !audioReady) return;

    this.playbackStarted = true;
    this.playbackLatencyMs = Date.now() - this.startTime;
    this.config.onPlaybackStart(this.playbackLatencyMs);
    this.frameQueue = [...this.allFrames];
    this.startRendering();
    this.setStatus('playing');
  }

  /** Stops the render loop, reports the summary through `onComplete` and marks the stream completed. */
  private finishPlayback(): void {
    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
      this.renderInterval = null;
    }
    this.measuredFps = 0;

    const elapsed = (Date.now() - this.startTime) / 1000;
    const videoDuration = this.allFrames.length / this.config.targetFps;
    const audioDuration = this.totalAudioSamples / this.streamSampleRate;
    const summary: StreamSummary = {
      totalFrames: this.renderedFrames,
      durationSeconds: elapsed,
      totalBytes: this.totalBytes,
      videoDuration,
      audioDuration,
      finalSyncDiff: audioDuration - videoDuration,
    };
    this.config.onComplete(summary);
    this.setStatus('completed');
  }

  // ==========================================================================
  // Private Methods - Utilities
  // ==========================================================================

  /** Records the new status and notifies `onStatusChange`. */
  private setStatus(status: StreamStatus): void {
    this.status = status;
    this.config.onStatusChange(status);
  }

  /** Sends a fresh metrics snapshot to `onMetricsUpdate`. */
  private emitMetrics(): void {
    this.config.onMetricsUpdate(this.getMetrics());
  }

  /** Releases the previous generation's resources and zeroes all stream state and metrics. */
  private resetState(): void {
    this.cleanup();
    this.allFrames = [];
    this.frameQueue = [];
    this.renderedFrames = 0;
    this.totalBytes = 0;
    this.startTime = 0;
    this.lastRenderTime = 0;
    this.streamDone = false;
    this.playbackStarted = false;
    this.audioChunkCount = 0;
    this.audioChunksComplete = false;
    this.nextAudioTime = 0;
    this.audioScheduledChunks = 0;
    this.totalAudioSamples = 0;
    this.streamSampleRate = this.config.audioSampleRate;
    this.firstFrameLatencyMs = null;
    this.playbackLatencyMs = null;
  }

  /**
   * Ends the active generation: stops rendering and audio and releases the
   * object URLs of frames that were never drawn. Received-frame counts are
   * kept so metrics remain readable until the next `generate()`.
   */
  private cleanup(): void {
    this.generationActive = false;
    this.sessionId++;
    if (this.renderInterval !== null) {
      clearInterval(this.renderInterval);
      this.renderInterval = null;
    }
    this.measuredFps = 0;
    this.stopAudio();

    // Once playback has started every undrawn frame is in the queue; before
    // that, the received frames have not been queued at all.
    const unreleased = this.playbackStarted ? this.frameQueue : this.allFrames;
    for (const frame of unreleased) {
      URL.revokeObjectURL(frame.url);
    }
    this.frameQueue = [];
  }

  /** Starts sending a `ping` action every 30 seconds while the socket is open. */
  private startHeartbeat(): void {
    // Never stack a second timer on top of one left over from an earlier connection.
    this.stopHeartbeat();
    this.heartbeatInterval = window.setInterval(() => {
      const socket = this.ws;
      if (socket && socket.readyState === WebSocket.OPEN) {
        socket.send(JSON.stringify({ action: 'ping' }));
      }
    }, 30000);
  }

  /** Stops the heartbeat timer, if one is running. */
  private stopHeartbeat(): void {
    if (this.heartbeatInterval !== null) {
      clearInterval(this.heartbeatInterval);
      this.heartbeatInterval = null;
    }
  }
}
// Default export of the SDK class, in addition to its named export
export default AvatarStreamSDK;