/**
 * VoskEngine - speech recognition engine with phonetic alignment.
 *
 * Uses Vosk (WASM port of Kaldi) for:
 * 1. Audio transcription
 * 2. Word alignment with timestamps
 * 3. Precise detection of speech start/end
 */

export interface VoskConfig {
  /** Vosk model URL (default: small Portuguese model). */
  modelUrl?: string;
  /** Audio sample rate in Hz (default: 16000, the Vosk standard). */
  sampleRate?: number;
  /** Callback for model loading progress, called with `loaded / total` (0 when the total is unknown). */
  onModelProgress?: (progress: number) => void;
  /** Callback for partial recognition results. */
  onPartialResult?: (text: string) => void;
  /** Callback for debug information, invoked once per successful `processAudio` call. */
  onDebug?: (info: VoskDebugInfo) => void;
}

export interface WordAlignment {
  /** Recognized word. */
  word: string;
  /** Start, in seconds. */
  start: number;
  /** End, in seconds. */
  end: number;
  /** Confidence (0-1). */
  confidence: number;
}

export interface VoskResult {
  /** Full recognized text. */
  text: string;
  /** Per-word alignment. */
  words: WordAlignment[];
  /** Speech start in ms. */
  speechStartMs: number;
  /** Speech end in ms. */
  speechEndMs: number;
  /** Total speech duration in ms (`speechEndMs - speechStartMs`). */
  speechDurationMs: number;
}

export interface VoskDebugInfo {
  /** Whether the model was loaded when the audio was processed. */
  modelLoaded: boolean;
  /** Wall-clock time spent inside `processAudio`, in ms. */
  processingTimeMs: number;
  /** Number of words in the final result. */
  wordsDetected: number;
  /** Mean word confidence, or 0 when no words were detected. */
  averageConfidence: number;
}

// vosk-browser types (declared here to avoid a dependency on its typings).
interface VoskModel {
  // Vosk internal methods
}

/** One word entry as reported by the recognizer. */
interface VoskWordResult {
  word: string;
  start: number;
  end: number;
  conf: number;
}

/** Shape shared by `result()` and `finalResult()`. */
interface VoskRecognitionResult {
  text: string;
  result?: VoskWordResult[];
}

interface VoskRecognizer {
  acceptWaveform(data: Float32Array | Int16Array): boolean;
  result(): VoskRecognitionResult;
  partialResult(): { partial: string };
  finalResult(): VoskRecognitionResult;
  free(): void;
}

// Global declaration for vosk-browser.
// NOTE(review): this is the API shape this file assumes (`createRecognizer`,
// synchronous `finalResult()`); confirm it matches what the script loaded in
// `loadVoskScript` actually exposes on `window.Vosk`.
declare global {
  interface Window {
    Vosk?: {
      createModel(
        url: string,
        progress?: (loaded: number, total: number) => void
      ): Promise<VoskModel>;
      createRecognizer(model: VoskModel, sampleRate: number): VoskRecognizer;
    };
  }
}

export class VoskEngine {
  /** Pre-configured Vosk model URLs. */
  static readonly MODELS = {
    // Brazilian Portuguese - small (~50MB)
    'pt-br-small': 'https://alphacephei.com/vosk/models/vosk-model-small-pt-0.3.zip',
    // Portuguese - large (~1GB, better quality)
    'pt-br-large': 'https://alphacephei.com/vosk/models/vosk-model-pt-fb-v0.1.1-20220516_2113.zip',
    // English - small
    'en-small': 'https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip',
    // Spanish - small
    'es-small': 'https://alphacephei.com/vosk/models/vosk-model-small-es-0.42.zip',
  } as const;

  /** In-flight script load shared by all instances so the tag is appended only once. */
  private static scriptPromise: Promise<void> | null = null;

  private config: Required<VoskConfig>;
  private model: VoskModel | null = null;
  private recognizer: VoskRecognizer | null = null;
  private isModelLoaded: boolean = false;
  /** In-flight model load, so concurrent callers share a single download. */
  private loadPromise: Promise<void> | null = null;

  /**
   * @param config Optional overrides; every missing field gets a default
   *   (small Portuguese model, 16000 Hz, no-op callbacks).
   */
  constructor(config: Partial<VoskConfig> = {}) {
    this.config = {
      modelUrl: config.modelUrl ?? VoskEngine.MODELS['pt-br-small'],
      sampleRate: config.sampleRate ?? 16000,
      onModelProgress: config.onModelProgress ?? (() => {}),
      onPartialResult: config.onPartialResult ?? (() => {}),
      onDebug: config.onDebug ?? (() => {}),
    };
  }

  /**
   * Loads the Vosk browser script by appending a `<script>` tag to the document head.
   * Resolves immediately when `window.Vosk` already exists.
   *
   * @throws Error when the script fails to load or does not define `window.Vosk`.
   */
  private async loadVoskScript(): Promise<void> {
    if (window.Vosk) return;

    let pending = VoskEngine.scriptPromise;
    if (!pending) {
      const started = new Promise<void>((resolve, reject) => {
        const script = document.createElement('script');
        script.src = 'https://cdn.jsdelivr.net/npm/vosk-browser@0.0.8/dist/vosk.js';
        script.async = true;
        script.onload = () => {
          if (window.Vosk) {
            resolve();
          } else {
            reject(new Error('Vosk not available after script load'));
          }
        };
        script.onerror = () => reject(new Error('Failed to load Vosk script'));
        document.head.appendChild(script);
      });
      VoskEngine.scriptPromise = started;
      // Forget a failed attempt so a later call can retry.
      started.catch(() => {
        if (VoskEngine.scriptPromise === started) {
          VoskEngine.scriptPromise = null;
        }
      });
      pending = started;
    }
    return pending;
  }

  /**
   * Loads the Vosk model. A no-op when the model is already loaded; concurrent
   * calls share the same in-flight load instead of downloading the model twice.
   *
   * @throws Error when the script or the model cannot be loaded.
   */
  async loadModel(): Promise<void> {
    if (this.isModelLoaded && this.model) return;

    let pending = this.loadPromise;
    if (!pending) {
      const started = this.fetchModel();
      this.loadPromise = started;
      // Clear the marker on both success and failure (failure allows a retry).
      const clear = (): void => {
        if (this.loadPromise === started) {
          this.loadPromise = null;
        }
      };
      started.then(clear, clear);
      pending = started;
    }
    return pending;
  }

  /** Performs the actual script + model load and records the result on the instance. */
  private async fetchModel(): Promise<void> {
    // Load the script first.
    await this.loadVoskScript();

    if (!window.Vosk) {
      throw new Error('Vosk not available');
    }

    try {
      this.model = await window.Vosk.createModel(this.config.modelUrl, (loaded, total) => {
        const progress = total > 0 ? loaded / total : 0;
        this.config.onModelProgress(progress);
      });
      this.isModelLoaded = true;
      console.log('Vosk model loaded successfully');
    } catch (e) {
      console.error('Failed to load Vosk model:', e);
      throw new Error(`Failed to load Vosk model: ${e}`);
    }
  }

  /**
   * Processes audio and returns the word alignment.
   *
   * Loads the model on demand, resamples to the configured sample rate when
   * needed, feeds the audio to a fresh recognizer in chunks (emitting partial
   * results through `onPartialResult`) and reports timing through `onDebug`.
   *
   * @param audioSamples Mono PCM samples.
   * @param originalSampleRate Sample rate of `audioSamples` in Hz.
   * @returns Recognized text, per-word timings and speech boundaries. When no
   *   words are detected the boundaries span the whole (resampled) buffer.
   * @throws Error when the model cannot be loaded.
   * @throws RangeError when a resample is needed and a sample rate is not a
   *   positive finite number.
   */
  async processAudio(audioSamples: Float32Array, originalSampleRate: number): Promise<VoskResult> {
    const startTime = performance.now();

    if (!this.isModelLoaded || !this.model) {
      await this.loadModel();
    }

    if (!window.Vosk || !this.model) {
      throw new Error('Vosk model not loaded');
    }

    // Resample if necessary (Vosk uses 16000 Hz by default).
    const targetRate = this.config.sampleRate;
    const samples =
      originalSampleRate !== targetRate
        ? this.resample(audioSamples, originalSampleRate, targetRate)
        : audioSamples;

    const recognizer = window.Vosk.createRecognizer(this.model, targetRate);
    const finalResult = this.runRecognizer(recognizer, samples);

    const words: WordAlignment[] = (finalResult.result || []).map(w => ({
      word: w.word,
      start: w.start,
      end: w.end,
      confidence: w.conf,
    }));

    // Compute speech boundaries: default to the whole buffer, tighten to the
    // first/last word when any were recognized.
    let speechStartMs = 0;
    let speechEndMs = (samples.length / targetRate) * 1000;
    const firstWord = words[0];
    const lastWord = words[words.length - 1];
    if (firstWord && lastWord) {
      speechStartMs = firstWord.start * 1000;
      speechEndMs = lastWord.end * 1000;
    }

    const processingTimeMs = performance.now() - startTime;
    const avgConfidence =
      words.length > 0 ? words.reduce((sum, w) => sum + w.confidence, 0) / words.length : 0;

    this.config.onDebug({
      modelLoaded: this.isModelLoaded,
      processingTimeMs,
      wordsDetected: words.length,
      averageConfidence: avgConfidence,
    });

    return {
      text: finalResult.text || '',
      words,
      speechStartMs,
      speechEndMs,
      speechDurationMs: speechEndMs - speechStartMs,
    };
  }

  /**
   * Feeds `samples` to `recognizer` in chunks and returns its final result.
   * The recognizer is always released, even when recognition or the
   * `onPartialResult` callback throws.
   */
  private runRecognizer(recognizer: VoskRecognizer, samples: Float32Array): VoskRecognitionResult {
    this.recognizer = recognizer;
    try {
      // Process in chunks.
      const chunkSize = 4000;
      for (let offset = 0; offset < samples.length; offset += chunkSize) {
        const chunk = samples.subarray(offset, Math.min(offset + chunkSize, samples.length));
        recognizer.acceptWaveform(chunk);

        // Emit partial result.
        const partial = recognizer.partialResult();
        if (partial.partial) {
          this.config.onPartialResult(partial.partial);
        }
      }
      return recognizer.finalResult();
    } finally {
      // Skip the free when dispose() already released it (e.g. from a callback).
      if (this.recognizer === recognizer) {
        recognizer.free();
        this.recognizer = null;
      }
    }
  }

  /**
   * Performs forced alignment between audio and known text
   * (useful when you already know the text that was spoken).
   *
   * Currently this runs normal recognition and only logs a warning when the
   * recognized words differ too much from `expectedText`; the recognition
   * result is returned unchanged.
   */
  async forceAlign(
    audioSamples: Float32Array,
    originalSampleRate: number,
    expectedText: string
  ): Promise<VoskResult> {
    // First, run normal recognition.
    const result = await this.processAudio(audioSamples, originalSampleRate);

    const expectedWords = this.tokenize(expectedText);
    const recognizedWords = this.tokenize(result.text);

    // Simple similarity measure.
    const similarity = this.calculateSimilarity(expectedWords, recognizedWords);

    if (similarity < 0.7) {
      console.warn(`Low alignment confidence: ${similarity.toFixed(2)}`);
      // We could try interpolation based on the expected text.
      // For now, return the original result.
    }

    return result;
  }

  /**
   * Lower-cases `text` and splits it on whitespace, dropping the empty tokens
   * that leading/trailing whitespace or an empty string would otherwise produce.
   */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .split(/\s+/)
      .filter(token => token.length > 0);
  }

  /**
   * Resamples audio to another sample rate using linear interpolation.
   * No anti-aliasing filter is applied. Returns the input array itself when
   * both rates are equal.
   *
   * @throws RangeError when either rate is not a positive finite number.
   */
  private resample(samples: Float32Array, fromRate: number, toRate: number): Float32Array {
    if (fromRate === toRate) return samples;
    if (
      !Number.isFinite(fromRate) ||
      !Number.isFinite(toRate) ||
      fromRate <= 0 ||
      toRate <= 0
    ) {
      throw new RangeError(`Invalid sample rate for resampling: ${fromRate} -> ${toRate}`);
    }

    const ratio = fromRate / toRate;
    const newLength = Math.floor(samples.length / ratio);
    const result = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
      const srcIndex = i * ratio;
      const srcIndexFloor = Math.floor(srcIndex);
      // Clamp so the last output sample never reads past the input.
      const srcIndexCeil = Math.min(srcIndexFloor + 1, samples.length - 1);
      const t = srcIndex - srcIndexFloor;

      // Linear interpolation.
      result[i] = samples[srcIndexFloor] * (1 - t) + samples[srcIndexCeil] * t;
    }

    return result;
  }

  /**
   * Computes the Jaccard similarity between two word lists
   * (|intersection| / |union| of the distinct words); 0 when both are empty.
   */
  private calculateSimilarity(words1: string[], words2: string[]): number {
    const set1 = new Set(words1);
    const set2 = new Set(words2);

    const intersection = new Set([...set1].filter(x => set2.has(x)));
    const union = new Set([...set1, ...set2]);

    return union.size > 0 ? intersection.size / union.size : 0;
  }

  /** Returns whether the model is loaded. */
  isReady(): boolean {
    return this.isModelLoaded;
  }

  /** Releases resources: frees any live recognizer and drops the model reference. */
  dispose(): void {
    if (this.recognizer) {
      this.recognizer.free();
      this.recognizer = null;
    }
    this.model = null;
    this.isModelLoaded = false;
    this.loadPromise = null;
  }
}

export default VoskEngine;