| |
| |
| |
| |
| |
| |
| |
| |
|
|
/**
 * Options accepted by the `VoskEngine` constructor.
 *
 * Every field is optional; the constructor fills in a default for each one
 * that is left out.
 */
export interface VoskConfig {
  /**
   * URL of the model archive handed to `window.Vosk.createModel`.
   * The constructor falls back to `VoskEngine.MODELS['pt-br-small']`.
   */
  modelUrl?: string;

  /**
   * Sample rate, in Hz, that the recognizer is created with. Audio passed to
   * `processAudio` at a different rate is resampled to this value first.
   * The constructor falls back to 16000.
   */
  sampleRate?: number;

  /**
   * Called while the model is loading with `loaded / total`, or with 0 when
   * the reported total is not positive.
   */
  onModelProgress?: (progress: number) => void;

  /**
   * Called after each audio chunk with the recognizer's partial text,
   * whenever that text is non-empty.
   */
  onPartialResult?: (text: string) => void;

  /** Called once per `processAudio` run with timing and confidence statistics. */
  onDebug?: (info: VoskDebugInfo) => void;
}
|
|
/**
 * Timing and confidence for a single recognized word, copied from one entry
 * of the recognizer's `result` array.
 */
export interface WordAlignment {
  /** The recognized word as reported by the recognizer. */
  word: string;

  /**
   * Start of the word within the processed audio. `processAudio` multiplies
   * this by 1000 to obtain milliseconds, i.e. it is treated as seconds.
   */
  start: number;

  /** End of the word, in the same unit as `start`. */
  end: number;

  /** The recognizer's `conf` value for this word. */
  confidence: number;
}
|
|
/** Outcome of one `VoskEngine.processAudio` / `forceAlign` run. */
export interface VoskResult {
  /** Final transcript text; the empty string when the recognizer returned none. */
  text: string;

  /** Per-word alignments; empty when the recognizer reported no word list. */
  words: WordAlignment[];

  /** Start of the first recognized word in milliseconds, or 0 when there are no words. */
  speechStartMs: number;

  /**
   * End of the last recognized word in milliseconds, or the duration of the
   * processed audio when there are no words.
   */
  speechEndMs: number;

  /** `speechEndMs - speechStartMs`. */
  speechDurationMs: number;
}
|
|
/** Diagnostics passed to `VoskConfig.onDebug` at the end of each `processAudio` run. */
export interface VoskDebugInfo {
  /** Whether the engine considered its model loaded when the run finished. */
  modelLoaded: boolean;
  /** Wall-clock time spent inside `processAudio`, in milliseconds. */
  processingTimeMs: number;
  /** Number of word alignments produced. */
  wordsDetected: number;
  /** Arithmetic mean of the per-word confidences, or 0 when no words were detected. */
  averageConfidence: number;
}
|
|
| |
/**
 * Opaque handle for a loaded model. This file never reads a member from it;
 * it only passes the value from `createModel` on to `createRecognizer`.
 */
interface VoskModel {
  // Intentionally empty: no member of the model is used in this file.
}
|
|
/** Transcript payload shared by `result()` and `finalResult()`. */
type VoskRecognizerResult = {
  text: string;
  result?: Array<{ word: string; start: number; end: number; conf: number }>;
};

/**
 * Minimal recognizer surface used by this file.
 *
 * NOTE(review): these signatures are declared locally rather than imported
 * from a typed package; confirm they match the library that is loaded at runtime.
 */
interface VoskRecognizer {
  /** Feeds one chunk of audio samples to the recognizer. */
  acceptWaveform(data: Float32Array | Int16Array): boolean;
  /** Returns a transcript, optionally with per-word entries. */
  result(): VoskRecognizerResult;
  /** Returns the in-progress (partial) transcript. */
  partialResult(): { partial: string };
  /** Returns the transcript after all audio has been fed. */
  finalResult(): VoskRecognizerResult;
  /** Releases the recognizer; it is not used again after this call. */
  free(): void;
}
|
|
| |
/**
 * Ambient declaration of the `window.Vosk` global. It is optional because it
 * is absent until the library script has been loaded.
 *
 * NOTE(review): confirm these signatures against the script that is actually
 * injected at runtime; they are asserted here, not derived from the library.
 */
declare global {
  interface Window {
    Vosk?: {
      /** Loads a model from `url`, optionally reporting `(loaded, total)` progress. */
      createModel(url: string, progress?: (loaded: number, total: number) => void): Promise<VoskModel>;
      /** Creates a recognizer bound to `model` at the given sample rate. */
      createRecognizer(model: VoskModel, sampleRate: number): VoskRecognizer;
    };
  }
}
|
|
/**
 * Speech-to-text helper built on a `window.Vosk` global that is loaded on demand.
 *
 * Typical use: construct with an optional {@link VoskConfig}, then call
 * `processAudio` (or `forceAlign`) with mono float samples. The model is
 * loaded lazily on first use, or eagerly through `loadModel`.
 *
 * NOTE(review): confirm that the script at `SCRIPT_URL` really exposes
 * `createModel(url, progress)` and `createRecognizer(model, rate)` on
 * `window.Vosk`; this class relies on the ambient declarations in this file.
 */
export class VoskEngine {
  /** Script injected when `window.Vosk` is not yet defined. */
  private static readonly SCRIPT_URL = 'https://cdn.jsdelivr.net/npm/vosk-browser@0.0.8/dist/vosk.js';

  /** Number of samples fed to the recognizer per `acceptWaveform` call. */
  private static readonly CHUNK_SIZE = 4000;

  /** Similarity below which `forceAlign` logs a warning. */
  private static readonly MIN_ALIGNMENT_SIMILARITY = 0.7;

  /** In-flight script injection, shared by all instances so the tag is added once. */
  private static scriptLoad: Promise<void> | null = null;

  private config: Required<VoskConfig>;
  private model: VoskModel | null = null;
  private recognizer: VoskRecognizer | null = null;
  private isModelLoaded = false;

  /** In-flight model load, so concurrent callers share one download. */
  private modelLoad: Promise<void> | null = null;

  /** Model archive URLs keyed by a short language/size identifier. */
  static readonly MODELS = {
    'pt-br-small': 'https://alphacephei.com/vosk/models/vosk-model-small-pt-0.3.zip',
    'pt-br-large': 'https://alphacephei.com/vosk/models/vosk-model-pt-fb-v0.1.1-20220516_2113.zip',
    'en-small': 'https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip',
    'es-small': 'https://alphacephei.com/vosk/models/vosk-model-small-es-0.42.zip',
  } as const;

  /**
   * @param config Optional overrides. Missing fields default to the
   *   `pt-br-small` model, a 16000 Hz sample rate and no-op callbacks.
   */
  constructor(config: Partial<VoskConfig> = {}) {
    this.config = {
      modelUrl: config.modelUrl ?? VoskEngine.MODELS['pt-br-small'],
      sampleRate: config.sampleRate ?? 16000,
      onModelProgress: config.onModelProgress ?? (() => {}),
      onPartialResult: config.onPartialResult ?? (() => {}),
      onDebug: config.onDebug ?? (() => {}),
    };
  }

  /**
   * Ensures `window.Vosk` exists, injecting the library script if needed.
   * Concurrent callers share the same pending injection.
   *
   * @throws Error if the script fails to load or does not define `window.Vosk`.
   */
  private loadVoskScript(): Promise<void> {
    if (window.Vosk) return Promise.resolve();
    if (VoskEngine.scriptLoad) return VoskEngine.scriptLoad;

    const pending = new Promise<void>((resolve, reject) => {
      const script = document.createElement('script');
      script.src = VoskEngine.SCRIPT_URL;
      script.async = true;

      script.onload = () => {
        if (window.Vosk) {
          resolve();
        } else {
          reject(new Error('Vosk not available after script load'));
        }
      };

      script.onerror = () => reject(new Error('Failed to load Vosk script'));
      document.head.appendChild(script);
    });

    VoskEngine.scriptLoad = pending;
    // Forget the pending load once it settles so a failed attempt can be retried.
    const forget = (): void => {
      if (VoskEngine.scriptLoad === pending) VoskEngine.scriptLoad = null;
    };
    void pending.then(forget, forget);
    return pending;
  }

  /**
   * Loads the library script (if necessary) and the configured model.
   * Resolves immediately when the model is already loaded; concurrent calls
   * share a single load instead of each starting their own.
   *
   * @throws Error if the library is unavailable or the model cannot be created.
   */
  async loadModel(): Promise<void> {
    if (this.isModelLoaded && this.model) return;

    if (!this.modelLoad) {
      const pending = this.fetchModel().finally(() => {
        if (this.modelLoad === pending) this.modelLoad = null;
      });
      this.modelLoad = pending;
    }
    return this.modelLoad;
  }

  /** Performs the actual script + model load behind `loadModel`. */
  private async fetchModel(): Promise<void> {
    await this.loadVoskScript();

    if (!window.Vosk) {
      throw new Error('Vosk not available');
    }

    try {
      this.model = await window.Vosk.createModel(this.config.modelUrl, (loaded, total) => {
        // Report a 0..1 fraction; an unknown (non-positive) total is reported as 0.
        this.config.onModelProgress(total > 0 ? loaded / total : 0);
      });
      this.isModelLoaded = true;
      console.log('Vosk model loaded successfully');
    } catch (e: unknown) {
      console.error('Failed to load Vosk model:', e);
      throw new Error(`Failed to load Vosk model: ${String(e)}`);
    }
  }

  /**
   * Transcribes a buffer of audio samples.
   *
   * The samples are resampled to the configured rate when necessary, fed to
   * a fresh recognizer in fixed-size chunks (emitting `onPartialResult`
   * along the way), and the final result is converted into a
   * {@link VoskResult}. `onDebug` is invoked once before returning.
   *
   * @param audioSamples Audio samples to recognize.
   * @param originalSampleRate Sample rate of `audioSamples`, in Hz.
   * @returns Transcript, word alignments and speech timing in milliseconds.
   * @throws RangeError if `originalSampleRate` is not a positive finite number.
   * @throws Error if the model cannot be loaded.
   */
  async processAudio(audioSamples: Float32Array, originalSampleRate: number): Promise<VoskResult> {
    const startTime = performance.now();

    // Reject rates that would otherwise produce an empty or invalid resample buffer.
    if (!Number.isFinite(originalSampleRate) || originalSampleRate <= 0) {
      throw new RangeError(
        `originalSampleRate must be a positive finite number, got ${originalSampleRate}`
      );
    }

    if (!this.isModelLoaded || !this.model) {
      await this.loadModel();
    }

    if (!window.Vosk || !this.model) {
      throw new Error('Vosk model not loaded');
    }

    const targetRate = this.config.sampleRate;
    const samples =
      originalSampleRate !== targetRate
        ? this.resample(audioSamples, originalSampleRate, targetRate)
        : audioSamples;

    const finalResult = this.recognize(window.Vosk.createRecognizer(this.model, targetRate), samples);

    const words: WordAlignment[] = (finalResult.result ?? []).map((w) => ({
      word: w.word,
      start: w.start,
      end: w.end,
      confidence: w.conf,
    }));

    // Without words, treat the whole buffer as the speech span.
    let speechStartMs = 0;
    let speechEndMs = (samples.length / targetRate) * 1000;

    const firstWord = words[0];
    const lastWord = words[words.length - 1];
    if (firstWord && lastWord) {
      speechStartMs = firstWord.start * 1000;
      speechEndMs = lastWord.end * 1000;
    }

    const processingTimeMs = performance.now() - startTime;
    const averageConfidence =
      words.length > 0 ? words.reduce((sum, w) => sum + w.confidence, 0) / words.length : 0;

    this.config.onDebug({
      modelLoaded: this.isModelLoaded,
      processingTimeMs,
      wordsDetected: words.length,
      averageConfidence,
    });

    return {
      text: finalResult.text || '',
      words,
      speechStartMs,
      speechEndMs,
      speechDurationMs: speechEndMs - speechStartMs,
    };
  }

  /**
   * Feeds `samples` to `recognizer` chunk by chunk and returns its final result.
   * The recognizer is always released, even when the recognizer or a user
   * callback throws part-way through.
   */
  private recognize(
    recognizer: VoskRecognizer,
    samples: Float32Array
  ): ReturnType<VoskRecognizer['finalResult']> {
    this.recognizer = recognizer;
    try {
      for (let offset = 0; offset < samples.length; offset += VoskEngine.CHUNK_SIZE) {
        // subarray clamps the end index, so the last chunk may be shorter.
        recognizer.acceptWaveform(samples.subarray(offset, offset + VoskEngine.CHUNK_SIZE));

        const { partial } = recognizer.partialResult();
        if (partial) {
          this.config.onPartialResult(partial);
        }
      }
      return recognizer.finalResult();
    } finally {
      // Skip the release if dispose() already freed this recognizer from a callback.
      if (this.recognizer === recognizer) {
        recognizer.free();
        this.recognizer = null;
      }
    }
  }

  /**
   * Runs recognition and compares the transcript with the expected text.
   *
   * The recognition result is returned unchanged; when the word-set
   * similarity between expected and recognized text falls below 0.7 a
   * warning is logged.
   *
   * @param audioSamples Audio samples to recognize.
   * @param originalSampleRate Sample rate of `audioSamples`, in Hz.
   * @param expectedText Text the speaker was expected to say.
   * @returns The same result `processAudio` produces for this audio.
   */
  async forceAlign(
    audioSamples: Float32Array,
    originalSampleRate: number,
    expectedText: string
  ): Promise<VoskResult> {
    const result = await this.processAudio(audioSamples, originalSampleRate);

    const expectedWords = this.tokenize(expectedText);
    const recognizedWords = this.tokenize(result.text);

    // Nothing expected and nothing heard counts as a perfect match.
    const similarity =
      expectedWords.length === 0 && recognizedWords.length === 0
        ? 1
        : this.calculateSimilarity(expectedWords, recognizedWords);

    if (similarity < VoskEngine.MIN_ALIGNMENT_SIMILARITY) {
      console.warn(`Low alignment confidence: ${similarity.toFixed(2)}`);
    }

    return result;
  }

  /**
   * Lower-cases `text` and splits it on whitespace, dropping empty tokens so
   * that leading or trailing whitespace does not add a spurious `''` word.
   */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0);
  }

  /**
   * Converts `samples` from `fromRate` to `toRate` by linear interpolation
   * between neighbouring source samples. Returns the input unchanged when the
   * rates are equal.
   */
  private resample(samples: Float32Array, fromRate: number, toRate: number): Float32Array {
    if (fromRate === toRate) return samples;

    const ratio = fromRate / toRate;
    const newLength = Math.floor(samples.length / ratio);
    const result = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
      const srcIndex = i * ratio;
      const lower = Math.floor(srcIndex);
      // Clamp so the last output sample never reads past the end of the input.
      const upper = Math.min(lower + 1, samples.length - 1);
      const t = srcIndex - lower;

      result[i] = samples[lower] * (1 - t) + samples[upper] * t;
    }

    return result;
  }

  /**
   * Size of the intersection of the two word sets divided by the size of
   * their union; 0 when both are empty.
   */
  private calculateSimilarity(words1: string[], words2: string[]): number {
    const set1 = new Set(words1);
    const set2 = new Set(words2);

    let shared = 0;
    for (const word of set1) {
      if (set2.has(word)) shared += 1;
    }

    const unionSize = set1.size + set2.size - shared;
    return unionSize > 0 ? shared / unionSize : 0;
  }

  /** Whether the model has been loaded. */
  isReady(): boolean {
    return this.isModelLoaded;
  }

  /**
   * Frees any live recognizer and drops the model reference, so the next
   * `processAudio` / `loadModel` call loads the model again.
   *
   * NOTE(review): the model handle is only dereferenced here; check whether
   * the underlying library needs an explicit release call for models.
   */
  dispose(): void {
    if (this.recognizer) {
      this.recognizer.free();
      this.recognizer = null;
    }
    this.model = null;
    this.isModelLoaded = false;
  }
}
|
|
// Default export kept alongside the named export so both import styles work.
export default VoskEngine;
|
|