// Provenance: commit 3acaae2 by marcosremar2 — "Add SDK and improve video transition synchronization" (9.49 kB)
/**
* VoskEngine - Motor de reconhecimento de fala com alinhamento fonético
*
* Usa Vosk (port WASM do Kaldi) para:
* 1. Transcrição de áudio
* 2. Alinhamento de palavras com timestamps
* 3. Detecção precisa de início/fim de fala
*/
/**
 * Options accepted by the VoskEngine constructor.
 * Every field is optional; the constructor substitutes a default for each
 * one that is omitted.
 */
export interface VoskConfig {
/** URL of the Vosk model archive (default: the small Portuguese model). */
modelUrl?: string;
/** Audio sample rate in Hz passed to the recognizer (default: 16000, the Vosk standard). */
sampleRate?: number;
/**
 * Model-loading progress callback. Receives `loaded / total`, or 0 while the
 * total size is not known (total <= 0).
 */
onModelProgress?: (progress: number) => void;
/** Partial-result callback; invoked with the non-empty partial text after a chunk is processed. */
onPartialResult?: (text: string) => void;
/** Debug callback; invoked with timing and confidence statistics after audio is processed. */
onDebug?: (info: VoskDebugInfo) => void;
}
/** Timing and confidence of a single recognized word. */
export interface WordAlignment {
/** The recognized word. */
word: string;
/** Start time, in seconds. */
start: number;
/** End time, in seconds. */
end: number;
/** Confidence score (0-1). */
confidence: number;
}
/** Outcome of processing one audio buffer. */
export interface VoskResult {
/** Full recognized text (empty string when nothing was recognized). */
text: string;
/** Per-word alignment, in the order reported by the recognizer. */
words: WordAlignment[];
/** Speech start in ms: start of the first word, or 0 when no words were found. */
speechStartMs: number;
/** Speech end in ms: end of the last word, or the audio duration when no words were found. */
speechEndMs: number;
/** Total speech duration in ms (`speechEndMs - speechStartMs`). */
speechDurationMs: number;
}
/** Diagnostic payload delivered to `VoskConfig.onDebug`. */
export interface VoskDebugInfo {
/** Whether the engine considered its model loaded when the audio was processed. */
modelLoaded: boolean;
/** Elapsed time of the processing call in ms, measured with `performance.now()`. */
processingTimeMs: number;
/** Number of words in the final result. */
wordsDetected: number;
/** Mean of the per-word confidences, or 0 when no words were detected. */
averageConfidence: number;
}
// vosk-browser types (declared here to avoid depending on its type package)
/**
 * Opaque handle to a loaded Vosk model. This file never reads any member of
 * it; the handle is only passed back to `createRecognizer`.
 */
interface VoskModel {
// Vosk internal methods (intentionally not declared)
}
/**
 * Shape of the recognizer object as this file uses it.
 * NOTE(review): these signatures are hand-written, not imported from
 * vosk-browser — confirm they match the bundle that is actually loaded.
 */
interface VoskRecognizer {
/**
 * Feeds one chunk of audio samples to the recognizer.
 * NOTE(review): the boolean return value is ignored by this file; its meaning
 * should be confirmed against the Vosk documentation.
 */
acceptWaveform(data: Float32Array | Int16Array): boolean;
/** Returns a recognition result with optional per-word timings; not called in this file. */
result(): { text: string; result?: Array<{ word: string; start: number; end: number; conf: number }> };
/** Returns the in-progress (partial) transcription text. */
partialResult(): { partial: string };
/** Returns the final result; `result`, when present, lists per-word timings and confidences. */
finalResult(): { text: string; result?: Array<{ word: string; start: number; end: number; conf: number }> };
/** Releases the recognizer; the instance is not used again after this call. */
free(): void;
}
// Global declaration for vosk-browser
// The script injected at runtime is expected to publish its API as
// `window.Vosk`; the property is optional because it is absent until that
// script has loaded.
declare global {
interface Window {
Vosk?: {
/** Loads a model from `url`; `progress`, when given, receives loaded and total amounts. */
createModel(url: string, progress?: (loaded: number, total: number) => void): Promise<VoskModel>;
/** Creates a recognizer bound to `model` for audio at `sampleRate` Hz. */
createRecognizer(model: VoskModel, sampleRate: number): VoskRecognizer;
};
}
}
/**
 * Speech-recognition engine with word-level alignment, backed by the Vosk
 * API that a dynamically injected script publishes on `window.Vosk`.
 *
 * Typical use: construct, optionally `await loadModel()` up front, then call
 * `processAudio()` (or `forceAlign()`) per audio buffer, and `dispose()` when
 * finished.
 *
 * NOTE(review): the published vosk-browser API creates recognizers with
 * `new model.KaldiRecognizer(sampleRate)`; confirm that the loaded bundle
 * really exposes `window.Vosk.createRecognizer` as declared in this file.
 */
export class VoskEngine {
  /** Pre-configured Vosk model archive URLs. */
  static readonly MODELS = {
    // Brazilian Portuguese - small (~50MB)
    'pt-br-small': 'https://alphacephei.com/vosk/models/vosk-model-small-pt-0.3.zip',
    // Portuguese - large (~1GB, better quality)
    'pt-br-large': 'https://alphacephei.com/vosk/models/vosk-model-pt-fb-v0.1.1-20220516_2113.zip',
    // English - small
    'en-small': 'https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip',
    // Spanish - small
    'es-small': 'https://alphacephei.com/vosk/models/vosk-model-small-es-0.42.zip',
  } as const;

  /**
   * In-flight injection of the Vosk script, shared by all instances so the
   * `<script>` tag is appended at most once at a time.
   */
  private static scriptLoad: Promise<void> | null = null;

  private config: Required<VoskConfig>;
  private model: VoskModel | null = null;
  private recognizer: VoskRecognizer | null = null;
  private isModelLoaded: boolean = false;
  /** In-flight model load; lets concurrent callers share a single download. */
  private modelLoad: Promise<void> | null = null;

  /**
   * @param config Optional overrides. Defaults: small Portuguese model,
   *   16000 Hz, and no-op callbacks.
   */
  constructor(config: Partial<VoskConfig> = {}) {
    this.config = {
      modelUrl: config.modelUrl ?? VoskEngine.MODELS['pt-br-small'],
      sampleRate: config.sampleRate ?? 16000,
      onModelProgress: config.onModelProgress ?? (() => {}),
      onPartialResult: config.onPartialResult ?? (() => {}),
      onDebug: config.onDebug ?? (() => {}),
    };
  }

  /**
   * Injects the Vosk browser script unless `window.Vosk` already exists.
   *
   * @returns A promise that resolves once `window.Vosk` is available.
   * @throws Error if the script fails to load or does not define `window.Vosk`.
   */
  private loadVoskScript(): Promise<void> {
    if (window.Vosk) return Promise.resolve();
    if (VoskEngine.scriptLoad) return VoskEngine.scriptLoad;

    const pending = new Promise<void>((resolve, reject) => {
      const script = document.createElement('script');
      script.src = 'https://cdn.jsdelivr.net/npm/vosk-browser@0.0.8/dist/vosk.js';
      script.async = true;
      script.onload = () => {
        if (window.Vosk) {
          resolve();
        } else {
          reject(new Error('Vosk not available after script load'));
        }
      };
      script.onerror = () => {
        // Drop the dead tag so retries do not pile up script elements.
        script.remove();
        reject(new Error('Failed to load Vosk script'));
      };
      document.head.appendChild(script);
    });

    VoskEngine.scriptLoad = pending;
    // Clear the shared slot when settled: success is detected via window.Vosk
    // from then on, and a failure must not block a later retry.
    const release = (): void => {
      if (VoskEngine.scriptLoad === pending) VoskEngine.scriptLoad = null;
    };
    pending.then(release, release);
    return pending;
  }

  /**
   * Loads the Vosk model once. Calls made while a load is in flight share
   * that load instead of starting a second download.
   *
   * @throws Error if the script or the model cannot be loaded.
   */
  async loadModel(): Promise<void> {
    if (this.isModelLoaded && this.model) return;
    if (this.modelLoad) return this.modelLoad;

    const pending = this.fetchModel();
    this.modelLoad = pending;
    // Forget the in-flight load when it settles so a failed load can be retried.
    const release = (): void => {
      if (this.modelLoad === pending) this.modelLoad = null;
    };
    pending.then(release, release);
    return pending;
  }

  /** Loads the script, then the model, reporting progress to `onModelProgress`. */
  private async fetchModel(): Promise<void> {
    // The script must be present before the model can be created.
    await this.loadVoskScript();
    if (!window.Vosk) {
      throw new Error('Vosk not available');
    }
    try {
      this.model = await window.Vosk.createModel(
        this.config.modelUrl,
        (loaded: number, total: number) => {
          // Report 0 while the total size is unknown to avoid dividing by zero.
          const progress = total > 0 ? loaded / total : 0;
          this.config.onModelProgress(progress);
        }
      );
      this.isModelLoaded = true;
      console.log('Vosk model loaded successfully');
    } catch (e) {
      console.error('Failed to load Vosk model:', e);
      throw new Error(`Failed to load Vosk model: ${e}`);
    }
  }

  /**
   * Transcribes audio and returns word-level alignment.
   *
   * The model is loaded on demand. Audio is resampled to the configured rate
   * when `originalSampleRate` differs, then fed to a fresh recognizer in
   * 4000-sample chunks. The recognizer is always released, even when the
   * recognizer or a user callback throws.
   *
   * @param audioSamples Mono audio samples.
   * @param originalSampleRate Sample rate of `audioSamples`, in Hz.
   * @returns Recognized text, per-word timings, and speech bounds in ms.
   * @throws Error if the model cannot be loaded.
   * @throws RangeError if resampling is needed and a sample rate is not a
   *   positive finite number.
   */
  async processAudio(audioSamples: Float32Array, originalSampleRate: number): Promise<VoskResult> {
    const startTime = performance.now();
    if (!this.isModelLoaded || !this.model) {
      await this.loadModel();
    }
    if (!window.Vosk || !this.model) {
      throw new Error('Vosk model not loaded');
    }

    const targetRate = this.config.sampleRate;
    const samples =
      originalSampleRate !== targetRate
        ? this.resample(audioSamples, originalSampleRate, targetRate)
        : audioSamples;

    const recognizer = window.Vosk.createRecognizer(this.model, targetRate);
    this.recognizer = recognizer;
    let finalResult: ReturnType<VoskRecognizer['finalResult']>;
    try {
      const chunkSize = 4000;
      for (let offset = 0; offset < samples.length; offset += chunkSize) {
        recognizer.acceptWaveform(samples.subarray(offset, offset + chunkSize));
        const partial = recognizer.partialResult();
        if (partial.partial) {
          this.config.onPartialResult(partial.partial);
        }
      }
      finalResult = recognizer.finalResult();
    } finally {
      // Free only while this call still owns the recognizer: dispose() may
      // already have released it from inside a callback.
      if (this.recognizer === recognizer) {
        recognizer.free();
        this.recognizer = null;
      }
    }

    const words: WordAlignment[] = (finalResult.result ?? []).map((w) => ({
      word: w.word,
      start: w.start,
      end: w.end,
      confidence: w.conf,
    }));

    // With no words, speech is taken to span the whole buffer.
    let speechStartMs = 0;
    let speechEndMs = (samples.length / targetRate) * 1000;
    const firstWord = words[0];
    const lastWord = words[words.length - 1];
    if (firstWord && lastWord) {
      speechStartMs = firstWord.start * 1000;
      speechEndMs = lastWord.end * 1000;
    }

    const processingTimeMs = performance.now() - startTime;
    const averageConfidence =
      words.length > 0
        ? words.reduce((sum, w) => sum + w.confidence, 0) / words.length
        : 0;
    this.config.onDebug({
      modelLoaded: this.isModelLoaded,
      processingTimeMs,
      wordsDetected: words.length,
      averageConfidence,
    });

    return {
      text: finalResult.text || '',
      words,
      speechStartMs,
      speechEndMs,
      speechDurationMs: speechEndMs - speechStartMs,
    };
  }

  /**
   * Recognizes the audio and compares the transcript with a known text.
   *
   * Currently this only logs a warning when the word-set similarity is below
   * 0.7; the recognition result is returned unchanged either way.
   *
   * @param audioSamples Mono audio samples.
   * @param originalSampleRate Sample rate of `audioSamples`, in Hz.
   * @param expectedText The text that is expected to have been spoken.
   */
  async forceAlign(
    audioSamples: Float32Array,
    originalSampleRate: number,
    expectedText: string
  ): Promise<VoskResult> {
    const result = await this.processAudio(audioSamples, originalSampleRate);

    const similarity = this.calculateSimilarity(
      this.tokenize(expectedText),
      this.tokenize(result.text)
    );
    if (similarity < 0.7) {
      console.warn(`Low alignment confidence: ${similarity.toFixed(2)}`);
      // Interpolating timings from the expected text could be attempted here;
      // for now the original result is returned.
    }
    return result;
  }

  /**
   * Lower-cases `text` and splits it on whitespace, dropping the empty tokens
   * that leading/trailing whitespace or an empty string would otherwise
   * produce (they would distort the similarity score).
   */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0);
  }

  /**
   * Resamples audio by linear interpolation.
   *
   * @returns `samples` itself when the rates are equal, otherwise a new
   *   buffer of `floor(samples.length * toRate / fromRate)` samples.
   * @throws RangeError if the rates differ and either is not a positive
   *   finite number.
   */
  private resample(samples: Float32Array, fromRate: number, toRate: number): Float32Array {
    if (fromRate === toRate) return samples;
    const ratesValid =
      Number.isFinite(fromRate) && Number.isFinite(toRate) && fromRate > 0 && toRate > 0;
    if (!ratesValid) {
      throw new RangeError(`Invalid sample rate conversion: ${fromRate} Hz -> ${toRate} Hz`);
    }

    const ratio = fromRate / toRate;
    const newLength = Math.floor(samples.length / ratio);
    const result = new Float32Array(newLength);
    for (let i = 0; i < newLength; i++) {
      const position = i * ratio;
      const lower = Math.floor(position);
      // Clamp so the final output sample never reads past the input.
      const upper = Math.min(lower + 1, samples.length - 1);
      const t = position - lower;
      result[i] = samples[lower] * (1 - t) + samples[upper] * t;
    }
    return result;
  }

  /**
   * Jaccard similarity of two word lists, treated as sets.
   *
   * @returns A value in [0, 1]; two empty lists count as identical (1).
   */
  private calculateSimilarity(words1: string[], words2: string[]): number {
    const set1 = new Set(words1);
    const set2 = new Set(words2);
    const union = new Set([...set1, ...set2]);
    if (union.size === 0) return 1;
    let shared = 0;
    for (const word of set1) {
      if (set2.has(word)) shared++;
    }
    return shared / union.size;
  }

  /** Reports whether the model has finished loading. */
  isReady(): boolean {
    return this.isModelLoaded;
  }

  /**
   * Releases the active recognizer, if any, and drops the model reference.
   * The engine can be used again afterwards; the model is then reloaded on
   * demand.
   */
  dispose(): void {
    if (this.recognizer) {
      this.recognizer.free();
      this.recognizer = null;
    }
    this.model = null;
    this.isModelLoaded = false;
    this.modelLoad = null;
  }
}
// Also expose the engine as the module's default export, alongside the named export.
export default VoskEngine;