| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import { VoskEngine, VoskResult, WordAlignment } from './VoskEngine'; |
|
|
/**
 * Tunable settings for {@link SyncEngine}.
 *
 * Every field may be omitted when constructing the engine; the defaults quoted
 * below are the ones the `SyncEngine` constructor fills in.
 */
export interface SyncConfig {
  /** Sample rate (Hz) of the audio buffers handed to the engine. Default 24000. */
  audioSampleRate: number;

  /** Nominal frame rate of the source video, in frames per second. Default 25. */
  videoFps: number;

  /** RMS level a window of audio samples must exceed to count as speech. Default 0.02. */
  audioThreshold: number;

  /**
   * Mean luminance difference between consecutive frames (normalised by 255)
   * that must be exceeded to count as motion. Default 0.05.
   */
  videoThreshold: number;

  /** Length, in milliseconds, of each audio window used for the RMS scan. Default 50. */
  analysisWindowMs: number;

  /** When true the engine starts loading a Vosk model on construction. Default false. */
  useVosk: boolean;

  /** Model location forwarded to `VoskEngine` as `modelUrl`. */
  voskModelUrl?: string;

  /** Invoked by `calculateSync` with the diagnostics of each run. */
  onDebug?: (info: SyncDebugInfo) => void;
}
|
|
/**
 * Diagnostics describing one `calculateSync` run.
 *
 * The measurement fields are typed as nullable, although `calculateSync`
 * always fills them with numbers.
 */
export interface SyncDebugInfo {
  /** Detected start of speech in the audio, in milliseconds. */
  audioStartMs: number | null;
  /** Detected end of speech in the audio, in milliseconds. */
  audioEndMs: number | null;
  /** Length of the detected speech span, in milliseconds. */
  audioSpeechDuration: number | null;
  /** Index of the first frame of the detected motion span. */
  videoStartFrame: number | null;
  /** Index of the last frame of the detected motion span. */
  videoEndFrame: number | null;
  /** Number of frames in the detected motion span (inclusive of both ends). */
  videoSpeechFrames: number | null;
  /** Playback rate chosen for the video after clamping. */
  adjustedFps: number;
  /**
   * How the video is being retimed. `calculateSync` reports `'stretch'` when
   * the audio speech is longer than the video motion, `'skip'` when it is
   * shorter, and `'normal'` when they are within tolerance; it never emits
   * `'repeat'`.
   */
  syncStrategy: 'stretch' | 'skip' | 'repeat' | 'normal';

  /** Raw Vosk output, present only when Vosk produced a result for this run. */
  voskResult?: VoskResult;

  /** The `words` of the Vosk result, when one is available. */
  wordAlignments?: WordAlignment[];
}
|
|
/** Outcome of `SyncEngine.calculateSync`: how to play the frames against the audio. */
export interface SyncResult {
  /** Frame rate to play the video at, clamped to the range 10–60 fps. */
  adjustedFps: number;

  /**
   * Milliseconds by which the audio speech onset trails the video motion
   * onset; 0 when the audio does not start later than the video.
   */
  videoDelayMs: number;

  /**
   * Milliseconds by which the video motion onset trails (or equals) the audio
   * speech onset; 0 when the audio starts later than the video.
   */
  audioDelayMs: number;

  /** Number of frames that precede the detected motion span. */
  skipFramesStart: number;

  /** Number of frames that follow the detected motion span. */
  skipFramesEnd: number;

  /** For each output frame position, the index of the source frame to show. */
  frameMap: number[];

  /** Diagnostics for this run (the same object passed to `onDebug`). */
  debug: SyncDebugInfo;
}
|
|
/**
 * Works out how a sequence of video frames should be retimed so that its
 * visible motion lines up with the speech found in an audio buffer.
 *
 * Speech is located either with Vosk (when enabled and loaded) or with a
 * simple windowed-RMS scan; motion is located from luminance differences
 * between consecutive down-scaled frames.
 */
export class SyncEngine {
  /** Lower bound applied to every computed playback rate. */
  private static readonly MIN_FPS = 10;
  /** Upper bound applied to every computed playback rate. */
  private static readonly MAX_FPS = 60;
  /** Duration ratios within this distance of 1 are treated as already in sync. */
  private static readonly RATIO_TOLERANCE = 0.05;

  private config: SyncConfig;
  private voskEngine: VoskEngine | null = null;
  private voskReady: boolean = false;
  /** Pending (or finished) model load; lets `waitForVosk` await instead of polling. */
  private voskInit: Promise<void> | null = null;

  /**
   * @param config Partial settings; anything omitted falls back to the
   *   defaults below. When `useVosk` is true, model loading starts
   *   immediately in the background.
   */
  constructor(config: Partial<SyncConfig> = {}) {
    this.config = {
      audioSampleRate: config.audioSampleRate ?? 24000,
      videoFps: config.videoFps ?? 25,
      audioThreshold: config.audioThreshold ?? 0.02,
      videoThreshold: config.videoThreshold ?? 0.05,
      analysisWindowMs: config.analysisWindowMs ?? 50,
      useVosk: config.useVosk ?? false,
      voskModelUrl: config.voskModelUrl,
      onDebug: config.onDebug,
    };

    if (this.config.useVosk) {
      // initVosk handles its own failures, so this promise never rejects.
      this.voskInit = this.initVosk();
    }
  }

  /**
   * Creates the Vosk engine and loads its model. Any failure is logged and
   * leaves the engine in the "not ready" state so callers fall back to the
   * RMS-based analysis.
   */
  private async initVosk(): Promise<void> {
    try {
      this.voskEngine = new VoskEngine({
        modelUrl: this.config.voskModelUrl,
        sampleRate: 16000,
        onModelProgress: (progress: number) => {
          console.log(`Vosk model loading: ${(progress * 100).toFixed(1)}%`);
        },
      });

      await this.voskEngine.loadModel();
      this.voskReady = true;
      console.log('Vosk engine ready');
    } catch (e) {
      console.warn('Failed to initialize Vosk, falling back to VAD:', e);
      this.voskEngine = null;
      this.voskReady = false;
    }
  }

  /**
   * Waits until the Vosk model has finished loading.
   *
   * Resolves as soon as loading settles (successfully or not) or when the
   * timeout elapses, whichever comes first.
   *
   * @param timeoutMs Maximum time to wait, in milliseconds.
   * @returns `true` when Vosk is ready; `false` when Vosk is disabled, failed
   *   to load, or did not become ready in time.
   */
  async waitForVosk(timeoutMs: number = 30000): Promise<boolean> {
    if (!this.config.useVosk) return false;
    if (this.voskReady) return true;

    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<void>((resolve) => {
      timer = setTimeout(resolve, timeoutMs);
    });

    try {
      await Promise.race([this.voskInit ?? Promise.resolve(), timeout]);
    } finally {
      // Do not leave a pending timer behind once loading has settled.
      clearTimeout(timer);
    }

    return this.voskReady;
  }

  /**
   * Runs the audio through Vosk.
   *
   * @param audioSamples Audio at the configured `audioSampleRate`.
   * @returns The Vosk result, or `null` when Vosk is not ready or processing
   *   throws (the error is logged).
   */
  async analyzeAudioWithVosk(audioSamples: Float32Array): Promise<VoskResult | null> {
    if (!this.voskEngine || !this.voskReady) {
      return null;
    }

    try {
      return await this.voskEngine.processAudio(audioSamples, this.config.audioSampleRate);
    } catch (e) {
      console.error('Vosk analysis failed:', e);
      return null;
    }
  }

  /**
   * Locates the speech span in an audio buffer with a windowed RMS scan.
   *
   * The buffer is scanned forwards for the first window whose RMS exceeds
   * `audioThreshold`, and backwards for the last one; one window of padding
   * is kept on each side. When nothing exceeds the threshold the whole
   * buffer is reported.
   *
   * @param audioSamples Audio at the configured `audioSampleRate`.
   * @returns Start, end and duration of the detected span in milliseconds.
   */
  analyzeAudio(audioSamples: Float32Array): { startMs: number; endMs: number; durationMs: number } {
    const sampleRate = this.config.audioSampleRate;
    // Never let the window collapse to 0 samples: the scans step by this
    // amount and would otherwise loop forever.
    const windowSize = Math.max(1, Math.floor((sampleRate * this.config.analysisWindowMs) / 1000));
    const threshold = this.config.audioThreshold;
    const total = audioSamples.length;

    let startSample = 0;
    let endSample = total;

    // Forward scan over every full window, including the last aligned one.
    for (let i = 0; i + windowSize <= total; i += windowSize) {
      if (this.calculateRMS(audioSamples, i, windowSize) > threshold) {
        // Keep one window of lead-in before the first loud window.
        startSample = Math.max(0, i - windowSize);
        break;
      }
    }

    // Backward scan, anchored at the end of the buffer.
    for (let i = total - windowSize; i >= startSample; i -= windowSize) {
      if (this.calculateRMS(audioSamples, i, windowSize) > threshold) {
        // Keep one window of tail after the last loud window.
        endSample = Math.min(total, i + windowSize * 2);
        break;
      }
    }

    const startMs = (startSample / sampleRate) * 1000;
    const endMs = (endSample / sampleRate) * 1000;

    return {
      startMs,
      endMs,
      durationMs: endMs - startMs,
    };
  }

  /**
   * Locates the span of frames that contains visible motion.
   *
   * Each frame is drawn onto a 64×64 off-screen canvas and compared with the
   * previous one by mean absolute luminance difference (normalised by 255).
   * With fewer than two frames no comparison is possible and the whole input
   * is reported.
   *
   * @param frames Decoded frames in playback order.
   * @returns First and last frame index of the motion span and its length.
   * @throws Error when a 2D context cannot be obtained from the canvas.
   */
  async analyzeVideo(frames: HTMLImageElement[] | ImageBitmap[]): Promise<{ startFrame: number; endFrame: number; speechFrames: number }> {
    if (frames.length < 2) {
      return { startFrame: 0, endFrame: frames.length - 1, speechFrames: frames.length };
    }

    const size = 64;
    const canvas = new OffscreenCanvas(size, size);
    const ctx = canvas.getContext('2d');
    if (!ctx) {
      throw new Error('SyncEngine: 2D canvas context is unavailable');
    }

    const differences: number[] = [];
    let prevData: Uint8ClampedArray | null = null;

    for (const frame of frames) {
      // Clear first so a frame with transparency is not blended with the previous one.
      ctx.clearRect(0, 0, size, size);
      ctx.drawImage(frame, 0, 0, size, size);
      // getImageData hands back a fresh buffer each call, so it can be kept as-is.
      const data = ctx.getImageData(0, 0, size, size).data;

      if (prevData) {
        let diff = 0;
        // RGBA layout: step four bytes per pixel and average R, G, B.
        for (let i = 0; i < data.length; i += 4) {
          const lum1 = (prevData[i] + prevData[i + 1] + prevData[i + 2]) / 3;
          const lum2 = (data[i] + data[i + 1] + data[i + 2]) / 3;
          diff += Math.abs(lum1 - lum2);
        }
        differences.push(diff / (data.length / 4) / 255);
      }

      prevData = data;
    }

    return this.locateMotionRange(differences, frames.length);
  }

  /**
   * Turns per-transition difference scores into a frame range.
   *
   * `differences[i]` is the score between frame `i` and frame `i + 1`; the
   * range runs from the frame before the first above-threshold transition to
   * the frame after the last one. Without any such transition the full range
   * `0 .. frameCount - 1` is returned.
   */
  private locateMotionRange(
    differences: number[],
    frameCount: number
  ): { startFrame: number; endFrame: number; speechFrames: number } {
    const threshold = this.config.videoThreshold;
    let startFrame = 0;
    let endFrame = frameCount - 1;

    const first = differences.findIndex((d) => d > threshold);
    if (first !== -1) {
      startFrame = first;
      for (let i = differences.length - 1; i >= startFrame; i--) {
        if (differences[i] > threshold) {
          endFrame = i + 1;
          break;
        }
      }
    }

    return {
      startFrame,
      endFrame,
      speechFrames: endFrame - startFrame + 1,
    };
  }

  /**
   * Computes retiming parameters that align the video motion with the audio speech.
   *
   * Uses Vosk timings when Vosk is enabled, ready and returned at least one
   * word; otherwise falls back to {@link analyzeAudio}. The ratio of audio
   * speech duration to video motion duration drives both `adjustedFps` and
   * the frame map. A ratio that is not a positive finite number (for
   * example no frames, or no detected speech) is treated as 1, i.e. no
   * retiming.
   *
   * @param audioSamples Audio at the configured `audioSampleRate`.
   * @param frames Decoded video frames in playback order.
   * @param totalAudioDurationMs Currently unused; kept so existing callers keep working.
   * @returns Playback rate, start offsets, trim counts, frame map and diagnostics.
   */
  async calculateSync(
    audioSamples: Float32Array,
    frames: HTMLImageElement[] | ImageBitmap[],
    totalAudioDurationMs: number
  ): Promise<SyncResult> {
    let voskResult: VoskResult | null = null;
    let audioAnalysis: { startMs: number; endMs: number; durationMs: number };

    if (this.config.useVosk && this.voskReady) {
      voskResult = await this.analyzeAudioWithVosk(audioSamples);
    }

    if (voskResult && voskResult.words.length > 0) {
      audioAnalysis = {
        startMs: voskResult.speechStartMs,
        endMs: voskResult.speechEndMs,
        durationMs: voskResult.speechDurationMs,
      };
      console.log('Using Vosk for audio analysis:', voskResult.text);
    } else {
      audioAnalysis = this.analyzeAudio(audioSamples);
    }

    const videoAnalysis = await this.analyzeVideo(frames);

    const totalFrames = frames.length;
    const originalFps = this.config.videoFps;

    const audioSpeechDurationMs = audioAnalysis.durationMs;
    const videoSpeechDurationMs = (videoAnalysis.speechFrames / originalFps) * 1000;

    // >1: audio speech outlasts the video motion; <1: the video is the longer one.
    const speedRatio = SyncEngine.sanitizeRatio(audioSpeechDurationMs / videoSpeechDurationMs);

    let syncStrategy: 'stretch' | 'skip' | 'repeat' | 'normal' = 'normal';
    let adjustedFps = originalFps;

    if (Math.abs(speedRatio - 1) > SyncEngine.RATIO_TOLERANCE) {
      syncStrategy = speedRatio > 1 ? 'stretch' : 'skip';
      adjustedFps = originalFps / speedRatio;
    }

    adjustedFps = SyncEngine.clampFps(adjustedFps);

    const videoStartMs = (videoAnalysis.startFrame / originalFps) * 1000;
    const audioStartMs = audioAnalysis.startMs;

    let videoDelayMs = 0;
    let audioDelayMs = 0;

    // Hold back whichever stream reaches its onset first.
    if (audioStartMs > videoStartMs) {
      videoDelayMs = audioStartMs - videoStartMs;
    } else {
      audioDelayMs = videoStartMs - audioStartMs;
    }

    const frameMap = this.createFrameMap(
      totalFrames,
      videoAnalysis.startFrame,
      videoAnalysis.endFrame,
      speedRatio
    );

    const debug: SyncDebugInfo = {
      audioStartMs: audioAnalysis.startMs,
      audioEndMs: audioAnalysis.endMs,
      audioSpeechDuration: audioAnalysis.durationMs,
      videoStartFrame: videoAnalysis.startFrame,
      videoEndFrame: videoAnalysis.endFrame,
      videoSpeechFrames: videoAnalysis.speechFrames,
      adjustedFps,
      syncStrategy,
      voskResult: voskResult ?? undefined,
      wordAlignments: voskResult?.words,
    };

    this.config.onDebug?.(debug);

    return {
      adjustedFps,
      videoDelayMs,
      audioDelayMs,
      skipFramesStart: videoAnalysis.startFrame,
      skipFramesEnd: totalFrames - videoAnalysis.endFrame - 1,
      frameMap,
      debug,
    };
  }

  /**
   * Builds the output-position → source-frame table.
   *
   * Frames before `startFrame` and after `endFrame` are passed through
   * unchanged. The span in between is resampled to
   * `round(spanLength * speedRatio)` entries when the ratio is outside the
   * tolerance band, repeating or dropping source frames as required.
   *
   * @param totalFrames Number of source frames.
   * @param startFrame First frame of the span to resample.
   * @param endFrame Last frame of the span to resample (inclusive).
   * @param speedRatio Target length relative to the span; values that are not
   *   positive and finite are treated as 1.
   */
  private createFrameMap(
    totalFrames: number,
    startFrame: number,
    endFrame: number,
    speedRatio: number
  ): number[] {
    const ratio = SyncEngine.sanitizeRatio(speedRatio);
    const speechFrames = endFrame - startFrame + 1;
    const targetFrames = Math.round(speechFrames * ratio);

    const frameMap: number[] = [];

    for (let i = 0; i < startFrame; i++) {
      frameMap.push(i);
    }

    if (Math.abs(ratio - 1) > SyncEngine.RATIO_TOLERANCE) {
      for (let i = 0; i < targetFrames; i++) {
        // Multiply before dividing so exact multiples are not lost to float rounding.
        const sourceFrame = startFrame + Math.floor((i * speechFrames) / targetFrames);
        frameMap.push(Math.min(sourceFrame, endFrame));
      }
    } else {
      for (let i = startFrame; i <= endFrame; i++) {
        frameMap.push(i);
      }
    }

    for (let i = endFrame + 1; i < totalFrames; i++) {
      frameMap.push(i);
    }

    return frameMap;
  }

  /**
   * Root-mean-square of `samples[start .. start + length)`, truncated at the
   * end of the buffer. Returns 0 for an empty window.
   */
  private calculateRMS(samples: Float32Array, start: number, length: number): number {
    const end = Math.min(start + length, samples.length);
    if (end <= start) {
      return 0;
    }

    let sum = 0;
    for (let i = start; i < end; i++) {
      sum += samples[i] * samples[i];
    }

    return Math.sqrt(sum / (end - start));
  }

  /**
   * Picks a frame rate that makes the whole video last as long as the whole
   * audio, without any content analysis.
   *
   * When either duration is zero (or the ratio is otherwise not a positive
   * finite number) the configured `videoFps` is kept.
   *
   * @param totalAudioDurationMs Length of the audio in milliseconds.
   * @param totalVideoFrames Number of video frames.
   * @returns The clamped frame rate and the matching per-frame interval in milliseconds.
   */
  calculateSimpleSync(
    totalAudioDurationMs: number,
    totalVideoFrames: number
  ): { adjustedFps: number; frameInterval: number } {
    const originalFps = this.config.videoFps;
    const videoDurationMs = (totalVideoFrames / originalFps) * 1000;

    const speedRatio = SyncEngine.sanitizeRatio(totalAudioDurationMs / videoDurationMs);
    const adjustedFps = SyncEngine.clampFps(originalFps / speedRatio);

    return {
      adjustedFps,
      frameInterval: 1000 / adjustedFps,
    };
  }

  /** Restricts a frame rate to the supported playback range. */
  private static clampFps(fps: number): number {
    return Math.max(SyncEngine.MIN_FPS, Math.min(SyncEngine.MAX_FPS, fps));
  }

  /** Maps NaN, infinite, zero and negative ratios to 1 ("leave timing alone"). */
  private static sanitizeRatio(ratio: number): number {
    return Number.isFinite(ratio) && ratio > 0 ? ratio : 1;
  }
}
|
|
| export default SyncEngine; |
|
|