import {
  AutoTokenizer,
  AutoProcessor,
  ClapTextModelWithProjection,
  ClapAudioModelWithProjection,
} from '@xenova/transformers';

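/**
 * Zero-shot audio tagging with CLAP (Contrastive Language-Audio Pretraining)
 * via Transformers.js: the audio clip and each candidate label are projected
 * into a shared embedding space, and labels are ranked by cosine similarity
 * against the audio embedding.
 */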
class CLAPProcessor {
  constructor() {
    this.processor = null;  // audio feature extractor
    this.model = null;      // audio tower with projection head
    this.tokenizer = null;  // text tokenizer
    this.textModel = null;  // text tower with projection head
    this.defaultLabels = [
      'speech', 'music', 'singing', 'guitar', 'piano', 'drums', 'violin',
      'trumpet', 'saxophone', 'flute', 'classical music', 'rock music',
      'pop music', 'jazz', 'electronic music', 'ambient', 'nature sounds',
      'rain', 'wind', 'ocean waves', 'birds chirping', 'dog barking',
      'cat meowing', 'car engine', 'traffic', 'footsteps', 'door closing',
      'applause', 'laughter', 'crying', 'coughing', 'sneezing',
      'telephone ringing', 'alarm clock', 'typing', 'water running',
      'fire crackling', 'thunder', 'helicopter', 'airplane', 'train',
      'motorcycle', 'bell ringing', 'whistle', 'horn', 'siren',
      'explosion', 'gunshot', 'silence', 'noise', 'distortion'
    ];
  }

  async initialize() {
    if (this.model && this.processor && this.textModel && this.tokenizer) return;

    try {
      this.processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
      this.model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
      this.tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused');
      this.textModel = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');

      console.log('CLAP model loaded successfully');
    } catch (error) {
      console.error('Failed to load CLAP model:', error);
      throw error;
    }
  }

  async processAudio(audioBuffer) {
    if (!this.model || !this.processor || !this.textModel || !this.tokenizer) {
      await this.initialize();
    }

    try {
      // Downmix to mono and resample to the 48 kHz input CLAP expects.
      const audio = this.preprocessAudio(audioBuffer);

      // Embed the audio clip.
      const audioInputs = await this.processor(audio);
      const { audio_embeds: audioFeatures } = await this.model(audioInputs);

      // Embed the candidate labels.
      const textInputs = this.tokenizer(this.defaultLabels, { padding: true, truncation: true });
      const { text_embeds: textFeatures } = await this.textModel(textInputs);

      // Rank labels by cosine similarity against the audio embedding.
      const similarities = this.calculateSimilarities(audioFeatures, textFeatures);

      return this.getTopTags(similarities, 5);
    } catch (error) {
      console.error('Error processing audio:', error);
      throw error;
    }
  }

  preprocessAudio(audioBuffer) {
    // Downmix multi-channel audio to mono by averaging the channels.
    let audioData;
    if (audioBuffer.numberOfChannels > 1) {
      audioData = new Float32Array(audioBuffer.length);
      for (let i = 0; i < audioBuffer.length; i++) {
        let sum = 0;
        for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
          sum += audioBuffer.getChannelData(channel)[i];
        }
        audioData[i] = sum / audioBuffer.numberOfChannels;
      }
    } else {
      audioData = audioBuffer.getChannelData(0);
    }

    // CLAP's feature extractor expects 48 kHz input.
    const targetSampleRate = 48000;
    if (audioBuffer.sampleRate !== targetSampleRate) {
      audioData = this.resampleAudio(audioData, audioBuffer.sampleRate, targetSampleRate);
    }

    return audioData;
  }

  resampleAudio(audioData, originalRate, targetRate) {
    // Simple linear-interpolation resampler; adequate for tagging, though an
    // OfflineAudioContext would give higher-quality resampling.
    const ratio = originalRate / targetRate;
    const newLength = Math.round(audioData.length / ratio);
    const resampled = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
      const originalIndex = i * ratio;
      const indexFloor = Math.floor(originalIndex);
      const indexCeil = Math.min(indexFloor + 1, audioData.length - 1);
      const fraction = originalIndex - indexFloor;

      resampled[i] = audioData[indexFloor] * (1 - fraction) + audioData[indexCeil] * fraction;
    }

    return resampled;
  }

  calculateSimilarities(audioFeatures, textFeatures) {
    // Audio and text projections share the same embedding dimension in CLAP,
    // so the flat text buffer can be sliced into audio-vector-sized chunks.
    const audioVector = audioFeatures.data;
    const similarities = [];

    for (let i = 0; i < this.defaultLabels.length; i++) {
      const textVector = textFeatures.data.slice(
        i * audioVector.length,
        (i + 1) * audioVector.length
      );

      similarities.push(this.cosineSimilarity(audioVector, textVector));
    }

    return similarities;
  }

  cosineSimilarity(vecA, vecB) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < vecA.length; i++) {
      dotProduct += vecA[i] * vecB[i];
      normA += vecA[i] * vecA[i];
      normB += vecB[i] * vecB[i];
    }

    // Guard against division by zero for all-zero vectors.
    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB) || 1e-12);
  }

  getTopTags(similarities, topK = 5) {
    // Clamp negative similarities to zero so "confidence" stays in [0, 1].
    const tagged = this.defaultLabels.map((label, index) => ({
      label,
      confidence: Math.max(0, similarities[index])
    }));

    return tagged
      .sort((a, b) => b.confidence - a.confidence)
      .slice(0, topK);
  }

  // Decode a File/Blob into an AudioBuffer using the Web Audio API.
  async fileToAudioBuffer(file) {
    const arrayBuffer = await file.arrayBuffer();
    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
    try {
      return await audioContext.decodeAudioData(arrayBuffer);
    } finally {
      audioContext.close(); // release the audio context once decoding is done
    }
  }
}

export default CLAPProcessor;
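
// Example usage (illustrative sketch; assumes a browser context and an
// <input type="file"> element with id "audio-input" — adjust as needed):
//
//   const clap = new CLAPProcessor();
//   const file = document.getElementById('audio-input').files[0];
//   const buffer = await clap.fileToAudioBuffer(file);
//   const tags = await clap.processAudio(buffer);
//   console.log(tags); // e.g. [{ label: 'music', confidence: 0.42 }, ...]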