Feature Extraction
Transformers
Safetensors
English
voiceclap-small
audio
speech
emotion
clap
contrastive
voice
custom_code
Instructions to use VoiceNet/voiceclap-small with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use VoiceNet/voiceclap-small with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="VoiceNet/voiceclap-small", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("VoiceNet/voiceclap-small", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| """VoiceCLAP-Small: dual-tower CLAP using BUD-E-Whisper-Small + MiniLM. | |
| Standalone single-file implementation. Only depends on PyTorch and | |
| HuggingFace `transformers` (for `BertModel`, `PreTrainedModel`, and | |
| `PretrainedConfig`). | |
| """ | |
| import math | |
| from typing import Optional | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from transformers import BertConfig, BertModel, PreTrainedModel | |
| try: | |
| from .configuration_voiceclap import VoiceCLAPSmallConfig | |
| except ImportError: | |
| from configuration_voiceclap import VoiceCLAPSmallConfig | |
class _LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the caller's dtype.

    The input is upcast to float32 for the normalization and the result is
    cast back to the input's original dtype, so reduced-precision activations
    do not lose accuracy inside the mean/variance computation.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` over ``self.normalized_shape``.

        Args:
            x: Input tensor of any floating dtype.

        Returns:
            The normalized tensor, with the same dtype as ``x``.
        """
        # Upcast the affine parameters together with the input. If the module
        # itself has been cast to fp16/bf16, passing float32 activations with
        # reduced-precision weights mixes dtypes inside ``F.layer_norm``, which
        # some backends reject. ``.float()`` is a no-op for float32 parameters,
        # so the float32 path is unchanged.
        weight = self.weight.float() if self.weight is not None else None
        bias = self.bias.float() if self.bias is not None else None
        normed = F.layer_norm(x.float(), self.normalized_shape, weight, bias, self.eps)
        return normed.type(x.dtype)
def _sinusoids(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    """Build a fixed sinusoidal positional-embedding table.

    The first ``channels // 2`` columns hold sines and the remaining columns
    hold cosines of the position scaled by geometrically spaced timescales
    ranging from 1 to ``max_timescale``.

    Args:
        length: Number of positions (rows).
        channels: Embedding width (columns); must be even.
        max_timescale: Largest timescale of the geometric progression.

    Returns:
        A float tensor of shape ``(length, channels)``.

    Raises:
        ValueError: If ``channels`` is odd.
    """
    # A real exception instead of ``assert`` so the check survives ``python -O``.
    if channels % 2 != 0:
        raise ValueError(f"channels must be even, got {channels}")
    half = channels // 2
    # With a single frequency (channels == 2) the divisor ``half - 1`` would be
    # zero; clamping it to 1 leaves every wider configuration unchanged and
    # yields the lone timescale 1 in the degenerate case.
    log_timescale_increment = math.log(max_timescale) / max(half - 1, 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(half))
    scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
class _MultiHeadAttention(nn.Module):
    """Unmasked multi-head self-attention over one sequence.

    Query, value and output projections carry a bias; the key projection does
    not. All four projections map ``n_state`` features to ``n_state`` features.
    """

    def __init__(self, n_state: int, n_head: int):
        """Create the four projections for ``n_head`` heads of width ``n_state // n_head``."""
        super().__init__()
        self.n_head = n_head
        self.query = nn.Linear(n_state, n_state)
        self.key = nn.Linear(n_state, n_state, bias=False)
        self.value = nn.Linear(n_state, n_state)
        self.out = nn.Linear(n_state, n_state)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend every position of ``x`` to every other position.

        Args:
            x: Tensor of shape ``(batch, seq, n_state)``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        heads = self.n_head

        def split_heads(projected: torch.Tensor) -> torch.Tensor:
            # (batch, seq, width) -> (batch, heads, seq, width // heads)
            batch, seq_len, width = projected.shape
            return projected.view(batch, seq_len, heads, width // heads).transpose(1, 2)

        attended = F.scaled_dot_product_attention(
            split_heads(self.query(x)),
            split_heads(self.key(x)),
            split_heads(self.value(x)),
        )
        # Fold the heads back into a single feature axis before the output projection.
        batch, _, seq_len, head_dim = attended.shape
        merged = attended.transpose(1, 2).reshape(batch, seq_len, heads * head_dim)
        return self.out(merged)
class _ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer layer: self-attention then a GELU MLP, each with a residual add."""

    def __init__(self, n_state: int, n_head: int):
        """Build the attention and MLP sub-layers; the MLP hidden width is ``4 * n_state``."""
        super().__init__()
        self.attn = _MultiHeadAttention(n_state, n_head)
        self.attn_ln = _LayerNorm(n_state)
        hidden_width = n_state * 4
        feed_forward = [
            nn.Linear(n_state, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_state),
        ]
        self.mlp = nn.Sequential(*feed_forward)
        self.mlp_ln = _LayerNorm(n_state)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
        # Each sub-layer sees the normalized input; the residual uses the raw one.
        attn_out = self.attn(self.attn_ln(x))
        x = x + attn_out
        mlp_out = self.mlp(self.mlp_ln(x))
        return x + mlp_out
class _WhisperAudioEncoder(nn.Module):
    """Whisper-style audio encoder operating on a precomputed log-mel spectrogram.

    Pipeline: two GELU-activated 1-D convolutions (the second with stride 2),
    additive sinusoidal positions, a stack of residual attention blocks,
    average pooling over time (kernel 2, stride 2), a final layer norm and a
    linear projection to ``output_dim``.
    """

    def __init__(
        self,
        n_mels: int = 80,
        n_ctx: int = 1500,
        n_state: int = 768,
        n_head: int = 12,
        n_layer: int = 12,
        output_dim: int = 768,
    ):
        """Create the encoder.

        Args:
            n_mels: Number of mel bins (input channels of the first convolution).
            n_ctx: Number of rows in the positional-embedding table.
            n_state: Transformer width.
            n_head: Attention heads per block.
            n_layer: Number of residual attention blocks.
            output_dim: Width of the per-frame output features.
        """
        super().__init__()
        self.conv1 = nn.Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        position_table = _sinusoids(n_ctx, n_state)
        self.register_buffer("positional_embedding", position_table)
        layers = [_ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        self.blocks = nn.ModuleList(layers)
        self.ln_post = _LayerNorm(n_state)
        self.avg_pooler = nn.AvgPool1d(kernel_size=2, stride=2)
        self.proj = nn.Linear(n_state, output_dim)

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Encode a log-mel spectrogram into per-frame features.

        Args:
            mel: Tensor of shape ``(B, n_mels, T_mel)``.

        Returns:
            Tensor of shape ``(B, T', output_dim)``; the time axis is shortened
            by the stride-2 convolution and again by the average pooling.
        """
        hidden = F.gelu(self.conv2(F.gelu(self.conv1(mel))))
        hidden = hidden.permute(0, 2, 1)  # (B, T', D)

        # Only the first T' rows of the table are used for this input.
        n_frames = hidden.size(1)
        positions = self.positional_embedding[:n_frames]
        hidden = hidden + positions.to(dtype=hidden.dtype, device=hidden.device)

        for layer in self.blocks:
            hidden = layer(hidden)

        # AvgPool1d pools the last axis, so swap time to the end and back.
        pooled = self.avg_pooler(hidden.permute(0, 2, 1)).permute(0, 2, 1)
        return self.proj(self.ln_post(pooled))
| class VoiceCLAPSmall(PreTrainedModel): | |
| config_class = VoiceCLAPSmallConfig | |
def __init__(self, config: VoiceCLAPSmallConfig):
    """Assemble the audio tower, the text tower and the shared scalars.

    Args:
        config: Model configuration supplying the audio-encoder sizes
            (``n_mels``, ``n_ctx``, ``n_state``, ``n_head``, ``n_layer``),
            the shared ``embed_dim`` and the ``text_*`` BERT settings.
    """
    super().__init__(config)
    embed_dim = config.embed_dim

    # --- Audio tower: Whisper-style encoder followed by a two-layer MLP head.
    self.audio_encoder = _WhisperAudioEncoder(
        n_mels=config.n_mels,
        n_ctx=config.n_ctx,
        n_state=config.n_state,
        n_head=config.n_head,
        n_layer=config.n_layer,
        output_dim=embed_dim,
    )
    self.audio_proj = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        nn.GELU(),
        nn.Linear(embed_dim, embed_dim),
    )

    # --- Text tower: BERT without its pooling layer, plus a bias-free MLP head.
    text_settings = dict(
        vocab_size=config.text_vocab_size,
        hidden_size=config.text_hidden_dim,
        num_hidden_layers=config.text_num_layers,
        num_attention_heads=config.text_num_heads,
        intermediate_size=config.text_intermediate_size,
        max_position_embeddings=config.text_max_position_embeddings,
        layer_norm_eps=config.text_layer_norm_eps,
        pad_token_id=config.text_pad_token_id,
    )
    self.text_encoder = BertModel(BertConfig(**text_settings), add_pooling_layer=False)
    proj_hidden = config.text_proj_hidden
    self.text_proj = nn.Sequential(
        nn.Linear(config.text_hidden_dim, proj_hidden, bias=False),
        nn.GELU(),
        nn.Linear(proj_hidden, embed_dim, bias=False),
    )

    # Scalar parameters, both initialised to zero.
    self.logit_scale = nn.Parameter(torch.zeros(()))
    self.logit_bias = nn.Parameter(torch.zeros(()))

    # Mel filterbank used by encode_waveform / compute_log_mel.
    # 80 mel bins x 201 freq bins for n_fft=400, sr=16000 (Whisper-style).
    # Registered as zeros here; presumably the real filterbank is loaded from
    # the checkpoint (the buffer is persistent) — confirm against the weights.
    placeholder_filters = torch.zeros(config.n_mels, 201)
    self.register_buffer("mel_filters", placeholder_filters, persistent=True)

    self.post_init()
def compute_log_mel(
    self, waveform: torch.Tensor, sample_rate: int = 16000
) -> torch.Tensor:
    """Compute a Whisper-style log-mel spectrogram from a 16 kHz waveform.

    The original author states that this matches the training-time
    preprocessing exactly, so the arithmetic below must not be reordered.

    Args:
        waveform: Audio samples of shape ``(B, T)`` or ``(T,)``.
        sample_rate: Sampling rate of ``waveform``; only 16000 is accepted.

    Returns:
        Tensor of shape ``(B, n_mels, T_mel)``.

    Raises:
        ValueError: If ``sample_rate`` is not 16000.
    """
    if sample_rate != 16000:
        raise ValueError(f"sample_rate must be 16000, got {sample_rate}")

    n_fft, hop = 400, 160
    filters = self.mel_filters
    target = filters.device

    # Promote a single clip to a batch of one, then move it next to the filterbank.
    audio = waveform.unsqueeze(0) if waveform.dim() == 1 else waveform
    audio = audio.to(device=target, dtype=torch.float32)

    spectrum = torch.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop,
        window=torch.hann_window(n_fft, device=target),
        return_complex=True,
    )
    # Drop the final STFT frame and take the power spectrum.
    power = spectrum[..., :-1].abs() ** 2
    mel_power = filters.to(power.dtype) @ power

    log_mel = torch.clamp(mel_power, min=1e-10).log10()
    # Limit the dynamic range to 8 log10 units below each clip's own peak.
    floor = log_mel.amax(dim=(-2, -1), keepdim=True) - 8.0
    log_mel = torch.maximum(log_mel, floor)
    return (log_mel + 4.0) / 4.0
def encode_waveform(self, waveform: torch.Tensor, sample_rate: int = 16000) -> torch.Tensor:
    """Embed a raw 16 kHz waveform.

    Convenience wrapper: the waveform is turned into a log-mel spectrogram by
    ``compute_log_mel`` and the result is passed to ``encode_audio``.
    """
    return self.encode_audio(self.compute_log_mel(waveform, sample_rate=sample_rate))
def encode_audio(self, mel: torch.Tensor) -> torch.Tensor:
    """Embed a log-mel spectrogram as one unit-length vector per clip.

    The per-frame encoder outputs are averaged over time, passed through the
    audio projection head and L2-normalized along the feature axis.
    """
    frame_feats = self.audio_encoder(mel)  # (B, T', D)
    clip_feats = torch.mean(frame_feats, dim=1)  # collapse time into one vector per clip
    projected = self.audio_proj(clip_feats)
    return F.normalize(projected, dim=-1)
| def encode_text( | |
| self, | |
| input_ids: torch.Tensor, | |
| attention_mask: Optional[torch.Tensor] = None, | |
| ) -> torch.Tensor: | |
| if attention_mask is None: | |
| attention_mask = (input_ids != self.config.text_pad_token_id).long() | |
| out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask) | |
| hidden = out.last_hidden_state # (B, T, H) | |
| mask = attention_mask.unsqueeze(-1).to(hidden.dtype) | |
| pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9) | |
| feats = self.text_proj(pooled) | |
| return F.normalize(feats, dim=-1) | |