nixiieee/dusha_balanced
Viewer • Updated • 44.1k • 42 • 2
How to use nixiieee/whisper-small-emotion-classifier-dusha with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("audio-classification", model="nixiieee/whisper-small-emotion-classifier-dusha")

# Load model directly
# NOTE(review): WhisperForEmotionClassification is declared as a custom class
# further down in this file; confirm that it is really importable from
# `transformers` before relying on this import.
from transformers import AutoProcessor, WhisperForEmotionClassification

processor = AutoProcessor.from_pretrained("nixiieee/whisper-small-emotion-classifier-dusha")
model = WhisperForEmotionClassification.from_pretrained("nixiieee/whisper-small-emotion-classifier-dusha")

This model is a fine-tuned version of openai/whisper-small on an unknown dataset. It achieves the following results on the evaluation set:
The following hyperparameters were used during training:
| Training Loss | Epoch | Step | Validation Loss | Accuracy | Balanced Accuracy | Precision | Recall | F1 |
|---|---|---|---|---|---|---|---|---|
| 0.8545 | 1.0 | 4609 | 0.7419 | 0.7097 | 0.7426 | 0.7483 | 0.7426 | 0.7388 |
| 0.8001 | 2.0 | 9218 | 0.6393 | 0.7597 | 0.7931 | 0.7982 | 0.7931 | 0.7934 |
| 0.6171 | 3.0 | 13827 | 0.6245 | 0.7739 | 0.8024 | 0.8100 | 0.8024 | 0.8055 |
| 0.7518 | 4.0 | 18436 | 0.6152 | 0.7722 | 0.8055 | 0.8064 | 0.8055 | 0.8038 |
import torch
import torch.nn as nn
import torchaudio
from transformers import (
    AutoConfig,
    AutoProcessor,
    PreTrainedModel,
    WhisperForAudioClassification,
    WhisperModel,
    WhisperProcessor,
)
from transformers.modeling_outputs import SequenceClassifierOutput
class WhisperClassifier(nn.Module):
    """Pool a sequence of hidden states and classify the pooled vector.

    The sequence is averaged over its time axis (optionally restricted to the
    positions selected by ``attention_mask``), layer-normalised, passed through
    dropout and then through a three-layer MLP that ends in ``num_labels``
    logits.

    Args:
        hidden_size: Size of the last dimension of the incoming hidden states.
        num_labels: Number of output classes.
        dropout: Dropout probability used before and inside the MLP.
    """

    def __init__(self, hidden_size, num_labels=5, dropout=0.2):
        super().__init__()
        self.pool_norm = nn.LayerNorm(hidden_size)
        self.pre_dropout = nn.Dropout(dropout)
        # The MLP narrows hidden_size -> 1/2 -> 1/4, but never below a small
        # multiple of the number of classes.
        mid1 = max(hidden_size // 2, num_labels * 4)
        mid2 = max(hidden_size // 4, num_labels * 2)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, mid1),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(mid1),
            nn.Linear(mid1, mid2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(mid2),
            nn.Linear(mid2, num_labels),
        )

    def forward(self, hidden_states, attention_mask=None):
        """Return classification logits for ``hidden_states``.

        Args:
            hidden_states: Tensor indexed as (batch, time, hidden_size).
            attention_mask: Optional (batch, mask_time) tensor whose non-zero
                entries mark the positions to average over. ``mask_time`` may
                be longer than the time axis of ``hidden_states`` (e.g. a mask
                computed before a strided encoder); it is then subsampled with
                a fixed stride so that both lengths match.

        Returns:
            Tensor of shape (batch, num_labels) with unnormalised logits.

        Raises:
            ValueError: If ``attention_mask`` is shorter than the time axis of
                ``hidden_states`` or that axis is empty.
        """
        if attention_mask is None:
            pooled = hidden_states.mean(dim=1)
        else:
            seq_len = hidden_states.size(1)
            mask_len = attention_mask.size(1)
            if mask_len != seq_len:
                if seq_len == 0 or mask_len < seq_len:
                    raise ValueError(
                        f"attention_mask length {mask_len} cannot be aligned "
                        f"with {seq_len} hidden states"
                    )
                # The mask was built at a finer time resolution than the
                # hidden states: keep every `stride`-th entry, then trim any
                # remainder so the shapes broadcast.
                stride = mask_len // seq_len
                attention_mask = attention_mask[:, ::stride][:, :seq_len]
            # clamp guards against 0/0 (NaN logits) for fully-masked rows.
            lengths = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
            masked = hidden_states * attention_mask.unsqueeze(-1)
            pooled = masked.sum(dim=1) / lengths
        x = self.pool_norm(pooled)
        x = self.pre_dropout(x)
        return self.classifier(x)
class WhisperForEmotionClassification(PreTrainedModel):
    """Whisper encoder topped with a ``WhisperClassifier`` head.

    Args:
        config: Model configuration; ``config.hidden_size`` sets the width of
            the classification head.
        model_name: Checkpoint whose encoder is loaded as the backbone.
        num_labels: Number of classes predicted by the head.
        dropout: Dropout probability forwarded to the head.
    """

    config_class = AutoConfig

    def __init__(
        self, config, model_name="openai/whisper-small", num_labels=5, dropout=0.2
    ):
        super().__init__(config)
        # Only the encoder half of the pretrained checkpoint is kept.
        pretrained = WhisperModel.from_pretrained(model_name)
        self.encoder = pretrained.encoder
        self.classifier = WhisperClassifier(
            config.hidden_size, num_labels=num_labels, dropout=dropout
        )
        self.post_init()

    def forward(self, input_features, attention_mask=None, labels=None):
        """Encode ``input_features`` and classify the encoder output.

        Returns a ``SequenceClassifierOutput`` carrying the logits and, when
        ``labels`` is given, the cross-entropy loss (otherwise ``None``).
        """
        hidden_states = self.encoder(
            input_features=input_features,
            attention_mask=attention_mask,
            return_dict=True,
        ).last_hidden_state
        logits = self.classifier(hidden_states, attention_mask=attention_mask)

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, logits.size(-1))
            loss = criterion(flat_logits, labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)
# Class names, indexed by the position of the winning logit.
EMOTION_LABELS = ['neutral', 'angry', 'positive', 'sad', 'other']
# Sampling rate handed to the processor below.
TARGET_SAMPLE_RATE = 16000

model_name = "nixiieee/whisper-small-emotion-classifier-dusha"
processor = WhisperProcessor.from_pretrained("openai/whisper-small", return_attention_mask=True)
config = AutoConfig.from_pretrained(model_name)
model = WhisperForEmotionClassification.from_pretrained(model_name, num_labels=5, dropout=0.2)
model.eval()

# load audio
wav, sr = torchaudio.load("audio.wav")
# resample if necessary
if sr != TARGET_SAMPLE_RATE:
    wav = torchaudio.functional.resample(wav, sr, TARGET_SAMPLE_RATE)

# Only the first channel of the file is classified.
input_features = processor(wav[0], sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")

with torch.no_grad():
    # This is a classifier, not a generative model: call forward() directly
    # to obtain the SequenceClassifierOutput that carries `.logits`.
    outputs = model(**input_features)

pred = outputs.logits.argmax(dim=-1).item()
print("Predicted emotion:", EMOTION_LABELS[pred])
Base model
openai/whisper-small