gaunernst
/

vit_base_patch16_1024_128.audiomae_as2m

Model card · Files and versions

gaunernst committed on Apr 25, 2025

Commit

a1e9f8f

·

verified ·

1 Parent(s): b0f2ea3

Update README.md

Files changed (1) hide show

README.md +7 -1

README.md CHANGED Viewed

@@ -30,7 +30,7 @@ import torch.nn.functional as F
 from torchaudio.compliance import kaldi
 # for fine-tuning, you can pass `num_classes={your number of classes}`
-model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft", pretrained=True)
 model = model.eval()
 MEAN = -4.2677393
@@ -48,6 +48,12 @@ melspec = (melspec - MEAN) / (STD * 2)
 melspec = melspec.view(1, 1, 1024, 128)  # add batch dim and channel dim
 output = model(melspec)  # embeddings with shape (1, 768)
 ```
 ## Citation

 from torchaudio.compliance import kaldi
 # for fine-tuning, you can pass `num_classes={your number of classes}`
+model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m", pretrained=True)
 model = model.eval()
 MEAN = -4.2677393
 melspec = melspec.view(1, 1, 1024, 128)  # add batch dim and channel dim
 output = model(melspec)  # embeddings with shape (1, 768)
+# to get frame-level embeddings
+output = model.forward_features(melspec)  # shape (1, 513, 768)
+output = output[:, 1:]  # remove [CLS] token
+output = output.unflatten(1, (1024 // 16, 128 // 16))  # (1, 64, 8, 768) -> 2D patches
+output = output.mean(2)  # (1, 64, 768) -> mean pooling across mel dimension
 ```
 ## Citation