Upload 7 files
added the modified files
- README.md +85 -0
- config.json +38 -0
- configuration_eat.py +58 -0
- eat_model.py +83 -0
- model.safetensors +3 -0
- model_core.py +294 -0
- modeling_eat.py +18 -0
README.md
ADDED
@@ -0,0 +1,85 @@
---
license: mit
tags:
- Audio
- SSL
- EAT
library_name: transformers
---

# EAT-base (Epoch 30, Pre-trained Checkpoint)

This is the **pre-trained EAT-base model** at epoch 30, trained on the AS-2M dataset using the EAT framework for audio self-supervised learning.
It offers efficient feature extraction and can also serve as a strong initialization for fine-tuning on a wide range of downstream audio understanding tasks such as classification and captioning.

For more details on the EAT framework, please refer to the [GitHub repository](https://github.com/cwx-worst-one/EAT) and our paper [EAT: Self-Supervised Pre-Training with Efficient Audio Transformer](https://arxiv.org/abs/2401.03497).

## 🔧 Usage

You can load and use the model for feature extraction directly via Hugging Face Transformers:

```python
import torchaudio
import torch
import soundfile as sf
import numpy as np
from transformers import AutoModel

model_id = "HTill/flexEAT-base_epoch30_pretrain"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval().cuda()

source_file = "/path/to/input.wav"
target_file = "/path/to/output.npy"
norm_mean = -4.268
norm_std = 4.569

# Load the (mono) audio file and resample to 16 kHz if needed
wav, sr = sf.read(source_file)
waveform = torch.tensor(wav).float().cuda()
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)

# Remove the DC offset and compute a 128-bin mel-spectrogram
waveform = waveform - waveform.mean()
mel = torchaudio.compliance.kaldi.fbank(
    waveform.unsqueeze(0),
    htk_compat=True,
    sample_frequency=16000,
    use_energy=False,
    window_type='hanning',
    num_mel_bins=128,
    dither=0.0,
    frame_shift=10
).unsqueeze(0)

# Normalize with the dataset-level statistics used during pre-training
mel = (mel - norm_mean) / (norm_std * 2)
mel = mel.unsqueeze(0).cuda()  # shape: [1, 1, T, F]

# Extract features
with torch.no_grad():
    feat = model.extract_features(mel)

feat = feat.squeeze(0).cpu().numpy()
np.save(target_file, feat)
print(f"Feature shape: {feat.shape}")
print(f"Saved to: {target_file}")
```

## 📌 Notes

The model supports both **frame-level** (~50 Hz) and **utterance-level** (CLS token) representations.
See the [feature extraction guide](https://github.com/cwx-worst-one/EAT/tree/main/feature_extract) for more instructions.

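Following the `encode` method in `eat_model.py`, the first token returned by `extract_features` is the prepended CLS token and the remaining tokens are patch features. A minimal, hedged sketch of how to separate the two from the `feat` array produced in the usage example above:

```python
# `feat` (after .squeeze(0)) has shape [N, 768]; index 0 is the CLS token,
# indices 1: are the patch tokens (roughly 50 per second of audio).
utterance_embedding = feat[0]           # utterance-level (CLS) vector, shape [768]
frame_embeddings = feat[1:]             # frame-level features, shape [N - 1, 768]
pooled = frame_embeddings.mean(axis=0)  # alternative utterance-level pooling
```
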
## 📚 Citation

If you find this model useful, please consider citing our [paper](https://arxiv.org/abs/2401.03497):

```bibtex
@article{chen2024eat,
  title={EAT: Self-supervised pre-training with efficient audio transformer},
  author={Chen, Wenxi and Liang, Yuzhe and Ma, Ziyang and Zheng, Zhisheng and Chen, Xie},
  journal={arXiv preprint arXiv:2401.03497},
  year={2024}
}
```
config.json
ADDED
@@ -0,0 +1,38 @@
{
  "activation_dropout": 0.0,
  "architectures": [
    "EATModel"
  ],
  "auto_map": {
    "AutoModel": "modeling_eat.EATModel",
    "AutoConfig": "configuration_eat.EATConfig"
  },
  "attn_drop_rate": 0.0,
  "depth": 12,
  "drop_rate": 0.0,
  "embed_dim": 768,
  "end_drop_path_rate": 0.0,
  "fixed_positions": true,
  "img_size": [
    1024,
    128
  ],
  "in_chans": 1,
  "layer_norm_first": false,
  "max_length": 768,
  "mel_bins": 128,
  "mlp_ratio": 4.0,
  "model_type": "eat",
  "model_variant": "pretrain",
  "norm_affine": true,
  "norm_eps": 1e-06,
  "num_classes": 527,
  "num_heads": 12,
  "patch_size": 16,
  "post_mlp_drop": 0.0,
  "qkv_bias": true,
  "start_drop_path_rate": 0.0,
  "stride": 16,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}
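For orientation (a hedged sketch, not part of the upload): with the values above, a 1024×128 mel input split into 16×16 patches at stride 16 yields a 64×8 patch grid, i.e. 512 patch tokens plus one prepended CLS token.

```python
from transformers import AutoConfig

# Reads the config above from the Hub and derives the patch grid.
cfg = AutoConfig.from_pretrained("HTill/flexEAT-base_epoch30_pretrain", trust_remote_code=True)
time_patches = cfg.img_size[0] // cfg.stride   # 1024 // 16 = 64
freq_patches = cfg.img_size[1] // cfg.stride   # 128 // 16 = 8
print(time_patches * freq_patches + 1)         # 512 patch tokens + 1 CLS token = 513
```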
configuration_eat.py
ADDED
@@ -0,0 +1,58 @@
# configuration_eat.py

from transformers import PretrainedConfig

class EATConfig(PretrainedConfig):
    model_type = "eat"

    def __init__(
        self,
        embed_dim=768,
        depth=12,
        num_heads=12,
        patch_size=16,
        stride=16,
        in_chans=1,
        num_classes=527,
        model_variant="pretrain",  # or "finetune"

        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        activation_dropout=0.0,
        post_mlp_drop=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,

        layer_norm_first=False,
        norm_eps=1e-6,
        norm_affine=True,
        fixed_positions=True,

        **kwargs,
    ):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.stride = stride
        self.in_chans = in_chans
        self.num_classes = num_classes
        self.model_variant = model_variant

        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.activation_dropout = activation_dropout
        self.post_mlp_drop = post_mlp_drop
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate

        self.layer_norm_first = layer_norm_first
        self.norm_eps = norm_eps
        self.norm_affine = norm_affine
        self.fixed_positions = fixed_positions
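A quick round-trip sketch (hedged; assumes the file is importable, e.g. from a local clone of this repository): the class behaves like any other `PretrainedConfig`, and keys not listed above, such as `img_size` in config.json, are absorbed through `**kwargs`.

```python
from configuration_eat import EATConfig

# Hypothetical local usage; the override values here are illustrative.
cfg = EATConfig(model_variant="finetune", num_classes=50, img_size=[1024, 128])
cfg.save_pretrained("./eat_finetune_cfg")            # writes config.json
cfg2 = EATConfig.from_pretrained("./eat_finetune_cfg")
print(cfg2.model_variant, cfg2.num_classes, cfg2.img_size)
```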
eat_model.py
ADDED
@@ -0,0 +1,83 @@
import torch
import torch.nn as nn
from functools import partial
import numpy as np
from .model_core import (
    PatchEmbed,
    AltBlock,
    trunc_normal_
)

class EAT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.mode = config.model_variant  # "pretrain" or "finetune"

        # === Embedding / Encoder ===
        self.local_encoder = PatchEmbed(
            img_size=config.img_size,
            patch_size=config.patch_size,
            in_chans=config.in_chans,
            embed_dim=config.embed_dim,
            stride=config.stride,
            use_sincos_pos=config.fixed_positions
        )

        self.extra_tokens = nn.Parameter(torch.zeros(1, 1, config.embed_dim))
        self.pos_drop = nn.Dropout(p=config.drop_rate, inplace=True)
        trunc_normal_(self.extra_tokens, std=.02)

        norm_layer = partial(nn.LayerNorm, eps=config.norm_eps, elementwise_affine=config.norm_affine)
        dpr = np.linspace(config.start_drop_path_rate, config.end_drop_path_rate, config.depth)
        self.blocks = nn.ModuleList([
            AltBlock(config.embed_dim, config.num_heads, config.mlp_ratio,
                     qkv_bias=config.qkv_bias, drop=config.drop_rate,
                     attn_drop=config.attn_drop_rate, mlp_drop=config.activation_dropout,
                     post_mlp_drop=config.post_mlp_drop, drop_path=dpr[i],
                     norm_layer=norm_layer, layer_norm_first=config.layer_norm_first,
                     ffn_targets=True)
            for i in range(config.depth)
        ])

        self.pre_norm = norm_layer(config.embed_dim)

        # === Head (for finetune) ===
        if self.mode == "finetune":
            self.fc_norm = nn.LayerNorm(config.embed_dim)
            self.head = nn.Linear(config.embed_dim, config.num_classes, bias=True)
        else:
            self.head = nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def encode(self, x):
        B = x.shape[0]
        x = self.local_encoder(x)
        x = torch.cat((self.extra_tokens.expand(B, -1, -1), x), dim=1)
        x = self.pre_norm(x)
        x = self.pos_drop(x)
        for blk in self.blocks:
            x, _ = blk(x)
        return x

    def forward(self, x):
        x = self.encode(x)
        if self.mode == "finetune":
            x = x[:, 0]  # use cls token
            x = self.fc_norm(x)
            x = self.head(x)
        return x

    def extract_features(self, x):
        x = self.encode(x)
        return x
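As a sanity check on the encoder above (a hedged sketch; it downloads the checkpoint and assumes a full 1024-frame input), `extract_features` should return one CLS token plus the 64×8 patch tokens:

```python
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("HTill/flexEAT-base_epoch30_pretrain",
                                  trust_remote_code=True).eval()
dummy_mel = torch.randn(1, 1, 1024, 128)   # [batch, channel, time_frames, mel_bins]
with torch.no_grad():
    feat = model.extract_features(dummy_mel)
print(feat.shape)   # expected: torch.Size([1, 513, 768]) = 1 CLS + 512 patch tokens
```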
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8623072d09aac4f3ad1168b4fed3a24e4f68fe1da25b9fe733375efb237e5f48
size 359905840
model_core.py
ADDED
@@ -0,0 +1,294 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import collections.abc

# --- Helpers (Replacements for timm functions) ---
def to_2tuple(x):
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
        return x
    return tuple(x for _ in range(2))


def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    """Replacement for timm.models.layers.trunc_normal_"""
    return torch.nn.init.trunc_normal_(tensor, mean, std, a, b)


# --- Custom Modules (No TIMM) ---
def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    """Drop paths (Stochastic Depth) per sample."""
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample."""

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f"drop_prob={round(self.drop_prob,3):0.3f}"


class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer() if isinstance(act_layer, type) else act_layer
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x


class SinCos2DEmbed(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        # x has the shape [batch_size, embed_dim, grid_length, grid_height]
        # Note: grid_length corresponds to H (Time/Frequency), grid_height to W
        _, embed_dim, grid_length, grid_height = x.shape

        # Create grid positions
        grid_length_a = torch.arange(grid_length, dtype=torch.float32, device=x.device)
        grid_height_a = torch.arange(grid_height, dtype=torch.float32, device=x.device)
        grid = torch.meshgrid(grid_length_a, grid_height_a, indexing="ij")

        sub_embed_dim = embed_dim // 4
        omega = torch.arange(sub_embed_dim, dtype=torch.float32, device=x.device)
        omega /= sub_embed_dim
        omega = 1.0 / 10000**omega

        # embed_length (dimension 0 of grid)
        out_length = torch.einsum("mn,d->dmn", grid[0], omega)
        embed_length_sin = torch.sin(out_length)
        embed_length_cos = torch.cos(out_length)
        embed_length = torch.cat([embed_length_sin, embed_length_cos], dim=0)

        # embed_height (dimension 1 of grid)
        out_height = torch.einsum("mn,d->dmn", grid[1], omega)
        embed_height_sin = torch.sin(out_height)
        embed_height_cos = torch.cos(out_height)
        embed_height = torch.cat([embed_height_sin, embed_height_cos], dim=0)

        # concat length and height embeddings
        embed = torch.cat([embed_length, embed_height], dim=0).unsqueeze(dim=0)

        x = x + embed
        return x


class PatchEmbed(nn.Module):
    """Flexible Image to Patch Embedding"""

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        stride=16,
        use_sincos_pos=False,
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        stride = to_2tuple(stride)

        self.img_size = img_size
        self.patch_size = patch_size
        self.use_sincos_pos = use_sincos_pos

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_size, stride=stride
        )  # with overlapped patches

        if self.use_sincos_pos:
            self.pos_embed = SinCos2DEmbed()
        else:
            self.pos_embed = None

    def forward(self, x):
        x = self.proj(x)

        # Apply dynamic positional embedding before flattening
        if self.pos_embed is not None:
            x = self.pos_embed(x)

        x = x.flatten(2).transpose(1, 2)
        return x


class AltBlock(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        mlp_drop=0.0,
        post_mlp_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        cosine_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        self.norm1 = norm_layer(dim)
        self.attn = AltAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            cosine_attention=cosine_attention,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop,
        )
        self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False)

    def forward(self, x, padding_mask=None, alibi_bias=None):
        if self.layer_norm_first:
            x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias))
            r = x = self.mlp(self.norm2(x))
            t = x
            x = r + self.drop_path(self.post_mlp_dropout(x))
            if not self.ffn_targets:
                t = x
        else:
            x = x + self.drop_path(self.attn(x, padding_mask, alibi_bias))
            r = x = self.norm1(x)
            x = self.mlp(x)
            t = x
            x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x)))
            if not self.ffn_targets:
                t = x

        return x, t


class AltAttention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        cosine_attention=False,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.cosine_attention = cosine_attention

        if cosine_attention:
            self.logit_scale = nn.Parameter(
                torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True
            )

    def forward(self, x, padding_mask=None, alibi_bias=None):
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)  # qkv x B x H x L x D
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        dtype = q.dtype

        if self.cosine_attention:
            # cosine attention
            attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
            logit_scale = torch.clamp(
                self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01))
            ).exp()
            attn = attn * logit_scale
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)

        if alibi_bias is not None:
            attn = attn.type_as(alibi_bias)
            attn[:, : alibi_bias.size(1)] += alibi_bias

        if padding_mask is not None and padding_mask.any():
            attn = attn.masked_fill(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )

        attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2)
        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
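The building blocks above depend only on torch and numpy, so they can be exercised in isolation. A hedged shape-check sketch (parameter values are illustrative, chosen to match config.json):

```python
import torch
from model_core import PatchEmbed, AltBlock

patch = PatchEmbed(img_size=(1024, 128), patch_size=16, in_chans=1,
                   embed_dim=768, stride=16, use_sincos_pos=True)
block = AltBlock(dim=768, num_heads=12, mlp_ratio=4.0, qkv_bias=True,
                 layer_norm_first=False, ffn_targets=True)

mel = torch.randn(2, 1, 1024, 128)      # [batch, channel, time, mel_bins]
tokens = patch(mel)                     # [2, 512, 768]: 64 x 8 patch grid
out, ffn_target = block(tokens)         # AltBlock returns (output, FFN target)
print(tokens.shape, out.shape, ffn_target.shape)
```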
modeling_eat.py
ADDED
@@ -0,0 +1,18 @@
# modeling_eat.py

from transformers import PreTrainedModel
from .configuration_eat import EATConfig
from .eat_model import EAT

class EATModel(PreTrainedModel):
    config_class = EATConfig

    def __init__(self, config: EATConfig):
        super().__init__(config)
        self.model = EAT(config)

    def forward(self, *args, **kwargs):
        return self.model(*args, **kwargs)

    def extract_features(self, x):
        return self.model.extract_features(x)
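The `auto_map` entry in config.json is what lets `AutoModel.from_pretrained(..., trust_remote_code=True)` resolve this wrapper. For purely local experiments, a hedged alternative sketch is to register the classes yourself (assuming the uploaded Python files sit in a local package, hypothetically named `eat_impl` here, so their relative imports resolve):

```python
from transformers import AutoConfig, AutoModel

# Hypothetical package name; adjust to wherever the uploaded .py files are placed.
from eat_impl.configuration_eat import EATConfig
from eat_impl.modeling_eat import EATModel

AutoConfig.register("eat", EATConfig)     # model_type "eat" -> EATConfig
AutoModel.register(EATConfig, EATModel)   # EATConfig -> EATModel

# Randomly initialized local instance (img_size passes through EATConfig's **kwargs).
model = EATModel(EATConfig(img_size=[1024, 128]))
```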