AndreasXi committed on
Commit
6daf432
·
verified ·
1 Parent(s): e51b2a2

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoConfig": "configuration_finelap.FineLAPConfig",
4
+ "AutoModel": "modeling_finelap.FineLAPModel"
5
+ },
6
+ "architectures": [
7
+ "FineLAPModel"
8
+ ],
9
+ "audio_config": {
10
+ "_attn_implementation_autoset": true,
11
+ "activation_dropout": 0.0,
12
+ "attn_drop_rate": 0.0,
13
+ "depth": 12,
14
+ "drop_rate": 0.0,
15
+ "embed_dim": 768,
16
+ "end_drop_path_rate": 0.0,
17
+ "fixed_positions": true,
18
+ "img_size": [
19
+ 1024,
20
+ 128
21
+ ],
22
+ "in_chans": 1,
23
+ "layer_norm_first": false,
24
+ "mel_bins": 128,
25
+ "mlp_ratio": 4.0,
26
+ "model_type": "eat",
27
+ "model_variant": "pretrain",
28
+ "norm_affine": true,
29
+ "norm_eps": 1e-06,
30
+ "num_classes": 527,
31
+ "num_heads": 12,
32
+ "patch_size": 16,
33
+ "post_mlp_drop": 0.0,
34
+ "qkv_bias": true,
35
+ "start_drop_path_rate": 0.0,
36
+ "stride": 16
37
+ },
38
+ "b_global": -10.0,
39
+ "b_local": -10.0,
40
+ "embed_size": 1024,
41
+ "local_audio_proj_type": "transformer",
42
+ "model_type": "finelap",
43
+ "normalize_dense_audio_embeds": true,
44
+ "temp_global": 0.1,
45
+ "temp_local": 0.1,
46
+ "text_encoder_name": "roberta-base",
47
+ "torch_dtype": "float32",
48
+ "transformers_version": "4.51.3",
49
+ "unify_audio_proj": false
50
+ }
configuration_eat.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # configuration_eat.py
2
+
3
+ from transformers import PretrainedConfig
4
+
5
class EATConfig(PretrainedConfig):
    """Hyper-parameters for the EAT (ViT-style audio transformer) backbone.

    Covers the patch-embedding geometry, transformer width/depth, the
    dropout schedule, normalization/positional options, and the classifier
    head size used in "finetune" mode.
    """

    model_type = "eat"

    def __init__(
        self,
        embed_dim=768,
        depth=12,
        num_heads=12,
        patch_size=16,
        stride=16,
        in_chans=1,
        mel_bins=128,
        max_length=768,
        num_classes=527,
        model_variant="pretrain",  # or "finetune"

        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        activation_dropout=0.0,
        post_mlp_drop=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,

        layer_norm_first=False,
        norm_eps=1e-6,
        norm_affine=True,
        fixed_positions=True,

        img_size=(1024, 128),  # (target_length, mel_bins)

        **kwargs,
    ):
        super().__init__(**kwargs)

        # Patch embedding / input geometry.
        self.patch_size = patch_size
        self.stride = stride
        self.in_chans = in_chans
        self.mel_bins = mel_bins
        self.max_length = max_length
        self.img_size = img_size

        # Transformer capacity.
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias

        # Regularization schedule (start/end rates feed a linear
        # stochastic-depth ramp across the layer stack).
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.activation_dropout = activation_dropout
        self.post_mlp_drop = post_mlp_drop
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate

        # Normalization / positional-encoding options.
        self.layer_norm_first = layer_norm_first
        self.norm_eps = norm_eps
        self.norm_affine = norm_affine
        self.fixed_positions = fixed_positions

        # Classification head / operating mode.
        self.num_classes = num_classes
        self.model_variant = model_variant
configuration_finelap.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+ from .configuration_eat import EATConfig
3
+
4
class FineLAPConfig(PretrainedConfig):
    """Configuration for FineLAP, an audio-text dual encoder with both
    global (clip-level) and local (dense) contrastive heads."""

    model_type = "finelap"

    def __init__(
        self,
        embed_size=1024,
        temp_global=0.1,
        b_global=-10.0,
        temp_local=0.1,
        b_local=-10.0,
        local_audio_proj_type="transformer",
        normalize_dense_audio_embeds=True,
        unify_audio_proj=False,
        text_encoder_name="roberta-base",
        audio_config=None,
        **kwargs
    ):
        # Shared embedding width and contrastive temperature/bias scalars.
        self.embed_size = embed_size
        self.temp_global = temp_global
        self.b_global = b_global
        self.temp_local = temp_local
        self.b_local = b_local

        # Projection / encoder options.
        self.local_audio_proj_type = local_audio_proj_type
        self.normalize_dense_audio_embeds = normalize_dense_audio_embeds
        self.unify_audio_proj = unify_audio_proj
        self.text_encoder_name = text_encoder_name

        # When reloaded from JSON the nested audio config arrives as a plain
        # dict; re-wrap it so self.audio_config is always an EATConfig.
        if isinstance(audio_config, EATConfig):
            self.audio_config = audio_config
        elif isinstance(audio_config, dict):
            self.audio_config = EATConfig(**audio_config)
        else:
            self.audio_config = EATConfig()

        super().__init__(**kwargs)
eat_model.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from timm.models.layers import trunc_normal_
4
+ from functools import partial
5
+ import numpy as np
6
+ from .eat_model_core import (
7
+ PatchEmbed_new,
8
+ get_2d_sincos_pos_embed_flexible,
9
+ FixedPositionalEncoder,
10
+ AltBlock
11
+ )
12
+
13
class EAT(nn.Module):
    """EAT (Efficient Audio Transformer) encoder.

    A ViT-style transformer over mel-spectrogram patches: a strided conv
    patch embedding, fixed 2-D sin-cos positional encodings, a stack of
    AltBlock layers, and (in "finetune" mode) a linear classification head
    over the CLS token. Attribute names are part of the checkpoint's
    state-dict layout.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.mode = config.model_variant  # "pretrain" or "finetune"

        # === Embedding / Encoder ===
        self.local_encoder = PatchEmbed_new(
            img_size=config.img_size,
            patch_size=config.patch_size,
            in_chans=config.in_chans,
            embed_dim=config.embed_dim,
            stride=config.stride
        )

        # Single learnable CLS token prepended to the patch sequence.
        self.extra_tokens = nn.Parameter(torch.zeros(1, 1, config.embed_dim))
        self.pos_drop = nn.Dropout(p=config.drop_rate, inplace=True)
        trunc_normal_(self.extra_tokens, std=.02)

        # Frozen sin-cos positional table; None disables positional encoding.
        self.fixed_positional_encoder = (
            FixedPositionalEncoder(self.build_sincos_pos_embed()) if config.fixed_positions else None
        )

        norm_layer = partial(nn.LayerNorm, eps=config.norm_eps, elementwise_affine=config.norm_affine)
        # Linear stochastic-depth ramp across the layer stack.
        dpr = np.linspace(config.start_drop_path_rate, config.end_drop_path_rate, config.depth)
        self.blocks = nn.ModuleList([
            AltBlock(config.embed_dim, config.num_heads, config.mlp_ratio,
                     qkv_bias=config.qkv_bias, drop=config.drop_rate,
                     attn_drop=config.attn_drop_rate, mlp_drop=config.activation_dropout,
                     post_mlp_drop=config.post_mlp_drop, drop_path=dpr[i],
                     norm_layer=norm_layer, layer_norm_first=config.layer_norm_first,
                     ffn_targets=True)
            for i in range(config.depth)
        ])

        self.pre_norm = norm_layer(config.embed_dim)

        # === Head (for finetune) ===
        if self.mode == "finetune":
            self.fc_norm = nn.LayerNorm(config.embed_dim)
            self.head = nn.Linear(config.embed_dim, config.num_classes, bias=True)
        else:
            self.head = nn.Identity()

        self.apply(self._init_weights)

    def build_sincos_pos_embed(self):
        """Precompute the fixed 2-D sin-cos positional table.

        Returns a frozen (1, max_length * W, embed_dim) parameter, where W
        is the number of patches along the mel-bin axis.
        """
        W = self.config.mel_bins // self.config.patch_size
        max_length = self.config.max_length
        embed_dim = self.config.embed_dim
        pos_embed = nn.Parameter(torch.zeros(1, max_length * W, embed_dim), requires_grad=False)
        emb = get_2d_sincos_pos_embed_flexible(embed_dim, (max_length, W), cls_token=False)
        pos_embed.data.copy_(torch.from_numpy(emb).float().unsqueeze(0))
        return pos_embed

    def _init_weights(self, m):
        # Truncated-normal init for Linear weights, zeros/ones for LayerNorm.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def encode(self, x):
        """Run the full encoder: patch embed + positions + CLS + blocks.

        x: spectrogram batch accepted by PatchEmbed_new (B, C, H, W).
        Returns (B, 1 + num_patches, embed_dim); index 0 is the CLS token.
        """
        B = x.shape[0]
        x = self.local_encoder(x)
        if self.fixed_positional_encoder is not None:
            # Positions are added to patch tokens only; the CLS token
            # (concatenated below) receives none.
            x = x + self.fixed_positional_encoder(x, None)[:, :x.size(1), :]
        x = torch.cat((self.extra_tokens.expand(B, -1, -1), x), dim=1)
        x = self.pre_norm(x)
        x = self.pos_drop(x)
        for blk in self.blocks:
            # AltBlock returns (output, target-features); targets unused here.
            x, _ = blk(x)
        return x

    def forward(self, x):
        """Finetune mode: class logits from the CLS token.
        Pretrain mode: the full encoded token sequence."""
        x = self.encode(x)
        if self.mode == "finetune":
            x = x[:, 0]  # use cls token
            x = self.fc_norm(x)
            x = self.head(x)
        return x

    def extract_features(self, x):
        """Return the encoded token sequence (alias for encode)."""
        x = self.encode(x)
        return x
eat_model_core.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from timm.models.layers import to_2tuple
6
+
7
class PatchEmbed_new(nn.Module):
    """Image-to-patch embedding with an independently configurable stride.

    Decoupling stride from patch size allows overlapping patches and
    non-square inputs such as mel spectrograms.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, stride=16):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        # Patches overlap whenever stride < patch_size.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=to_2tuple(stride),
        )

    def forward(self, x):
        """(B, C, H, W) -> (B, num_patches, embed_dim)."""
        feats = self.proj(x)                      # (B, D, H', W')
        return feats.flatten(2).transpose(1, 2)   # (B, H'*W', D)
25
+
26
+
27
def get_2d_sincos_pos_embed_flexible(embed_dim, grid_size, cls_token=False):
    """Build 2-D sin-cos positional embeddings for an arbitrary (H, W) grid.

    grid_size: (height, width) of the patch grid.
    Returns [H*W, embed_dim], with one extra all-zero row prepended when
    cls_token is True.
    """
    h_coords = np.arange(grid_size[0], dtype=np.float32)
    w_coords = np.arange(grid_size[1], dtype=np.float32)
    # meshgrid with w first matches the patch-flattening order used upstream.
    grid = np.stack(np.meshgrid(w_coords, h_coords), axis=0)
    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed
43
+
44
+
45
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Split embed_dim evenly across the two grid axes and concatenate the
    per-axis 1-D sin-cos embeddings into an (H*W, embed_dim) array."""
    assert embed_dim % 2 == 0

    half = embed_dim // 2
    emb_h = get_1d_sincos_pos_embed_from_grid(half, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(half, grid[1])  # (H*W, D/2)
    return np.concatenate([emb_h, emb_w], axis=1)             # (H*W, D)
54
+
55
+
56
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Classic transformer sin/cos encoding for a 1-D array of positions.

    embed_dim: output dimension per position (must be even)
    pos: array of positions, any shape (flattened internally), size M
    returns: (M, embed_dim) array — sin terms first, cos terms second
    """
    assert embed_dim % 2 == 0

    half = embed_dim // 2
    # Geometric frequency ladder: 10000^(-2i/D), i in [0, D/2).
    freqs = 1.0 / 10000 ** (np.arange(half, dtype=np.float32) / half)  # (D/2,)

    angles = np.outer(pos.reshape(-1), freqs)  # (M, D/2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
75
+
76
+
77
class FixedPositionalEncoder(nn.Module):
    """Wraps a precomputed positional-embedding table behind the positional
    encoder interface; always returns the stored table unchanged."""

    def __init__(self, pos_embed):
        super().__init__()
        # The table is supplied fully built by the caller.
        self.positions = pos_embed

    def forward(self, x, padding_mask):
        # x and padding_mask are accepted for interface compatibility only.
        return self.positions
84
+
85
+
86
class AltBlock(nn.Module):
    """Transformer block (attention + MLP) that also returns an intermediate
    activation `t` usable as a feature-distillation target.

    With ffn_targets=True, `t` is the raw MLP output (before the final
    residual/norm); otherwise `t` equals the block output.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        mlp_drop=0.0,
        post_mlp_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        cosine_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        # Imported lazily so this module does not require timm's
        # vision_transformer until a block is actually constructed.
        from timm.models.vision_transformer import DropPath, Mlp

        self.norm1 = norm_layer(dim)
        self.attn = AltAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            cosine_attention=cosine_attention,
        )

        # Stochastic depth on the residual branches; Identity when rate is 0.
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop,
        )
        self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False)

    def forward(self, x, padding_mask=None, alibi_bias=None):
        """Returns (block_output, target_features); see class docstring."""
        if self.layer_norm_first:
            # Pre-norm path.
            x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias))
            # NOTE(review): `r` is bound to the MLP *output* here, so this
            # branch has no conventional pre-MLP residual. This repo's
            # config.json sets layer_norm_first=false, so the else-branch is
            # the one exercised — confirm this branch is intended before use.
            r = x = self.mlp(self.norm2(x))
            t = x
            x = r + self.drop_path(self.post_mlp_dropout(x))
            if not self.ffn_targets:
                t = x
        else:
            # Post-norm path (the one used with this repo's config).
            x = x + self.drop_path(self.attn(x, padding_mask, alibi_bias))
            r = x = self.norm1(x)
            x = self.mlp(x)
            t = x
            x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x)))
            if not self.ffn_targets:
                t = x

        return x, t
152
+
153
+
154
class AltAttention(nn.Module):
    """Multi-head self-attention with optional cosine attention, an additive
    alibi bias, and key-padding masking.

    Softmax is computed in float32 and cast back to the input dtype for
    numerical stability under mixed precision.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        cosine_attention=False,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.cosine_attention = cosine_attention
        if cosine_attention:
            # Learnable per-head temperature kept in log space.
            self.logit_scale = nn.Parameter(
                torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True
            )

    def forward(self, x, padding_mask=None, alibi_bias=None):
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # One fused projection, then split into per-head q/k/v,
        # each of shape (B, H, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)

        dtype = q.dtype

        if self.cosine_attention:
            # Cosine-similarity scores scaled by a clamped learnable temperature.
            attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
            logit_scale = torch.clamp(
                self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01))
            ).exp()
            attn = attn * logit_scale
        else:
            # Standard scaled dot-product scores.
            attn = (q * self.scale) @ k.transpose(-2, -1)

        if alibi_bias is not None:
            attn = attn.type_as(alibi_bias)
            attn[:, : alibi_bias.size(1)] += alibi_bias

        if padding_mask is not None and padding_mask.any():
            # Mask padded keys across every query and head.
            attn = attn.masked_fill(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )

        attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype)
        attn = self.attn_drop(attn)

        out = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13b9646c9f9d48513c0145bed75e654179e83f0fd8d49ed4ffc5d6b8f3353fb4
3
+ size 974773008
modeling_eat.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modeling_eat.py
2
+
3
+ from transformers import PreTrainedModel
4
+ from .configuration_eat import EATConfig
5
+ from .eat_model import EAT
6
+
7
class EATModel(PreTrainedModel):
    """Hugging Face wrapper around the raw EAT encoder.

    Thin adapter: the backbone lives under `self.model` (the attribute name
    determines the checkpoint's state-dict key prefix) and all calls are
    forwarded to it.
    """

    config_class = EATConfig

    def __init__(self, config: EATConfig):
        super().__init__(config)
        self.model = EAT(config)

    def forward(self, *args, **kwargs):
        """Delegate to EAT.forward (logits in finetune mode, token sequence
        otherwise)."""
        return self.model(*args, **kwargs)

    def extract_features(self, x):
        """Return the encoder's token sequence for input spectrograms x."""
        return self.model.extract_features(x)
modeling_finelap.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modeling_finelap.py
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from transformers import PreTrainedModel, RobertaModel, RobertaTokenizer
6
+
7
+ from .configuration_finelap import FineLAPConfig
8
+ from .modeling_eat import EATModel
9
+
10
class FineLAPModel(PreTrainedModel):
    """FineLAP: audio-text dual encoder with global and dense (local)
    audio embeddings.

    Pairs an EAT audio encoder with a RoBERTa text encoder; both are mapped
    into a shared `embed_size`-dim space by projection heads. Attribute
    names (audio_encoder, text_encoder, global_audio_proj, ...) are part of
    the checkpoint's state-dict layout.
    """

    config_class = FineLAPConfig

    def __init__(self, config: FineLAPConfig):
        super().__init__(config)
        self.config = config

        self.audio_encoder = EATModel(config.audio_config)
        # Width of EAT token features. NOTE(review): EATConfig defines
        # embed_dim (not hidden_size), so this getattr falls back to 768 —
        # confirm that is always the intended audio width.
        self.audio_width = getattr(config.audio_config, 'hidden_size', 768)

        self.text_encoder = RobertaModel.from_pretrained(
            config.text_encoder_name,
            add_pooling_layer=False,
        )

        self.text_width = self.text_encoder.config.hidden_size
        self.embed_size = config.embed_size

        # Learnable temperature / bias scalars for the contrastive losses;
        # each is only created when its configured value is non-zero.
        if config.temp_global != 0:
            self.temp_global = nn.Parameter(torch.ones([]) * config.temp_global)
        if config.b_global != 0:
            self.b_global = nn.Parameter(torch.ones([]) * config.b_global)
        if config.temp_local != 0:
            self.temp_local = nn.Parameter(torch.ones([]) * config.temp_local)
        if config.b_local != 0:
            self.b_local = nn.Parameter(torch.ones([]) * config.b_local)

        # Two-layer MLP projections into the shared embedding space.
        self.global_audio_proj = nn.Sequential(
            nn.Linear(self.audio_width, self.embed_size),
            nn.ReLU(),
            nn.Linear(self.embed_size, self.embed_size),
        )
        self.global_text_proj = nn.Sequential(
            nn.Linear(self.text_width, self.embed_size),
            nn.ReLU(),
            nn.Linear(self.embed_size, self.embed_size),
        )

        # 5. Local Audio Projection Layer
        self.local_audio_proj_type = config.local_audio_proj_type
        if self.local_audio_proj_type == "rnn":
            # Bidirectional GRU; the two directions concatenate to embed_size.
            self.local_audio_proj = nn.GRU(
                input_size=self.audio_width,
                hidden_size=int(self.embed_size / 2),
                num_layers=2,
                batch_first=True,
                bidirectional=True
            )
        elif self.local_audio_proj_type == "linear":
            self.local_audio_proj = nn.Sequential(
                nn.Linear(self.audio_width, self.embed_size),
                nn.ReLU(),
                nn.Linear(self.embed_size, self.embed_size)
            )
        elif self.local_audio_proj_type == "transformer":
            # Project to embed_size first, then contextualize.
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=self.embed_size,
                nhead=8,
                dim_feedforward=self.embed_size * 4,
                dropout=0.1,
                activation='relu',
                batch_first=True
            )
            transformer_encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
            self.local_audio_proj = nn.Sequential(
                nn.Linear(self.audio_width, self.embed_size),
                transformer_encoder
            )
        elif self.local_audio_proj_type == "transformer_linearlast":
            # Contextualize at audio width, then project to embed_size.
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=self.audio_width,
                nhead=8,
                dim_feedforward=self.audio_width * 4,
                dropout=0.1,
                activation='relu',
                batch_first=True
            )
            transformer_encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
            self.local_audio_proj = nn.Sequential(
                transformer_encoder,
                nn.Linear(self.audio_width, self.embed_size),
            )
        else:
            raise ValueError(f"Invalid local audio proj type: {self.local_audio_proj_type}")

        self.post_init()


    def encode_audio(self, audio_mel):
        """Encode a mel spectrogram into [CLS] + temporally-downsampled
        patch tokens, shape (B, 1 + T//8, D)."""

        outputs = self.audio_encoder.extract_features(audio_mel)
        # Some encoder variants return a dict with the tokens under 'x'.
        audio_encoded_raw = outputs['x'] if isinstance(outputs, dict) else outputs

        audio_cls = audio_encoded_raw[:, 0:1, :]
        audio_patches = audio_encoded_raw[:, 1:, :]

        B, T, D = audio_patches.shape
        # Mean-pool every 8 consecutive patch tokens.
        # NOTE(review): the reshape requires T to be divisible by ds_factor —
        # confirm upstream padding guarantees this.
        ds_factor = 8
        audio_patches_downsampled = audio_patches.reshape(
            B, T // ds_factor, ds_factor, D
        ).mean(dim=2)

        # [B, 1+T//8, D]
        audio_encoded = torch.cat([audio_cls, audio_patches_downsampled], dim=1)
        return audio_encoded


    def encode_text(self, input_ids, attention_mask):
        """Return RoBERTa's last hidden state, shape (B, L, text_width)."""
        outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state


    def get_global_text_embeds(self, input_ids, attention_mask):
        """L2-normalized projection of the <s> (first) token, (B, embed_size)."""
        text_feats = self.encode_text(input_ids, attention_mask)
        text_embeds = F.normalize(self.global_text_proj(text_feats[:, 0, :]), dim=-1)
        return text_embeds


    def get_global_audio_embeds(self, audio_mel):
        """L2-normalized clip-level audio embedding, (B, embed_size)."""
        audio_feats = self.encode_audio(audio_mel)

        if self.config.unify_audio_proj:
            # Share the local projection head and take its CLS position.
            audio_embeds = self.local_audio_proj(audio_feats)
            if self.config.local_audio_proj_type == "rnn":
                # nn.GRU returns (output, h_n); keep the output sequence.
                audio_embeds = audio_embeds[0]
            global_audio_embeds = F.normalize(audio_embeds[:, 0, :], dim=-1)
            return global_audio_embeds
        else:
            # Dedicated global head over the CLS feature.
            audio_cls_feat = audio_feats[:, 0, :]
            audio_embeds = F.normalize(self.global_audio_proj(audio_cls_feat), dim=-1)
            return audio_embeds


    def get_dense_audio_embeds(self, audio_mel):
        """Frame-level audio embeddings (patch tokens only), (B, T//8, embed_size);
        L2-normalized when configured."""
        audio_feats = self.encode_audio(audio_mel)
        audio_patches = audio_feats[:, 1:, :]

        audio_embeds = self.local_audio_proj(audio_patches)
        if self.config.local_audio_proj_type == "rnn":
            # nn.GRU returns (output, h_n); keep the output sequence.
            audio_embeds = audio_embeds[0]

        if self.config.normalize_dense_audio_embeds:
            audio_embeds = F.normalize(audio_embeds, dim=-1)
        return audio_embeds


    def forward(self, audio_mel=None, input_ids=None, attention_mask=None, return_dict=True):
        """Compute whichever embeddings the provided inputs allow; missing
        modalities yield None entries.

        NOTE(review): when audio_mel is given, the audio encoder runs twice
        (once per global/dense path) — a possible optimization, left as-is.
        """
        global_audio_embeds = None
        dense_audio_embeds = None
        global_text_embeds = None

        if audio_mel is not None:
            global_audio_embeds = self.get_global_audio_embeds(audio_mel)
            dense_audio_embeds = self.get_dense_audio_embeds(audio_mel)

        if input_ids is not None:
            global_text_embeds = self.get_global_text_embeds(input_ids, attention_mask)

        if not return_dict:
            return (global_audio_embeds, dense_audio_embeds, global_text_embeds)

        return {
            "global_audio_embeds": global_audio_embeds,
            "dense_audio_embeds": dense_audio_embeds,
            "global_text_embeds": global_text_embeds
        }