Update model_core.py
Browse files- model_core.py +26 -18
model_core.py
CHANGED
|
@@ -76,41 +76,49 @@ class Mlp(nn.Module):
|
|
| 76 |
return x
|
| 77 |
|
| 78 |
|
| 79 |
-
class SinCos2DEmbed(nn.Module):
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
| 81 |
super().__init__()
|
| 82 |
|
| 83 |
def forward(self, x):
|
| 84 |
# x has the shape [batch_size, embed_dim, grid_length, grid_height]
|
| 85 |
-
|
| 86 |
-
_, embed_dim, grid_length, grid_height = x.shape
|
| 87 |
|
| 88 |
# Create grid positions
|
| 89 |
grid_length_a = torch.arange(grid_length, dtype=torch.float32, device=x.device)
|
| 90 |
grid_height_a = torch.arange(grid_height, dtype=torch.float32, device=x.device)
|
| 91 |
-
grid = torch.meshgrid(
|
|
|
|
| 92 |
|
| 93 |
-
sub_embed_dim = embed_dim
|
| 94 |
omega = torch.arange(sub_embed_dim, dtype=torch.float32, device=x.device)
|
| 95 |
omega /= sub_embed_dim
|
| 96 |
-
omega = 1.0 / 10000**omega
|
| 97 |
|
| 98 |
-
# embed_length
|
| 99 |
-
out_length = torch.einsum("mn,d->dmn", grid[0],
|
| 100 |
embed_length_sin = torch.sin(out_length)
|
| 101 |
embed_length_cos = torch.cos(out_length)
|
| 102 |
-
embed_length = torch.cat([embed_length_sin, embed_length_cos], dim=0)
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
|
| 111 |
-
embed = torch.cat([embed_length, embed_height], dim=0).unsqueeze(dim=0)
|
| 112 |
|
| 113 |
-
x = x + embed
|
| 114 |
return x
|
| 115 |
|
| 116 |
|
|
|
|
| 76 |
return x
|
| 77 |
|
| 78 |
|
| 79 |
+
class SinCos2DEmbed(torch.nn.Module):
    """Add fixed (non-learned) 2D sine-cosine positional embeddings to a feature map.

    The embedding is recomputed from the input's spatial size on every call,
    so the module has no parameters and works for any grid shape whose
    channel count is divisible by 4.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` plus a 2D sin-cos positional embedding.

        Args:
            x: Tensor of shape ``[batch_size, embed_dim, grid_length,
                grid_height]``. ``embed_dim`` must be divisible by 4, since
                each spatial axis receives ``embed_dim/4`` sine channels and
                ``embed_dim/4`` cosine channels.

        Returns:
            Tensor of the same shape as ``x`` with the embedding added
            (broadcast over the batch dimension).

        Raises:
            ValueError: if ``embed_dim`` is not divisible by 4.
        """
        _, embed_dim, grid_length, grid_height = x.shape
        if embed_dim % 4 != 0:
            # Without this check the final `x + embed` fails with an opaque
            # broadcasting error; fail early with a clear message instead.
            raise ValueError(f"embed_dim must be divisible by 4, got {embed_dim}")

        # Integer positions along each spatial axis, on the input's device.
        length_pos = torch.arange(grid_length, dtype=torch.float32, device=x.device)
        height_pos = torch.arange(grid_height, dtype=torch.float32, device=x.device)
        # With indexing="xy" both outputs have shape [grid_length, grid_height].
        # NOTE(review): grid[0] varies along the *height* axis and grid[1]
        # along the *length* axis, so the length/height labels below look
        # swapped relative to the math — kept as-is to preserve the exact
        # channel layout; confirm the intended axis mapping.
        grid = torch.meshgrid(height_pos, length_pos, indexing="xy")

        # Standard transformer frequency schedule: omega_k = 1 / 10000^(k/D).
        sub_embed_dim = embed_dim // 4
        omega = torch.arange(sub_embed_dim, dtype=torch.float32, device=x.device)
        omega = 1.0 / 10000 ** (omega / sub_embed_dim)

        # First half of the channels: sin/cos of grid[0], shape [embed_dim/2, L, H].
        out_length = torch.einsum("mn,d->dmn", grid[0], omega)
        embed_length = torch.cat([torch.sin(out_length), torch.cos(out_length)], dim=0)

        # Second half of the channels: sin/cos of grid[1], shape [embed_dim/2, L, H].
        out_height = torch.einsum("mn,d->dmn", grid[1], omega)
        embed_height = torch.cat([torch.sin(out_height), torch.cos(out_height)], dim=0)

        # Stack both halves and add a broadcastable batch dim: [1, embed_dim, L, H].
        embed = torch.cat([embed_length, embed_height], dim=0).unsqueeze(dim=0)
        return x + embed
|
| 123 |
|
| 124 |
|