Update modeling_mosnet.py
modeling_mosnet.py (+75 −83)
```diff
@@ -28,84 +28,6 @@ class TimeDistributed(nn.Module):
         return output
 
 
-class CnnBlstmMbnet2(nn.Module):
-    def __init__(self, dropout: float = 0.3) -> None:
-        super().__init__()
-        self.conv1 = nn.Sequential(
-            nn.Conv2d(1, 16, (3, 3), (1, 1), padding=1),
-            nn.ReLU(),
-            nn.Conv2d(16, 16, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(16, 16, (3, 3), (1, 3), 1),
-            nn.ReLU(),
-            nn.BatchNorm2d(16),
-            nn.Dropout(dropout),
-        )
-        self.conv2 = nn.Sequential(
-            nn.Conv2d(16, 32, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(32, 32, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(32, 32, (3, 3), (1, 3), 1),
-            nn.ReLU(),
-            nn.BatchNorm2d(32),
-            nn.Dropout(dropout),
-        )
-        self.conv3 = nn.Sequential(
-            nn.Conv2d(32, 64, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(64, 64, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(64, 64, (3, 3), (1, 3), 1),
-            nn.ReLU(),
-            nn.BatchNorm2d(64),
-            nn.Dropout(dropout),
-        )
-        self.conv4 = nn.Sequential(
-            nn.Conv2d(64, 128, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(128, 128, (3, 3), (1, 1), 1),
-            nn.ReLU(),
-            nn.Conv2d(128, 128, (3, 3), (1, 3), 1),
-            nn.ReLU(),
-            nn.BatchNorm2d(128),
-            nn.Dropout(dropout),
-        )
-        self.blstm1 = nn.LSTM(512, 128, bidirectional=True, batch_first=True)
-        self.droupout = nn.Dropout(dropout)
-        self.flatten = TimeDistributed(nn.Flatten(), batch_first=True)
-        self.dense1 = nn.Sequential(
-            TimeDistributed(
-                nn.Sequential(
-                    nn.Linear(256, 128),
-                    nn.ReLU(),
-                ),
-                batch_first=True,
-            ),
-            nn.Dropout(dropout),
-        )
-        self.frame_layer = TimeDistributed(nn.Linear(128, 1), batch_first=True)
-        self.average_layer = nn.AdaptiveAvgPool1d(1)
-
-    def forward(self, forward_input: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        conv1_output = self.conv1(forward_input)
-        conv2_output = self.conv2(conv1_output)
-        conv3_output = self.conv3(conv2_output)
-        conv4_output = self.conv4(conv3_output)
-        conv4_output = conv4_output.permute(0, 2, 1, 3)
-        conv4_output = torch.reshape(conv4_output, (conv4_output.shape[0], conv4_output.shape[1], 4 * 128))
-        blstm_output, _ = self.blstm1(conv4_output)
-        blstm_output = self.droupout(blstm_output)
-        flatten_output = self.flatten(blstm_output)
-        fc_output = self.dense1(flatten_output)
-        frame_score = self.frame_layer(fc_output)
-        frame_score = frame_score.squeeze(-1) * mask
-        valid_sum = torch.sum(frame_score, dim=1)
-        valid_count = torch.sum(mask, dim=1)
-        avg_score = valid_sum / (valid_count + 1e-8)
-        return avg_score.unsqueeze(-1), frame_score
-
-
 class SwiGLU(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_, gate = x.chunk(2, dim=-1)
```
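The removed `CnnBlstmMbnet2` module reappears almost verbatim inside `MosNet` in the third hunk below, so this is a flattening refactor rather than an architecture change. The detail worth checking is the `4 * 128` reshape feeding `blstm1`: each conv block ends with a stride-`(1, 3)` convolution, so only the frequency axis is downsampled. A minimal sketch of that shape arithmetic, assuming 257-bin input spectrograms (i.e. a 512-point FFT; the actual bin count comes from `config.fft_size`):

```python
# Sketch only: verify that four stride-(1, 3) convolutions with kernel 3 and
# padding 1 reduce 257 frequency bins to 4, matching the 4 * 128 = 512
# features per frame that blstm1 expects.
freq = 257  # assumed: fft_size // 2 + 1 for a 512-point FFT
for _ in range(4):
    freq = (freq + 2 * 1 - 3) // 3 + 1  # Conv2d output size: padding=1, kernel=3, stride=3
print(freq)  # -> 4
```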
```diff
@@ -263,7 +185,6 @@ class CrossAttentionModel(nn.Module):
         avg_score = self.average_layer(frame_score.permute(0, 2, 1))  # (B, 1, 1)
         return avg_score.reshape(avg_score.size(0), -1), frame_score.squeeze()
 
-
 class MosNet(PreTrainedModel):
     config_class = MosNetConfig
 
```
```diff
@@ -281,6 +202,81 @@ class MosNet(PreTrainedModel):
         self.fft_size = self.config.fft_size
         self.hop_length = self.config.hop_length
         self.win_length = self.config.win_length
+        self.dropout = self.config.dropout
+
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(1, 16, (3, 3), (1, 1), padding=1),
+            nn.ReLU(),
+            nn.Conv2d(16, 16, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(16, 16, (3, 3), (1, 3), 1),
+            nn.ReLU(),
+            nn.BatchNorm2d(16),
+            nn.Dropout(self.dropout),
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(16, 32, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, (3, 3), (1, 3), 1),
+            nn.ReLU(),
+            nn.BatchNorm2d(32),
+            nn.Dropout(self.dropout),
+        )
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(32, 64, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(64, 64, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(64, 64, (3, 3), (1, 3), 1),
+            nn.ReLU(),
+            nn.BatchNorm2d(64),
+            nn.Dropout(self.dropout),
+        )
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(64, 128, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(128, 128, (3, 3), (1, 1), 1),
+            nn.ReLU(),
+            nn.Conv2d(128, 128, (3, 3), (1, 3), 1),
+            nn.ReLU(),
+            nn.BatchNorm2d(128),
+            nn.Dropout(self.dropout),
+        )
+        self.blstm1 = nn.LSTM(512, 128, bidirectional=True, batch_first=True)
+        self.dropout_layer = nn.Dropout(self.dropout)
+        self.flatten = TimeDistributed(nn.Flatten(), batch_first=True)
+        self.dense1 = nn.Sequential(
+            TimeDistributed(
+                nn.Sequential(
+                    nn.Linear(256, 128),
+                    nn.ReLU(),
+                ),
+                batch_first=True,
+            ),
+            nn.Dropout(self.dropout),
+        )
+        self.frame_layer = TimeDistributed(nn.Linear(128, 1), batch_first=True)
+        self.average_layer = nn.AdaptiveAvgPool1d(1)
+
+    def forward(self, forward_input: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        conv1_output = self.conv1(forward_input)
+        conv2_output = self.conv2(conv1_output)
+        conv3_output = self.conv3(conv2_output)
+        conv4_output = self.conv4(conv3_output)
+        conv4_output = conv4_output.permute(0, 2, 1, 3)
+        conv4_output = torch.reshape(conv4_output, (conv4_output.shape[0], conv4_output.shape[1], 4 * 128))
+        blstm_output, _ = self.blstm1(conv4_output)
+        blstm_output = self.dropout_layer(blstm_output)
+        flatten_output = self.flatten(blstm_output)
+        fc_output = self.dense1(flatten_output)
+        frame_score = self.frame_layer(fc_output)
+        frame_score = frame_score.squeeze(-1) * mask
+        valid_sum = torch.sum(frame_score, dim=1)
+        valid_count = torch.sum(mask, dim=1)
+        avg_score = valid_sum / (valid_count + 1e-8)
+        return avg_score.unsqueeze(-1)
 
     def preprocess_audios(self, audios: List[Any]) -> Tuple[torch.Tensor, torch.Tensor]:
         spectrograms = []
```
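The inlined `forward` averages frame scores under the padding mask instead of pooling over every frame: padded positions are zeroed out, then the sum is divided by the count of valid frames (the `1e-8` guards against division by zero for an all-padding mask). A toy illustration of that arithmetic with made-up scores:

```python
import torch

frame_score = torch.tensor([[4.0, 3.0, 5.0, 0.7]])  # (B, T); last frame is padding
mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])         # 1 = valid frame, 0 = padding
masked = frame_score * mask                          # zero out padded frames
avg = masked.sum(dim=1) / (mask.sum(dim=1) + 1e-8)   # mean over valid frames only
print(avg)  # tensor([4.0000]) -- the 0.7 padding frame is ignored
```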
```diff
@@ -304,10 +300,6 @@ class MosNet(PreTrainedModel):
             masks[i, :valid_len] = 1.0
         return padded, masks
 
-    def forward(self, audios: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
-        outputs, _ = self.model(audios.to(self.device), masks.to(self.device))
-        return outputs
-
     def predict(self, audios: List[Any]) -> List[float]:
         with torch.no_grad():
             padded, masks = self.preprocess_audios(audios)
```
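With the layers living directly on `MosNet`, the thin `forward` wrapper that unpacked the tuple from `self.model(...)` is gone; the model class itself defines `forward`. Inference still goes through `predict`, which handles spectrogram extraction and padding internally, roughly like this (the checkpoint path and 16 kHz sample rate are placeholders, and the import assumes the file is importable as a module):

```python
import librosa
from modeling_mosnet import MosNet

model = MosNet.from_pretrained("path/to/checkpoint")  # placeholder path
wav, _ = librosa.load("sample.wav", sr=16000)         # assumed sample rate
scores = model.predict([wav])                         # one MOS estimate per input clip
```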