Backup-bdg committed on
Commit
53966ff
·
verified ·
1 Parent(s): 2892ee5

Update model weights after training (epoch 4, loss 3.0820)

Browse files
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56f890d89fb28ae5df4b1ab79c42b0c29edc81203f2a99ea077d47909b6d128a
3
- size 1458415836
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:840aaf132e4830dfcfa0634d27acab02841f8eb9fffbfe4f78377c1d50aa050a
3
+ size 1458410612
audio_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30c6b7e43c61588099a04e970a49bd38fd73465ef129d8e39ed9a1e8c45aeecf
3
- size 466150140
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b7ed71f13950d93d17d4152e38cdcdd5e1a157729f4615ee38072473e8c12a
3
+ size 466119380
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d6a01050517c1c2762257ca7f0c03259704ec620070948f9cccd9c26476fcae
3
  size 174191400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5343b1fe1af46ca860a51de6f3bd51d1843f70998850084f805c875aec2de030
3
  size 174191400
generator.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02673bab0a9e2949c2f3bfe0725ff77cd631fd89a559d86e9c886c99455a5e72
3
  size 629440508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:499cee360f74c21e9e08624abd739f8cd982b339117fbcae3ae09433cdbebc71
3
  size 629440508
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:816725a4e4eaaf6f5a2bb5c3cb678c13f298ccc280937e88c3c947d9fc052fb3
3
  size 1506831304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c2d81eae0ff676724bf38cf020b2e6317e609eb90d43150ffe91610e67864e7
3
  size 1506831304
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 7309396886,
4
  "format": "components"
5
  },
6
  "weight_map": {
@@ -1951,23 +1951,14 @@
1951
  "audio_encoder.speaker_encoder.frame_encoder.0.bias": "audio_encoder.safetensors",
1952
  "audio_encoder.speaker_encoder.frame_encoder.2.weight": "audio_encoder.safetensors",
1953
  "audio_encoder.speaker_encoder.frame_encoder.2.bias": "audio_encoder.safetensors",
1954
- "audio_encoder.speaker_encoder.frame_encoder.2.running_mean": "audio_encoder.safetensors",
1955
- "audio_encoder.speaker_encoder.frame_encoder.2.running_var": "audio_encoder.safetensors",
1956
- "audio_encoder.speaker_encoder.frame_encoder.2.num_batches_tracked": "audio_encoder.safetensors",
1957
  "audio_encoder.speaker_encoder.frame_encoder.3.weight": "audio_encoder.safetensors",
1958
  "audio_encoder.speaker_encoder.frame_encoder.3.bias": "audio_encoder.safetensors",
1959
  "audio_encoder.speaker_encoder.frame_encoder.5.weight": "audio_encoder.safetensors",
1960
  "audio_encoder.speaker_encoder.frame_encoder.5.bias": "audio_encoder.safetensors",
1961
- "audio_encoder.speaker_encoder.frame_encoder.5.running_mean": "audio_encoder.safetensors",
1962
- "audio_encoder.speaker_encoder.frame_encoder.5.running_var": "audio_encoder.safetensors",
1963
- "audio_encoder.speaker_encoder.frame_encoder.5.num_batches_tracked": "audio_encoder.safetensors",
1964
  "audio_encoder.speaker_encoder.frame_encoder.6.weight": "audio_encoder.safetensors",
1965
  "audio_encoder.speaker_encoder.frame_encoder.6.bias": "audio_encoder.safetensors",
1966
  "audio_encoder.speaker_encoder.frame_encoder.8.weight": "audio_encoder.safetensors",
1967
  "audio_encoder.speaker_encoder.frame_encoder.8.bias": "audio_encoder.safetensors",
1968
- "audio_encoder.speaker_encoder.frame_encoder.8.running_mean": "audio_encoder.safetensors",
1969
- "audio_encoder.speaker_encoder.frame_encoder.8.running_var": "audio_encoder.safetensors",
1970
- "audio_encoder.speaker_encoder.frame_encoder.8.num_batches_tracked": "audio_encoder.safetensors",
1971
  "audio_encoder.speaker_encoder.lstm.weight_ih_l0": "audio_encoder.safetensors",
1972
  "audio_encoder.speaker_encoder.lstm.weight_hh_l0": "audio_encoder.safetensors",
1973
  "audio_encoder.speaker_encoder.lstm.bias_ih_l0": "audio_encoder.safetensors",
@@ -2033,9 +2024,6 @@
2033
  "audio_encoder.conformer_blocks.0.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2034
  "audio_encoder.conformer_blocks.0.conv.batch_norm.weight": "audio_encoder.safetensors",
2035
  "audio_encoder.conformer_blocks.0.conv.batch_norm.bias": "audio_encoder.safetensors",
2036
- "audio_encoder.conformer_blocks.0.conv.batch_norm.running_mean": "audio_encoder.safetensors",
2037
- "audio_encoder.conformer_blocks.0.conv.batch_norm.running_var": "audio_encoder.safetensors",
2038
- "audio_encoder.conformer_blocks.0.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
2039
  "audio_encoder.conformer_blocks.0.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2040
  "audio_encoder.conformer_blocks.0.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2041
  "audio_encoder.conformer_blocks.0.ff2_norm.weight": "audio_encoder.safetensors",
@@ -2069,9 +2057,6 @@
2069
  "audio_encoder.conformer_blocks.1.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2070
  "audio_encoder.conformer_blocks.1.conv.batch_norm.weight": "audio_encoder.safetensors",
2071
  "audio_encoder.conformer_blocks.1.conv.batch_norm.bias": "audio_encoder.safetensors",
2072
- "audio_encoder.conformer_blocks.1.conv.batch_norm.running_mean": "audio_encoder.safetensors",
2073
- "audio_encoder.conformer_blocks.1.conv.batch_norm.running_var": "audio_encoder.safetensors",
2074
- "audio_encoder.conformer_blocks.1.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
2075
  "audio_encoder.conformer_blocks.1.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2076
  "audio_encoder.conformer_blocks.1.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2077
  "audio_encoder.conformer_blocks.1.ff2_norm.weight": "audio_encoder.safetensors",
@@ -2105,9 +2090,6 @@
2105
  "audio_encoder.conformer_blocks.2.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2106
  "audio_encoder.conformer_blocks.2.conv.batch_norm.weight": "audio_encoder.safetensors",
2107
  "audio_encoder.conformer_blocks.2.conv.batch_norm.bias": "audio_encoder.safetensors",
2108
- "audio_encoder.conformer_blocks.2.conv.batch_norm.running_mean": "audio_encoder.safetensors",
2109
- "audio_encoder.conformer_blocks.2.conv.batch_norm.running_var": "audio_encoder.safetensors",
2110
- "audio_encoder.conformer_blocks.2.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
2111
  "audio_encoder.conformer_blocks.2.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2112
  "audio_encoder.conformer_blocks.2.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2113
  "audio_encoder.conformer_blocks.2.ff2_norm.weight": "audio_encoder.safetensors",
@@ -2141,9 +2123,6 @@
2141
  "audio_encoder.conformer_blocks.3.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2142
  "audio_encoder.conformer_blocks.3.conv.batch_norm.weight": "audio_encoder.safetensors",
2143
  "audio_encoder.conformer_blocks.3.conv.batch_norm.bias": "audio_encoder.safetensors",
2144
- "audio_encoder.conformer_blocks.3.conv.batch_norm.running_mean": "audio_encoder.safetensors",
2145
- "audio_encoder.conformer_blocks.3.conv.batch_norm.running_var": "audio_encoder.safetensors",
2146
- "audio_encoder.conformer_blocks.3.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
2147
  "audio_encoder.conformer_blocks.3.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2148
  "audio_encoder.conformer_blocks.3.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2149
  "audio_encoder.conformer_blocks.3.ff2_norm.weight": "audio_encoder.safetensors",
@@ -2177,9 +2156,6 @@
2177
  "audio_encoder.conformer_blocks.4.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2178
  "audio_encoder.conformer_blocks.4.conv.batch_norm.weight": "audio_encoder.safetensors",
2179
  "audio_encoder.conformer_blocks.4.conv.batch_norm.bias": "audio_encoder.safetensors",
2180
- "audio_encoder.conformer_blocks.4.conv.batch_norm.running_mean": "audio_encoder.safetensors",
2181
- "audio_encoder.conformer_blocks.4.conv.batch_norm.running_var": "audio_encoder.safetensors",
2182
- "audio_encoder.conformer_blocks.4.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
2183
  "audio_encoder.conformer_blocks.4.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2184
  "audio_encoder.conformer_blocks.4.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2185
  "audio_encoder.conformer_blocks.4.ff2_norm.weight": "audio_encoder.safetensors",
@@ -2213,9 +2189,6 @@
2213
  "audio_encoder.conformer_blocks.5.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2214
  "audio_encoder.conformer_blocks.5.conv.batch_norm.weight": "audio_encoder.safetensors",
2215
  "audio_encoder.conformer_blocks.5.conv.batch_norm.bias": "audio_encoder.safetensors",
2216
- "audio_encoder.conformer_blocks.5.conv.batch_norm.running_mean": "audio_encoder.safetensors",
2217
- "audio_encoder.conformer_blocks.5.conv.batch_norm.running_var": "audio_encoder.safetensors",
2218
- "audio_encoder.conformer_blocks.5.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
2219
  "audio_encoder.conformer_blocks.5.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2220
  "audio_encoder.conformer_blocks.5.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2221
  "audio_encoder.conformer_blocks.5.ff2_norm.weight": "audio_encoder.safetensors",
@@ -2588,30 +2561,18 @@
2588
  "audio_decoder.postnet.0.0.bias": "audio_decoder.safetensors",
2589
  "audio_decoder.postnet.0.1.weight": "audio_decoder.safetensors",
2590
  "audio_decoder.postnet.0.1.bias": "audio_decoder.safetensors",
2591
- "audio_decoder.postnet.0.1.running_mean": "audio_decoder.safetensors",
2592
- "audio_decoder.postnet.0.1.running_var": "audio_decoder.safetensors",
2593
- "audio_decoder.postnet.0.1.num_batches_tracked": "audio_decoder.safetensors",
2594
  "audio_decoder.postnet.1.0.weight": "audio_decoder.safetensors",
2595
  "audio_decoder.postnet.1.0.bias": "audio_decoder.safetensors",
2596
  "audio_decoder.postnet.1.1.weight": "audio_decoder.safetensors",
2597
  "audio_decoder.postnet.1.1.bias": "audio_decoder.safetensors",
2598
- "audio_decoder.postnet.1.1.running_mean": "audio_decoder.safetensors",
2599
- "audio_decoder.postnet.1.1.running_var": "audio_decoder.safetensors",
2600
- "audio_decoder.postnet.1.1.num_batches_tracked": "audio_decoder.safetensors",
2601
  "audio_decoder.postnet.2.0.weight": "audio_decoder.safetensors",
2602
  "audio_decoder.postnet.2.0.bias": "audio_decoder.safetensors",
2603
  "audio_decoder.postnet.2.1.weight": "audio_decoder.safetensors",
2604
  "audio_decoder.postnet.2.1.bias": "audio_decoder.safetensors",
2605
- "audio_decoder.postnet.2.1.running_mean": "audio_decoder.safetensors",
2606
- "audio_decoder.postnet.2.1.running_var": "audio_decoder.safetensors",
2607
- "audio_decoder.postnet.2.1.num_batches_tracked": "audio_decoder.safetensors",
2608
  "audio_decoder.postnet.3.0.weight": "audio_decoder.safetensors",
2609
  "audio_decoder.postnet.3.0.bias": "audio_decoder.safetensors",
2610
  "audio_decoder.postnet.3.1.weight": "audio_decoder.safetensors",
2611
  "audio_decoder.postnet.3.1.bias": "audio_decoder.safetensors",
2612
- "audio_decoder.postnet.3.1.running_mean": "audio_decoder.safetensors",
2613
- "audio_decoder.postnet.3.1.running_var": "audio_decoder.safetensors",
2614
- "audio_decoder.postnet.3.1.num_batches_tracked": "audio_decoder.safetensors",
2615
  "audio_decoder.postnet.4.weight": "audio_decoder.safetensors",
2616
  "audio_decoder.postnet.4.bias": "audio_decoder.safetensors",
2617
  "audio_decoder.waveform_decoder.input_proj.bias": "audio_decoder.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 7309365038,
4
  "format": "components"
5
  },
6
  "weight_map": {
 
1951
  "audio_encoder.speaker_encoder.frame_encoder.0.bias": "audio_encoder.safetensors",
1952
  "audio_encoder.speaker_encoder.frame_encoder.2.weight": "audio_encoder.safetensors",
1953
  "audio_encoder.speaker_encoder.frame_encoder.2.bias": "audio_encoder.safetensors",
 
 
 
1954
  "audio_encoder.speaker_encoder.frame_encoder.3.weight": "audio_encoder.safetensors",
1955
  "audio_encoder.speaker_encoder.frame_encoder.3.bias": "audio_encoder.safetensors",
1956
  "audio_encoder.speaker_encoder.frame_encoder.5.weight": "audio_encoder.safetensors",
1957
  "audio_encoder.speaker_encoder.frame_encoder.5.bias": "audio_encoder.safetensors",
 
 
 
1958
  "audio_encoder.speaker_encoder.frame_encoder.6.weight": "audio_encoder.safetensors",
1959
  "audio_encoder.speaker_encoder.frame_encoder.6.bias": "audio_encoder.safetensors",
1960
  "audio_encoder.speaker_encoder.frame_encoder.8.weight": "audio_encoder.safetensors",
1961
  "audio_encoder.speaker_encoder.frame_encoder.8.bias": "audio_encoder.safetensors",
 
 
 
1962
  "audio_encoder.speaker_encoder.lstm.weight_ih_l0": "audio_encoder.safetensors",
1963
  "audio_encoder.speaker_encoder.lstm.weight_hh_l0": "audio_encoder.safetensors",
1964
  "audio_encoder.speaker_encoder.lstm.bias_ih_l0": "audio_encoder.safetensors",
 
2024
  "audio_encoder.conformer_blocks.0.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2025
  "audio_encoder.conformer_blocks.0.conv.batch_norm.weight": "audio_encoder.safetensors",
2026
  "audio_encoder.conformer_blocks.0.conv.batch_norm.bias": "audio_encoder.safetensors",
 
 
 
2027
  "audio_encoder.conformer_blocks.0.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2028
  "audio_encoder.conformer_blocks.0.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2029
  "audio_encoder.conformer_blocks.0.ff2_norm.weight": "audio_encoder.safetensors",
 
2057
  "audio_encoder.conformer_blocks.1.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2058
  "audio_encoder.conformer_blocks.1.conv.batch_norm.weight": "audio_encoder.safetensors",
2059
  "audio_encoder.conformer_blocks.1.conv.batch_norm.bias": "audio_encoder.safetensors",
 
 
 
2060
  "audio_encoder.conformer_blocks.1.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2061
  "audio_encoder.conformer_blocks.1.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2062
  "audio_encoder.conformer_blocks.1.ff2_norm.weight": "audio_encoder.safetensors",
 
2090
  "audio_encoder.conformer_blocks.2.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2091
  "audio_encoder.conformer_blocks.2.conv.batch_norm.weight": "audio_encoder.safetensors",
2092
  "audio_encoder.conformer_blocks.2.conv.batch_norm.bias": "audio_encoder.safetensors",
 
 
 
2093
  "audio_encoder.conformer_blocks.2.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2094
  "audio_encoder.conformer_blocks.2.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2095
  "audio_encoder.conformer_blocks.2.ff2_norm.weight": "audio_encoder.safetensors",
 
2123
  "audio_encoder.conformer_blocks.3.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2124
  "audio_encoder.conformer_blocks.3.conv.batch_norm.weight": "audio_encoder.safetensors",
2125
  "audio_encoder.conformer_blocks.3.conv.batch_norm.bias": "audio_encoder.safetensors",
 
 
 
2126
  "audio_encoder.conformer_blocks.3.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2127
  "audio_encoder.conformer_blocks.3.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2128
  "audio_encoder.conformer_blocks.3.ff2_norm.weight": "audio_encoder.safetensors",
 
2156
  "audio_encoder.conformer_blocks.4.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2157
  "audio_encoder.conformer_blocks.4.conv.batch_norm.weight": "audio_encoder.safetensors",
2158
  "audio_encoder.conformer_blocks.4.conv.batch_norm.bias": "audio_encoder.safetensors",
 
 
 
2159
  "audio_encoder.conformer_blocks.4.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2160
  "audio_encoder.conformer_blocks.4.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2161
  "audio_encoder.conformer_blocks.4.ff2_norm.weight": "audio_encoder.safetensors",
 
2189
  "audio_encoder.conformer_blocks.5.conv.depthwise_conv.bias": "audio_encoder.safetensors",
2190
  "audio_encoder.conformer_blocks.5.conv.batch_norm.weight": "audio_encoder.safetensors",
2191
  "audio_encoder.conformer_blocks.5.conv.batch_norm.bias": "audio_encoder.safetensors",
 
 
 
2192
  "audio_encoder.conformer_blocks.5.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
2193
  "audio_encoder.conformer_blocks.5.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
2194
  "audio_encoder.conformer_blocks.5.ff2_norm.weight": "audio_encoder.safetensors",
 
2561
  "audio_decoder.postnet.0.0.bias": "audio_decoder.safetensors",
2562
  "audio_decoder.postnet.0.1.weight": "audio_decoder.safetensors",
2563
  "audio_decoder.postnet.0.1.bias": "audio_decoder.safetensors",
 
 
 
2564
  "audio_decoder.postnet.1.0.weight": "audio_decoder.safetensors",
2565
  "audio_decoder.postnet.1.0.bias": "audio_decoder.safetensors",
2566
  "audio_decoder.postnet.1.1.weight": "audio_decoder.safetensors",
2567
  "audio_decoder.postnet.1.1.bias": "audio_decoder.safetensors",
 
 
 
2568
  "audio_decoder.postnet.2.0.weight": "audio_decoder.safetensors",
2569
  "audio_decoder.postnet.2.0.bias": "audio_decoder.safetensors",
2570
  "audio_decoder.postnet.2.1.weight": "audio_decoder.safetensors",
2571
  "audio_decoder.postnet.2.1.bias": "audio_decoder.safetensors",
 
 
 
2572
  "audio_decoder.postnet.3.0.weight": "audio_decoder.safetensors",
2573
  "audio_decoder.postnet.3.0.bias": "audio_decoder.safetensors",
2574
  "audio_decoder.postnet.3.1.weight": "audio_decoder.safetensors",
2575
  "audio_decoder.postnet.3.1.bias": "audio_decoder.safetensors",
 
 
 
2576
  "audio_decoder.postnet.4.weight": "audio_decoder.safetensors",
2577
  "audio_decoder.postnet.4.bias": "audio_decoder.safetensors",
2578
  "audio_decoder.waveform_decoder.input_proj.bias": "audio_decoder.safetensors",
modeling_xoron.py CHANGED
@@ -4371,16 +4371,18 @@ class SpeakerEncoder(nn.Module):
4371
  self.output_size = output_size
4372
 
4373
  # Frame-level encoder
 
 
4374
  self.frame_encoder = nn.Sequential(
4375
  nn.Conv1d(80, hidden_size, 5, 1, 2),
4376
  nn.ReLU(),
4377
- nn.BatchNorm1d(hidden_size),
4378
  nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
4379
  nn.ReLU(),
4380
- nn.BatchNorm1d(hidden_size),
4381
  nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
4382
  nn.ReLU(),
4383
- nn.BatchNorm1d(hidden_size),
4384
  )
4385
 
4386
  # LSTM for temporal modeling
@@ -4853,7 +4855,8 @@ class ConvolutionModule(nn.Module):
4853
  channels, channels, kernel_size=kernel_size,
4854
  padding=(kernel_size - 1) // 2, groups=channels
4855
  )
4856
- self.batch_norm = nn.BatchNorm1d(channels)
 
4857
  self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)
4858
  self.dropout = nn.Dropout(dropout)
4859
 
@@ -5544,25 +5547,27 @@ class AudioDecoder(nn.Module):
5544
  self.mel_linear = nn.Linear(hidden_size, n_mels)
5545
 
5546
  # Postnet
 
 
5547
  self.postnet = nn.ModuleList([
5548
  nn.Sequential(
5549
  nn.Conv1d(n_mels, 256, kernel_size=5, padding=2),
5550
- nn.BatchNorm1d(256),
5551
  nn.Tanh(),
5552
  ),
5553
  nn.Sequential(
5554
  nn.Conv1d(256, 256, kernel_size=5, padding=2),
5555
- nn.BatchNorm1d(256),
5556
  nn.Tanh(),
5557
  ),
5558
  nn.Sequential(
5559
  nn.Conv1d(256, 256, kernel_size=5, padding=2),
5560
- nn.BatchNorm1d(256),
5561
  nn.Tanh(),
5562
  ),
5563
  nn.Sequential(
5564
  nn.Conv1d(256, 256, kernel_size=5, padding=2),
5565
- nn.BatchNorm1d(256),
5566
  nn.Tanh(),
5567
  ),
5568
  nn.Conv1d(256, n_mels, kernel_size=5, padding=2),
@@ -5770,9 +5775,8 @@ class AudioDecoder(nn.Module):
5770
  energy_pred = F.softplus(self.energy_predictor(x))
5771
 
5772
  # Determine output length
5773
- # IMPORTANT: BatchNorm1d requires sequence length > 1 during training
5774
- # Enforce minimum length of 2 to avoid "Expected more than 1 value per channel" error
5775
- MIN_MEL_LENGTH = 2
5776
  if target_length is not None:
5777
  mel_length = max(MIN_MEL_LENGTH, target_length)
5778
  else:
@@ -8608,6 +8612,7 @@ class AuxLosslessMoELayer(nn.Module):
8608
 
8609
  def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
8610
  batch_size, seq_len, hidden_size = hidden_states.shape
 
8611
  hidden_flat = hidden_states.view(-1, hidden_size)
8612
 
8613
  top_k_probs, top_k_indices, _ = self.router(hidden_states)
@@ -8622,10 +8627,11 @@ class AuxLosslessMoELayer(nn.Module):
8622
  expert_input = hidden_flat[mask]
8623
  expert_output = expert(expert_input)
8624
  weight = top_k_probs[mask, k:k+1]
8625
- final_output[mask] = final_output[mask] + weight * expert_output
 
8626
 
8627
  shared_output = self.shared_expert(hidden_flat)
8628
- final_output = final_output + shared_output
8629
 
8630
  final_output = final_output.view(batch_size, seq_len, hidden_size)
8631
 
@@ -9125,7 +9131,14 @@ class XoronMultimodalModel(nn.Module):
9125
  super().__init__()
9126
  self.config = config
9127
  self.device_map = device_map
9128
- self._model_parallel = device_map is not None and len(set(device_map.values())) > 1
 
 
 
 
 
 
 
9129
 
9130
  print("\n" + "=" * 60)
9131
  print("🚀 BUILDING XORON-DEV MULTIMODAL MODEL")
@@ -9273,7 +9286,11 @@ class XoronMultimodalModel(nn.Module):
9273
  def apply_model_parallel(self, device_map: Dict[str, str]):
9274
  """Apply Model Parallelism by placing components on different devices."""
9275
  self.device_map = device_map
9276
- self._model_parallel = len(set(device_map.values())) > 1
 
 
 
 
9277
 
9278
  if not self._model_parallel:
9279
  print(" ℹ️ Single device - no model parallelism needed")
 
4371
  self.output_size = output_size
4372
 
4373
  # Frame-level encoder
4374
+ # Use GroupNorm instead of BatchNorm1d to handle sequence length of 1
4375
+ # GroupNorm(1, C) is equivalent to LayerNorm and works with any batch/seq size
4376
  self.frame_encoder = nn.Sequential(
4377
  nn.Conv1d(80, hidden_size, 5, 1, 2),
4378
  nn.ReLU(),
4379
+ nn.GroupNorm(1, hidden_size),
4380
  nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
4381
  nn.ReLU(),
4382
+ nn.GroupNorm(1, hidden_size),
4383
  nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
4384
  nn.ReLU(),
4385
+ nn.GroupNorm(1, hidden_size),
4386
  )
4387
 
4388
  # LSTM for temporal modeling
 
4855
  channels, channels, kernel_size=kernel_size,
4856
  padding=(kernel_size - 1) // 2, groups=channels
4857
  )
4858
+ # Use GroupNorm instead of BatchNorm1d to handle sequence length of 1
4859
+ self.batch_norm = nn.GroupNorm(1, channels)
4860
  self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)
4861
  self.dropout = nn.Dropout(dropout)
4862
 
 
5547
  self.mel_linear = nn.Linear(hidden_size, n_mels)
5548
 
5549
  # Postnet
5550
+ # Use GroupNorm instead of BatchNorm1d to handle sequence length of 1
5551
+ # GroupNorm(1, C) is equivalent to LayerNorm and works with any batch/seq size
5552
  self.postnet = nn.ModuleList([
5553
  nn.Sequential(
5554
  nn.Conv1d(n_mels, 256, kernel_size=5, padding=2),
5555
+ nn.GroupNorm(1, 256),
5556
  nn.Tanh(),
5557
  ),
5558
  nn.Sequential(
5559
  nn.Conv1d(256, 256, kernel_size=5, padding=2),
5560
+ nn.GroupNorm(1, 256),
5561
  nn.Tanh(),
5562
  ),
5563
  nn.Sequential(
5564
  nn.Conv1d(256, 256, kernel_size=5, padding=2),
5565
+ nn.GroupNorm(1, 256),
5566
  nn.Tanh(),
5567
  ),
5568
  nn.Sequential(
5569
  nn.Conv1d(256, 256, kernel_size=5, padding=2),
5570
+ nn.GroupNorm(1, 256),
5571
  nn.Tanh(),
5572
  ),
5573
  nn.Conv1d(256, n_mels, kernel_size=5, padding=2),
 
5775
  energy_pred = F.softplus(self.energy_predictor(x))
5776
 
5777
  # Determine output length
5778
+ # Note: We use GroupNorm instead of BatchNorm1d so any sequence length works
5779
+ MIN_MEL_LENGTH = 1
 
5780
  if target_length is not None:
5781
  mel_length = max(MIN_MEL_LENGTH, target_length)
5782
  else:
 
8612
 
8613
  def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
8614
  batch_size, seq_len, hidden_size = hidden_states.shape
8615
+ original_dtype = hidden_states.dtype
8616
  hidden_flat = hidden_states.view(-1, hidden_size)
8617
 
8618
  top_k_probs, top_k_indices, _ = self.router(hidden_states)
 
8627
  expert_input = hidden_flat[mask]
8628
  expert_output = expert(expert_input)
8629
  weight = top_k_probs[mask, k:k+1]
8630
+ weighted_output = (weight * expert_output).to(original_dtype)
8631
+ final_output[mask] = final_output[mask] + weighted_output
8632
 
8633
  shared_output = self.shared_expert(hidden_flat)
8634
+ final_output = final_output + shared_output.to(original_dtype)
8635
 
8636
  final_output = final_output.view(batch_size, seq_len, hidden_size)
8637
 
 
9131
  super().__init__()
9132
  self.config = config
9133
  self.device_map = device_map
9134
+
9135
+ # Check for model parallelism - only consider string device values
9136
+ # (device_map may contain metadata like 'training_gpus' list, 'dual_gpu_mode' bool)
9137
+ if device_map is not None:
9138
+ device_values = [v for v in device_map.values() if isinstance(v, str)]
9139
+ self._model_parallel = len(set(device_values)) > 1
9140
+ else:
9141
+ self._model_parallel = False
9142
 
9143
  print("\n" + "=" * 60)
9144
  print("🚀 BUILDING XORON-DEV MULTIMODAL MODEL")
 
9286
  def apply_model_parallel(self, device_map: Dict[str, str]):
9287
  """Apply Model Parallelism by placing components on different devices."""
9288
  self.device_map = device_map
9289
+
9290
+ # Check for model parallelism - only consider string device values
9291
+ # (device_map may contain metadata like 'training_gpus' list, 'dual_gpu_mode' bool)
9292
+ device_values = [v for v in device_map.values() if isinstance(v, str)]
9293
+ self._model_parallel = len(set(device_values)) > 1
9294
 
9295
  if not self._model_parallel:
9296
  print(" ℹ️ Single device - no model parallelism needed")
streaming_state.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 12,
3
- "unique_samples": 3029,
4
- "total_yields": 6058,
5
  "dataset_positions": {
6
- "WebSight": 186,
7
- "ScienceQA": 164,
8
- "InstructPix2Pix": 186,
9
- "Flickr8k": 186,
10
- "NewYorker": 186,
11
  "Football": 6,
12
- "MagicBrush": 186,
13
  "WildChat": 200,
14
  "Synth-ShellExecution": 200,
15
  "Midjourney-Prompts": 200,
@@ -113,20 +113,20 @@
113
  "OpenAssistant": 200
114
  },
115
  "image": {
116
- "WebSight": 186,
117
- "ScienceQA": 164,
118
- "InstructPix2Pix": 186,
119
- "Flickr8k": 186,
120
- "NewYorker": 186,
121
  "Football": 6,
122
- "MagicBrush": 186
123
  },
124
  "video": {},
125
  "audio": {}
126
  },
127
  "modality_counts": {
128
- "text": 3029,
129
- "image": 0,
130
  "video": 0,
131
  "audio": 0
132
  },
 
1
  {
2
+ "epoch": 19,
3
+ "unique_samples": 300,
4
+ "total_yields": 600,
5
  "dataset_positions": {
6
+ "WebSight": 386,
7
+ "ScienceQA": 364,
8
+ "InstructPix2Pix": 386,
9
+ "Flickr8k": 386,
10
+ "NewYorker": 386,
11
  "Football": 6,
12
+ "MagicBrush": 386,
13
  "WildChat": 200,
14
  "Synth-ShellExecution": 200,
15
  "Midjourney-Prompts": 200,
 
113
  "OpenAssistant": 200
114
  },
115
  "image": {
116
+ "WebSight": 386,
117
+ "ScienceQA": 364,
118
+ "InstructPix2Pix": 386,
119
+ "Flickr8k": 386,
120
+ "NewYorker": 386,
121
  "Football": 6,
122
+ "MagicBrush": 386
123
  },
124
  "video": {},
125
  "audio": {}
126
  },
127
  "modality_counts": {
128
+ "text": 0,
129
+ "image": 300,
130
  "video": 0,
131
  "audio": 0
132
  },
trainer_state.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
- "best_metric": 3.646694382440487,
4
  "epoch": 4,
5
  "epochs_completed": 4,
6
- "global_step": 1597,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
- "max_steps": 1597,
12
  "num_train_epochs": 4,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
@@ -16,16 +16,16 @@
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
 
19
  "llm",
20
  "cross_attention",
 
21
  "modality_markers"
22
  ],
23
  "frozen_components": [
24
- "vision",
25
  "video",
26
  "audio",
27
  "speech",
28
- "image_generation",
29
  "video_generation"
30
  ],
31
  "trial_name": null,
 
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
+ "best_metric": 3.0820325045382684,
4
  "epoch": 4,
5
  "epochs_completed": 4,
6
+ "global_step": 148,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
+ "max_steps": 148,
12
  "num_train_epochs": 4,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
 
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
19
+ "vision",
20
  "llm",
21
  "cross_attention",
22
+ "image_generation",
23
  "modality_markers"
24
  ],
25
  "frozen_components": [
 
26
  "video",
27
  "audio",
28
  "speech",
 
29
  "video_generation"
30
  ],
31
  "trial_name": null,
training_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9be0dff94c6d235091cae224c3e034a33fa84932af351d74ac37a512956c5486
3
- size 781495681
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17b71b1b8d8d73a29371b107d2020d349cf453a9089b49b44d1b5cb446fba74
3
+ size 1419723549