Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 4, loss 3.0820)
Browse files- audio_decoder.safetensors +2 -2
- audio_encoder.safetensors +2 -2
- cross_attention.safetensors +1 -1
- generator.safetensors +1 -1
- llm.safetensors +1 -1
- model.safetensors.index.json +1 -40
- modeling_xoron.py +32 -15
- streaming_state.json +17 -17
- trainer_state.json +5 -5
- training_state.pt +2 -2
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:840aaf132e4830dfcfa0634d27acab02841f8eb9fffbfe4f78377c1d50aa050a
|
| 3 |
+
size 1458410612
|
audio_encoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68b7ed71f13950d93d17d4152e38cdcdd5e1a157729f4615ee38072473e8c12a
|
| 3 |
+
size 466119380
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5343b1fe1af46ca860a51de6f3bd51d1843f70998850084f805c875aec2de030
|
| 3 |
size 174191400
|
generator.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 629440508
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:499cee360f74c21e9e08624abd739f8cd982b339117fbcae3ae09433cdbebc71
|
| 3 |
size 629440508
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506831304
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c2d81eae0ff676724bf38cf020b2e6317e609eb90d43150ffe91610e67864e7
|
| 3 |
size 1506831304
|
model.safetensors.index.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
-
"total_size":
|
| 4 |
"format": "components"
|
| 5 |
},
|
| 6 |
"weight_map": {
|
|
@@ -1951,23 +1951,14 @@
|
|
| 1951 |
"audio_encoder.speaker_encoder.frame_encoder.0.bias": "audio_encoder.safetensors",
|
| 1952 |
"audio_encoder.speaker_encoder.frame_encoder.2.weight": "audio_encoder.safetensors",
|
| 1953 |
"audio_encoder.speaker_encoder.frame_encoder.2.bias": "audio_encoder.safetensors",
|
| 1954 |
-
"audio_encoder.speaker_encoder.frame_encoder.2.running_mean": "audio_encoder.safetensors",
|
| 1955 |
-
"audio_encoder.speaker_encoder.frame_encoder.2.running_var": "audio_encoder.safetensors",
|
| 1956 |
-
"audio_encoder.speaker_encoder.frame_encoder.2.num_batches_tracked": "audio_encoder.safetensors",
|
| 1957 |
"audio_encoder.speaker_encoder.frame_encoder.3.weight": "audio_encoder.safetensors",
|
| 1958 |
"audio_encoder.speaker_encoder.frame_encoder.3.bias": "audio_encoder.safetensors",
|
| 1959 |
"audio_encoder.speaker_encoder.frame_encoder.5.weight": "audio_encoder.safetensors",
|
| 1960 |
"audio_encoder.speaker_encoder.frame_encoder.5.bias": "audio_encoder.safetensors",
|
| 1961 |
-
"audio_encoder.speaker_encoder.frame_encoder.5.running_mean": "audio_encoder.safetensors",
|
| 1962 |
-
"audio_encoder.speaker_encoder.frame_encoder.5.running_var": "audio_encoder.safetensors",
|
| 1963 |
-
"audio_encoder.speaker_encoder.frame_encoder.5.num_batches_tracked": "audio_encoder.safetensors",
|
| 1964 |
"audio_encoder.speaker_encoder.frame_encoder.6.weight": "audio_encoder.safetensors",
|
| 1965 |
"audio_encoder.speaker_encoder.frame_encoder.6.bias": "audio_encoder.safetensors",
|
| 1966 |
"audio_encoder.speaker_encoder.frame_encoder.8.weight": "audio_encoder.safetensors",
|
| 1967 |
"audio_encoder.speaker_encoder.frame_encoder.8.bias": "audio_encoder.safetensors",
|
| 1968 |
-
"audio_encoder.speaker_encoder.frame_encoder.8.running_mean": "audio_encoder.safetensors",
|
| 1969 |
-
"audio_encoder.speaker_encoder.frame_encoder.8.running_var": "audio_encoder.safetensors",
|
| 1970 |
-
"audio_encoder.speaker_encoder.frame_encoder.8.num_batches_tracked": "audio_encoder.safetensors",
|
| 1971 |
"audio_encoder.speaker_encoder.lstm.weight_ih_l0": "audio_encoder.safetensors",
|
| 1972 |
"audio_encoder.speaker_encoder.lstm.weight_hh_l0": "audio_encoder.safetensors",
|
| 1973 |
"audio_encoder.speaker_encoder.lstm.bias_ih_l0": "audio_encoder.safetensors",
|
|
@@ -2033,9 +2024,6 @@
|
|
| 2033 |
"audio_encoder.conformer_blocks.0.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2034 |
"audio_encoder.conformer_blocks.0.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2035 |
"audio_encoder.conformer_blocks.0.conv.batch_norm.bias": "audio_encoder.safetensors",
|
| 2036 |
-
"audio_encoder.conformer_blocks.0.conv.batch_norm.running_mean": "audio_encoder.safetensors",
|
| 2037 |
-
"audio_encoder.conformer_blocks.0.conv.batch_norm.running_var": "audio_encoder.safetensors",
|
| 2038 |
-
"audio_encoder.conformer_blocks.0.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
|
| 2039 |
"audio_encoder.conformer_blocks.0.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2040 |
"audio_encoder.conformer_blocks.0.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2041 |
"audio_encoder.conformer_blocks.0.ff2_norm.weight": "audio_encoder.safetensors",
|
|
@@ -2069,9 +2057,6 @@
|
|
| 2069 |
"audio_encoder.conformer_blocks.1.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2070 |
"audio_encoder.conformer_blocks.1.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2071 |
"audio_encoder.conformer_blocks.1.conv.batch_norm.bias": "audio_encoder.safetensors",
|
| 2072 |
-
"audio_encoder.conformer_blocks.1.conv.batch_norm.running_mean": "audio_encoder.safetensors",
|
| 2073 |
-
"audio_encoder.conformer_blocks.1.conv.batch_norm.running_var": "audio_encoder.safetensors",
|
| 2074 |
-
"audio_encoder.conformer_blocks.1.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
|
| 2075 |
"audio_encoder.conformer_blocks.1.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2076 |
"audio_encoder.conformer_blocks.1.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2077 |
"audio_encoder.conformer_blocks.1.ff2_norm.weight": "audio_encoder.safetensors",
|
|
@@ -2105,9 +2090,6 @@
|
|
| 2105 |
"audio_encoder.conformer_blocks.2.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2106 |
"audio_encoder.conformer_blocks.2.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2107 |
"audio_encoder.conformer_blocks.2.conv.batch_norm.bias": "audio_encoder.safetensors",
|
| 2108 |
-
"audio_encoder.conformer_blocks.2.conv.batch_norm.running_mean": "audio_encoder.safetensors",
|
| 2109 |
-
"audio_encoder.conformer_blocks.2.conv.batch_norm.running_var": "audio_encoder.safetensors",
|
| 2110 |
-
"audio_encoder.conformer_blocks.2.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
|
| 2111 |
"audio_encoder.conformer_blocks.2.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2112 |
"audio_encoder.conformer_blocks.2.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2113 |
"audio_encoder.conformer_blocks.2.ff2_norm.weight": "audio_encoder.safetensors",
|
|
@@ -2141,9 +2123,6 @@
|
|
| 2141 |
"audio_encoder.conformer_blocks.3.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2142 |
"audio_encoder.conformer_blocks.3.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2143 |
"audio_encoder.conformer_blocks.3.conv.batch_norm.bias": "audio_encoder.safetensors",
|
| 2144 |
-
"audio_encoder.conformer_blocks.3.conv.batch_norm.running_mean": "audio_encoder.safetensors",
|
| 2145 |
-
"audio_encoder.conformer_blocks.3.conv.batch_norm.running_var": "audio_encoder.safetensors",
|
| 2146 |
-
"audio_encoder.conformer_blocks.3.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
|
| 2147 |
"audio_encoder.conformer_blocks.3.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2148 |
"audio_encoder.conformer_blocks.3.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2149 |
"audio_encoder.conformer_blocks.3.ff2_norm.weight": "audio_encoder.safetensors",
|
|
@@ -2177,9 +2156,6 @@
|
|
| 2177 |
"audio_encoder.conformer_blocks.4.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2178 |
"audio_encoder.conformer_blocks.4.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2179 |
"audio_encoder.conformer_blocks.4.conv.batch_norm.bias": "audio_encoder.safetensors",
|
| 2180 |
-
"audio_encoder.conformer_blocks.4.conv.batch_norm.running_mean": "audio_encoder.safetensors",
|
| 2181 |
-
"audio_encoder.conformer_blocks.4.conv.batch_norm.running_var": "audio_encoder.safetensors",
|
| 2182 |
-
"audio_encoder.conformer_blocks.4.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
|
| 2183 |
"audio_encoder.conformer_blocks.4.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2184 |
"audio_encoder.conformer_blocks.4.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2185 |
"audio_encoder.conformer_blocks.4.ff2_norm.weight": "audio_encoder.safetensors",
|
|
@@ -2213,9 +2189,6 @@
|
|
| 2213 |
"audio_encoder.conformer_blocks.5.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2214 |
"audio_encoder.conformer_blocks.5.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2215 |
"audio_encoder.conformer_blocks.5.conv.batch_norm.bias": "audio_encoder.safetensors",
|
| 2216 |
-
"audio_encoder.conformer_blocks.5.conv.batch_norm.running_mean": "audio_encoder.safetensors",
|
| 2217 |
-
"audio_encoder.conformer_blocks.5.conv.batch_norm.running_var": "audio_encoder.safetensors",
|
| 2218 |
-
"audio_encoder.conformer_blocks.5.conv.batch_norm.num_batches_tracked": "audio_encoder.safetensors",
|
| 2219 |
"audio_encoder.conformer_blocks.5.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2220 |
"audio_encoder.conformer_blocks.5.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2221 |
"audio_encoder.conformer_blocks.5.ff2_norm.weight": "audio_encoder.safetensors",
|
|
@@ -2588,30 +2561,18 @@
|
|
| 2588 |
"audio_decoder.postnet.0.0.bias": "audio_decoder.safetensors",
|
| 2589 |
"audio_decoder.postnet.0.1.weight": "audio_decoder.safetensors",
|
| 2590 |
"audio_decoder.postnet.0.1.bias": "audio_decoder.safetensors",
|
| 2591 |
-
"audio_decoder.postnet.0.1.running_mean": "audio_decoder.safetensors",
|
| 2592 |
-
"audio_decoder.postnet.0.1.running_var": "audio_decoder.safetensors",
|
| 2593 |
-
"audio_decoder.postnet.0.1.num_batches_tracked": "audio_decoder.safetensors",
|
| 2594 |
"audio_decoder.postnet.1.0.weight": "audio_decoder.safetensors",
|
| 2595 |
"audio_decoder.postnet.1.0.bias": "audio_decoder.safetensors",
|
| 2596 |
"audio_decoder.postnet.1.1.weight": "audio_decoder.safetensors",
|
| 2597 |
"audio_decoder.postnet.1.1.bias": "audio_decoder.safetensors",
|
| 2598 |
-
"audio_decoder.postnet.1.1.running_mean": "audio_decoder.safetensors",
|
| 2599 |
-
"audio_decoder.postnet.1.1.running_var": "audio_decoder.safetensors",
|
| 2600 |
-
"audio_decoder.postnet.1.1.num_batches_tracked": "audio_decoder.safetensors",
|
| 2601 |
"audio_decoder.postnet.2.0.weight": "audio_decoder.safetensors",
|
| 2602 |
"audio_decoder.postnet.2.0.bias": "audio_decoder.safetensors",
|
| 2603 |
"audio_decoder.postnet.2.1.weight": "audio_decoder.safetensors",
|
| 2604 |
"audio_decoder.postnet.2.1.bias": "audio_decoder.safetensors",
|
| 2605 |
-
"audio_decoder.postnet.2.1.running_mean": "audio_decoder.safetensors",
|
| 2606 |
-
"audio_decoder.postnet.2.1.running_var": "audio_decoder.safetensors",
|
| 2607 |
-
"audio_decoder.postnet.2.1.num_batches_tracked": "audio_decoder.safetensors",
|
| 2608 |
"audio_decoder.postnet.3.0.weight": "audio_decoder.safetensors",
|
| 2609 |
"audio_decoder.postnet.3.0.bias": "audio_decoder.safetensors",
|
| 2610 |
"audio_decoder.postnet.3.1.weight": "audio_decoder.safetensors",
|
| 2611 |
"audio_decoder.postnet.3.1.bias": "audio_decoder.safetensors",
|
| 2612 |
-
"audio_decoder.postnet.3.1.running_mean": "audio_decoder.safetensors",
|
| 2613 |
-
"audio_decoder.postnet.3.1.running_var": "audio_decoder.safetensors",
|
| 2614 |
-
"audio_decoder.postnet.3.1.num_batches_tracked": "audio_decoder.safetensors",
|
| 2615 |
"audio_decoder.postnet.4.weight": "audio_decoder.safetensors",
|
| 2616 |
"audio_decoder.postnet.4.bias": "audio_decoder.safetensors",
|
| 2617 |
"audio_decoder.waveform_decoder.input_proj.bias": "audio_decoder.safetensors",
|
|
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
+
"total_size": 7309365038,
|
| 4 |
"format": "components"
|
| 5 |
},
|
| 6 |
"weight_map": {
|
|
|
|
| 1951 |
"audio_encoder.speaker_encoder.frame_encoder.0.bias": "audio_encoder.safetensors",
|
| 1952 |
"audio_encoder.speaker_encoder.frame_encoder.2.weight": "audio_encoder.safetensors",
|
| 1953 |
"audio_encoder.speaker_encoder.frame_encoder.2.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 1954 |
"audio_encoder.speaker_encoder.frame_encoder.3.weight": "audio_encoder.safetensors",
|
| 1955 |
"audio_encoder.speaker_encoder.frame_encoder.3.bias": "audio_encoder.safetensors",
|
| 1956 |
"audio_encoder.speaker_encoder.frame_encoder.5.weight": "audio_encoder.safetensors",
|
| 1957 |
"audio_encoder.speaker_encoder.frame_encoder.5.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 1958 |
"audio_encoder.speaker_encoder.frame_encoder.6.weight": "audio_encoder.safetensors",
|
| 1959 |
"audio_encoder.speaker_encoder.frame_encoder.6.bias": "audio_encoder.safetensors",
|
| 1960 |
"audio_encoder.speaker_encoder.frame_encoder.8.weight": "audio_encoder.safetensors",
|
| 1961 |
"audio_encoder.speaker_encoder.frame_encoder.8.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 1962 |
"audio_encoder.speaker_encoder.lstm.weight_ih_l0": "audio_encoder.safetensors",
|
| 1963 |
"audio_encoder.speaker_encoder.lstm.weight_hh_l0": "audio_encoder.safetensors",
|
| 1964 |
"audio_encoder.speaker_encoder.lstm.bias_ih_l0": "audio_encoder.safetensors",
|
|
|
|
| 2024 |
"audio_encoder.conformer_blocks.0.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2025 |
"audio_encoder.conformer_blocks.0.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2026 |
"audio_encoder.conformer_blocks.0.conv.batch_norm.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2027 |
"audio_encoder.conformer_blocks.0.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2028 |
"audio_encoder.conformer_blocks.0.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2029 |
"audio_encoder.conformer_blocks.0.ff2_norm.weight": "audio_encoder.safetensors",
|
|
|
|
| 2057 |
"audio_encoder.conformer_blocks.1.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2058 |
"audio_encoder.conformer_blocks.1.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2059 |
"audio_encoder.conformer_blocks.1.conv.batch_norm.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2060 |
"audio_encoder.conformer_blocks.1.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2061 |
"audio_encoder.conformer_blocks.1.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2062 |
"audio_encoder.conformer_blocks.1.ff2_norm.weight": "audio_encoder.safetensors",
|
|
|
|
| 2090 |
"audio_encoder.conformer_blocks.2.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2091 |
"audio_encoder.conformer_blocks.2.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2092 |
"audio_encoder.conformer_blocks.2.conv.batch_norm.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2093 |
"audio_encoder.conformer_blocks.2.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2094 |
"audio_encoder.conformer_blocks.2.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2095 |
"audio_encoder.conformer_blocks.2.ff2_norm.weight": "audio_encoder.safetensors",
|
|
|
|
| 2123 |
"audio_encoder.conformer_blocks.3.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2124 |
"audio_encoder.conformer_blocks.3.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2125 |
"audio_encoder.conformer_blocks.3.conv.batch_norm.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2126 |
"audio_encoder.conformer_blocks.3.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2127 |
"audio_encoder.conformer_blocks.3.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2128 |
"audio_encoder.conformer_blocks.3.ff2_norm.weight": "audio_encoder.safetensors",
|
|
|
|
| 2156 |
"audio_encoder.conformer_blocks.4.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2157 |
"audio_encoder.conformer_blocks.4.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2158 |
"audio_encoder.conformer_blocks.4.conv.batch_norm.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2159 |
"audio_encoder.conformer_blocks.4.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2160 |
"audio_encoder.conformer_blocks.4.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2161 |
"audio_encoder.conformer_blocks.4.ff2_norm.weight": "audio_encoder.safetensors",
|
|
|
|
| 2189 |
"audio_encoder.conformer_blocks.5.conv.depthwise_conv.bias": "audio_encoder.safetensors",
|
| 2190 |
"audio_encoder.conformer_blocks.5.conv.batch_norm.weight": "audio_encoder.safetensors",
|
| 2191 |
"audio_encoder.conformer_blocks.5.conv.batch_norm.bias": "audio_encoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2192 |
"audio_encoder.conformer_blocks.5.conv.pointwise_conv2.weight": "audio_encoder.safetensors",
|
| 2193 |
"audio_encoder.conformer_blocks.5.conv.pointwise_conv2.bias": "audio_encoder.safetensors",
|
| 2194 |
"audio_encoder.conformer_blocks.5.ff2_norm.weight": "audio_encoder.safetensors",
|
|
|
|
| 2561 |
"audio_decoder.postnet.0.0.bias": "audio_decoder.safetensors",
|
| 2562 |
"audio_decoder.postnet.0.1.weight": "audio_decoder.safetensors",
|
| 2563 |
"audio_decoder.postnet.0.1.bias": "audio_decoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2564 |
"audio_decoder.postnet.1.0.weight": "audio_decoder.safetensors",
|
| 2565 |
"audio_decoder.postnet.1.0.bias": "audio_decoder.safetensors",
|
| 2566 |
"audio_decoder.postnet.1.1.weight": "audio_decoder.safetensors",
|
| 2567 |
"audio_decoder.postnet.1.1.bias": "audio_decoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2568 |
"audio_decoder.postnet.2.0.weight": "audio_decoder.safetensors",
|
| 2569 |
"audio_decoder.postnet.2.0.bias": "audio_decoder.safetensors",
|
| 2570 |
"audio_decoder.postnet.2.1.weight": "audio_decoder.safetensors",
|
| 2571 |
"audio_decoder.postnet.2.1.bias": "audio_decoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2572 |
"audio_decoder.postnet.3.0.weight": "audio_decoder.safetensors",
|
| 2573 |
"audio_decoder.postnet.3.0.bias": "audio_decoder.safetensors",
|
| 2574 |
"audio_decoder.postnet.3.1.weight": "audio_decoder.safetensors",
|
| 2575 |
"audio_decoder.postnet.3.1.bias": "audio_decoder.safetensors",
|
|
|
|
|
|
|
|
|
|
| 2576 |
"audio_decoder.postnet.4.weight": "audio_decoder.safetensors",
|
| 2577 |
"audio_decoder.postnet.4.bias": "audio_decoder.safetensors",
|
| 2578 |
"audio_decoder.waveform_decoder.input_proj.bias": "audio_decoder.safetensors",
|
modeling_xoron.py
CHANGED
|
@@ -4371,16 +4371,18 @@ class SpeakerEncoder(nn.Module):
|
|
| 4371 |
self.output_size = output_size
|
| 4372 |
|
| 4373 |
# Frame-level encoder
|
|
|
|
|
|
|
| 4374 |
self.frame_encoder = nn.Sequential(
|
| 4375 |
nn.Conv1d(80, hidden_size, 5, 1, 2),
|
| 4376 |
nn.ReLU(),
|
| 4377 |
-
nn.
|
| 4378 |
nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
|
| 4379 |
nn.ReLU(),
|
| 4380 |
-
nn.
|
| 4381 |
nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
|
| 4382 |
nn.ReLU(),
|
| 4383 |
-
nn.
|
| 4384 |
)
|
| 4385 |
|
| 4386 |
# LSTM for temporal modeling
|
|
@@ -4853,7 +4855,8 @@ class ConvolutionModule(nn.Module):
|
|
| 4853 |
channels, channels, kernel_size=kernel_size,
|
| 4854 |
padding=(kernel_size - 1) // 2, groups=channels
|
| 4855 |
)
|
| 4856 |
-
|
|
|
|
| 4857 |
self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)
|
| 4858 |
self.dropout = nn.Dropout(dropout)
|
| 4859 |
|
|
@@ -5544,25 +5547,27 @@ class AudioDecoder(nn.Module):
|
|
| 5544 |
self.mel_linear = nn.Linear(hidden_size, n_mels)
|
| 5545 |
|
| 5546 |
# Postnet
|
|
|
|
|
|
|
| 5547 |
self.postnet = nn.ModuleList([
|
| 5548 |
nn.Sequential(
|
| 5549 |
nn.Conv1d(n_mels, 256, kernel_size=5, padding=2),
|
| 5550 |
-
nn.
|
| 5551 |
nn.Tanh(),
|
| 5552 |
),
|
| 5553 |
nn.Sequential(
|
| 5554 |
nn.Conv1d(256, 256, kernel_size=5, padding=2),
|
| 5555 |
-
nn.
|
| 5556 |
nn.Tanh(),
|
| 5557 |
),
|
| 5558 |
nn.Sequential(
|
| 5559 |
nn.Conv1d(256, 256, kernel_size=5, padding=2),
|
| 5560 |
-
nn.
|
| 5561 |
nn.Tanh(),
|
| 5562 |
),
|
| 5563 |
nn.Sequential(
|
| 5564 |
nn.Conv1d(256, 256, kernel_size=5, padding=2),
|
| 5565 |
-
nn.
|
| 5566 |
nn.Tanh(),
|
| 5567 |
),
|
| 5568 |
nn.Conv1d(256, n_mels, kernel_size=5, padding=2),
|
|
@@ -5770,9 +5775,8 @@ class AudioDecoder(nn.Module):
|
|
| 5770 |
energy_pred = F.softplus(self.energy_predictor(x))
|
| 5771 |
|
| 5772 |
# Determine output length
|
| 5773 |
-
#
|
| 5774 |
-
|
| 5775 |
-
MIN_MEL_LENGTH = 2
|
| 5776 |
if target_length is not None:
|
| 5777 |
mel_length = max(MIN_MEL_LENGTH, target_length)
|
| 5778 |
else:
|
|
@@ -8608,6 +8612,7 @@ class AuxLosslessMoELayer(nn.Module):
|
|
| 8608 |
|
| 8609 |
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 8610 |
batch_size, seq_len, hidden_size = hidden_states.shape
|
|
|
|
| 8611 |
hidden_flat = hidden_states.view(-1, hidden_size)
|
| 8612 |
|
| 8613 |
top_k_probs, top_k_indices, _ = self.router(hidden_states)
|
|
@@ -8622,10 +8627,11 @@ class AuxLosslessMoELayer(nn.Module):
|
|
| 8622 |
expert_input = hidden_flat[mask]
|
| 8623 |
expert_output = expert(expert_input)
|
| 8624 |
weight = top_k_probs[mask, k:k+1]
|
| 8625 |
-
|
|
|
|
| 8626 |
|
| 8627 |
shared_output = self.shared_expert(hidden_flat)
|
| 8628 |
-
final_output = final_output + shared_output
|
| 8629 |
|
| 8630 |
final_output = final_output.view(batch_size, seq_len, hidden_size)
|
| 8631 |
|
|
@@ -9125,7 +9131,14 @@ class XoronMultimodalModel(nn.Module):
|
|
| 9125 |
super().__init__()
|
| 9126 |
self.config = config
|
| 9127 |
self.device_map = device_map
|
| 9128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9129 |
|
| 9130 |
print("\n" + "=" * 60)
|
| 9131 |
print("🚀 BUILDING XORON-DEV MULTIMODAL MODEL")
|
|
@@ -9273,7 +9286,11 @@ class XoronMultimodalModel(nn.Module):
|
|
| 9273 |
def apply_model_parallel(self, device_map: Dict[str, str]):
|
| 9274 |
"""Apply Model Parallelism by placing components on different devices."""
|
| 9275 |
self.device_map = device_map
|
| 9276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9277 |
|
| 9278 |
if not self._model_parallel:
|
| 9279 |
print(" ℹ️ Single device - no model parallelism needed")
|
|
|
|
| 4371 |
self.output_size = output_size
|
| 4372 |
|
| 4373 |
# Frame-level encoder
|
| 4374 |
+
# Use GroupNorm instead of BatchNorm1d to handle sequence length of 1
|
| 4375 |
+
# GroupNorm(1, C) is equivalent to LayerNorm and works with any batch/seq size
|
| 4376 |
self.frame_encoder = nn.Sequential(
|
| 4377 |
nn.Conv1d(80, hidden_size, 5, 1, 2),
|
| 4378 |
nn.ReLU(),
|
| 4379 |
+
nn.GroupNorm(1, hidden_size),
|
| 4380 |
nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
|
| 4381 |
nn.ReLU(),
|
| 4382 |
+
nn.GroupNorm(1, hidden_size),
|
| 4383 |
nn.Conv1d(hidden_size, hidden_size, 5, 1, 2),
|
| 4384 |
nn.ReLU(),
|
| 4385 |
+
nn.GroupNorm(1, hidden_size),
|
| 4386 |
)
|
| 4387 |
|
| 4388 |
# LSTM for temporal modeling
|
|
|
|
| 4855 |
channels, channels, kernel_size=kernel_size,
|
| 4856 |
padding=(kernel_size - 1) // 2, groups=channels
|
| 4857 |
)
|
| 4858 |
+
# Use GroupNorm instead of BatchNorm1d to handle sequence length of 1
|
| 4859 |
+
self.batch_norm = nn.GroupNorm(1, channels)
|
| 4860 |
self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)
|
| 4861 |
self.dropout = nn.Dropout(dropout)
|
| 4862 |
|
|
|
|
| 5547 |
self.mel_linear = nn.Linear(hidden_size, n_mels)
|
| 5548 |
|
| 5549 |
# Postnet
|
| 5550 |
+
# Use GroupNorm instead of BatchNorm1d to handle sequence length of 1
|
| 5551 |
+
# GroupNorm(1, C) is equivalent to LayerNorm and works with any batch/seq size
|
| 5552 |
self.postnet = nn.ModuleList([
|
| 5553 |
nn.Sequential(
|
| 5554 |
nn.Conv1d(n_mels, 256, kernel_size=5, padding=2),
|
| 5555 |
+
nn.GroupNorm(1, 256),
|
| 5556 |
nn.Tanh(),
|
| 5557 |
),
|
| 5558 |
nn.Sequential(
|
| 5559 |
nn.Conv1d(256, 256, kernel_size=5, padding=2),
|
| 5560 |
+
nn.GroupNorm(1, 256),
|
| 5561 |
nn.Tanh(),
|
| 5562 |
),
|
| 5563 |
nn.Sequential(
|
| 5564 |
nn.Conv1d(256, 256, kernel_size=5, padding=2),
|
| 5565 |
+
nn.GroupNorm(1, 256),
|
| 5566 |
nn.Tanh(),
|
| 5567 |
),
|
| 5568 |
nn.Sequential(
|
| 5569 |
nn.Conv1d(256, 256, kernel_size=5, padding=2),
|
| 5570 |
+
nn.GroupNorm(1, 256),
|
| 5571 |
nn.Tanh(),
|
| 5572 |
),
|
| 5573 |
nn.Conv1d(256, n_mels, kernel_size=5, padding=2),
|
|
|
|
| 5775 |
energy_pred = F.softplus(self.energy_predictor(x))
|
| 5776 |
|
| 5777 |
# Determine output length
|
| 5778 |
+
# Note: We use GroupNorm instead of BatchNorm1d so any sequence length works
|
| 5779 |
+
MIN_MEL_LENGTH = 1
|
|
|
|
| 5780 |
if target_length is not None:
|
| 5781 |
mel_length = max(MIN_MEL_LENGTH, target_length)
|
| 5782 |
else:
|
|
|
|
| 8612 |
|
| 8613 |
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 8614 |
batch_size, seq_len, hidden_size = hidden_states.shape
|
| 8615 |
+
original_dtype = hidden_states.dtype
|
| 8616 |
hidden_flat = hidden_states.view(-1, hidden_size)
|
| 8617 |
|
| 8618 |
top_k_probs, top_k_indices, _ = self.router(hidden_states)
|
|
|
|
| 8627 |
expert_input = hidden_flat[mask]
|
| 8628 |
expert_output = expert(expert_input)
|
| 8629 |
weight = top_k_probs[mask, k:k+1]
|
| 8630 |
+
weighted_output = (weight * expert_output).to(original_dtype)
|
| 8631 |
+
final_output[mask] = final_output[mask] + weighted_output
|
| 8632 |
|
| 8633 |
shared_output = self.shared_expert(hidden_flat)
|
| 8634 |
+
final_output = final_output + shared_output.to(original_dtype)
|
| 8635 |
|
| 8636 |
final_output = final_output.view(batch_size, seq_len, hidden_size)
|
| 8637 |
|
|
|
|
| 9131 |
super().__init__()
|
| 9132 |
self.config = config
|
| 9133 |
self.device_map = device_map
|
| 9134 |
+
|
| 9135 |
+
# Check for model parallelism - only consider string device values
|
| 9136 |
+
# (device_map may contain metadata like 'training_gpus' list, 'dual_gpu_mode' bool)
|
| 9137 |
+
if device_map is not None:
|
| 9138 |
+
device_values = [v for v in device_map.values() if isinstance(v, str)]
|
| 9139 |
+
self._model_parallel = len(set(device_values)) > 1
|
| 9140 |
+
else:
|
| 9141 |
+
self._model_parallel = False
|
| 9142 |
|
| 9143 |
print("\n" + "=" * 60)
|
| 9144 |
print("🚀 BUILDING XORON-DEV MULTIMODAL MODEL")
|
|
|
|
| 9286 |
def apply_model_parallel(self, device_map: Dict[str, str]):
|
| 9287 |
"""Apply Model Parallelism by placing components on different devices."""
|
| 9288 |
self.device_map = device_map
|
| 9289 |
+
|
| 9290 |
+
# Check for model parallelism - only consider string device values
|
| 9291 |
+
# (device_map may contain metadata like 'training_gpus' list, 'dual_gpu_mode' bool)
|
| 9292 |
+
device_values = [v for v in device_map.values() if isinstance(v, str)]
|
| 9293 |
+
self._model_parallel = len(set(device_values)) > 1
|
| 9294 |
|
| 9295 |
if not self._model_parallel:
|
| 9296 |
print(" ℹ️ Single device - no model parallelism needed")
|
streaming_state.json
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
-
"unique_samples":
|
| 4 |
-
"total_yields":
|
| 5 |
"dataset_positions": {
|
| 6 |
-
"WebSight":
|
| 7 |
-
"ScienceQA":
|
| 8 |
-
"InstructPix2Pix":
|
| 9 |
-
"Flickr8k":
|
| 10 |
-
"NewYorker":
|
| 11 |
"Football": 6,
|
| 12 |
-
"MagicBrush":
|
| 13 |
"WildChat": 200,
|
| 14 |
"Synth-ShellExecution": 200,
|
| 15 |
"Midjourney-Prompts": 200,
|
|
@@ -113,20 +113,20 @@
|
|
| 113 |
"OpenAssistant": 200
|
| 114 |
},
|
| 115 |
"image": {
|
| 116 |
-
"WebSight":
|
| 117 |
-
"ScienceQA":
|
| 118 |
-
"InstructPix2Pix":
|
| 119 |
-
"Flickr8k":
|
| 120 |
-
"NewYorker":
|
| 121 |
"Football": 6,
|
| 122 |
-
"MagicBrush":
|
| 123 |
},
|
| 124 |
"video": {},
|
| 125 |
"audio": {}
|
| 126 |
},
|
| 127 |
"modality_counts": {
|
| 128 |
-
"text":
|
| 129 |
-
"image":
|
| 130 |
"video": 0,
|
| 131 |
"audio": 0
|
| 132 |
},
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 19,
|
| 3 |
+
"unique_samples": 300,
|
| 4 |
+
"total_yields": 600,
|
| 5 |
"dataset_positions": {
|
| 6 |
+
"WebSight": 386,
|
| 7 |
+
"ScienceQA": 364,
|
| 8 |
+
"InstructPix2Pix": 386,
|
| 9 |
+
"Flickr8k": 386,
|
| 10 |
+
"NewYorker": 386,
|
| 11 |
"Football": 6,
|
| 12 |
+
"MagicBrush": 386,
|
| 13 |
"WildChat": 200,
|
| 14 |
"Synth-ShellExecution": 200,
|
| 15 |
"Midjourney-Prompts": 200,
|
|
|
|
| 113 |
"OpenAssistant": 200
|
| 114 |
},
|
| 115 |
"image": {
|
| 116 |
+
"WebSight": 386,
|
| 117 |
+
"ScienceQA": 364,
|
| 118 |
+
"InstructPix2Pix": 386,
|
| 119 |
+
"Flickr8k": 386,
|
| 120 |
+
"NewYorker": 386,
|
| 121 |
"Football": 6,
|
| 122 |
+
"MagicBrush": 386
|
| 123 |
},
|
| 124 |
"video": {},
|
| 125 |
"audio": {}
|
| 126 |
},
|
| 127 |
"modality_counts": {
|
| 128 |
+
"text": 0,
|
| 129 |
+
"image": 300,
|
| 130 |
"video": 0,
|
| 131 |
"audio": 0
|
| 132 |
},
|
trainer_state.json
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric": 3.
|
| 4 |
"epoch": 4,
|
| 5 |
"epochs_completed": 4,
|
| 6 |
-
"global_step":
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
-
"max_steps":
|
| 12 |
"num_train_epochs": 4,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
|
@@ -16,16 +16,16 @@
|
|
| 16 |
"learning_rate": 0.0001,
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
|
|
|
| 19 |
"llm",
|
| 20 |
"cross_attention",
|
|
|
|
| 21 |
"modality_markers"
|
| 22 |
],
|
| 23 |
"frozen_components": [
|
| 24 |
-
"vision",
|
| 25 |
"video",
|
| 26 |
"audio",
|
| 27 |
"speech",
|
| 28 |
-
"image_generation",
|
| 29 |
"video_generation"
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 3.0820325045382684,
|
| 4 |
"epoch": 4,
|
| 5 |
"epochs_completed": 4,
|
| 6 |
+
"global_step": 148,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
+
"max_steps": 148,
|
| 12 |
"num_train_epochs": 4,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
|
|
|
| 16 |
"learning_rate": 0.0001,
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
| 19 |
+
"vision",
|
| 20 |
"llm",
|
| 21 |
"cross_attention",
|
| 22 |
+
"image_generation",
|
| 23 |
"modality_markers"
|
| 24 |
],
|
| 25 |
"frozen_components": [
|
|
|
|
| 26 |
"video",
|
| 27 |
"audio",
|
| 28 |
"speech",
|
|
|
|
| 29 |
"video_generation"
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d17b71b1b8d8d73a29371b107d2020d349cf453a9089b49b44d1b5cb446fba74
|
| 3 |
+
size 1419723549
|