Tags: Any-to-Any · Transformers · Safetensors · English · xoron · multimodal · Mixture of Experts · text-to-image · image editing · image to video · text-to-video · video editing · text-to-speech · speech-to-text · speech-to-speech · image-to-text · video-to-text · agentic · tool-use · flow-matching · 3d-rope · titok · vidtok · dual-stream-attention · zero-shot-voice-cloning · bigvgan · snake-activation · multi-receptive-field-fusion · custom_code
Update model weights after training (epoch 1, loss 12.3738)
Files changed:
- audio_decoder.safetensors +1 -1
- audio_encoder.safetensors +1 -1
- audio_projector.safetensors +1 -1
- cross_attention.safetensors +1 -1
- generator.safetensors +1 -1
- llm.safetensors +1 -1
- modality_markers.safetensors +0 -0
- modeling_xoron.py +36 -4
- projector.safetensors +1 -1
- trainer_state.json +1 -1
- training_state.pt +1 -1
- video_encoder.safetensors +1 -1
- video_generator.safetensors +1 -1
- vision_encoder.safetensors +1 -1
- waveform_decoder.safetensors +1 -1
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:261d0afbb5e08b6b1900e3dea25eb42c412e5542bee5a4f0681a898ae9c8bcd8
 size 1458415836

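Every *.safetensors entry in this commit follows the same pattern: the file tracked in git is a three-line Git LFS pointer (version, oid, size), so retraining swaps the sha256 oid while the byte size stays identical because the architecture did not change. As a minimal sketch of how a downloaded blob can be checked against its pointer (the helper names and paths are illustrative, not part of this repo):

import hashlib

def parse_lfs_pointer(pointer_path):
    # A Git LFS pointer is three "key value" lines:
    # version <spec-url>, oid sha256:<hex>, size <bytes>.
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify_blob(blob_path, pointer_path):
    # Stream the blob through sha256 and compare against the pointer's oid.
    sha = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    expected = parse_lfs_pointer(pointer_path)["oid"].split(":", 1)[1]
    return sha.hexdigest() == expected
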
audio_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:30c6b7e43c61588099a04e970a49bd38fd73465ef129d8e39ed9a1e8c45aeecf
 size 466150140

audio_projector.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:728f8031a27e5eb3d42d6c7632872efcd0739f89bbddee2174bc9ed01776730b
 size 2099352

cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:27a688ad60e1a8efc783cfa66ad0ed5e9c4e74a0f5437e134ea717bdbb761eb3
 size 174191400

generator.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3c01fd06b809de1bc14f78c1fd5f2f2cac625db3b22f0b696d532e7442aee71a
 size 629440508

llm.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5f05d7774509a9338dd769956837f7b62c63ad4ab45a56fbb919230f51c876c6
 size 1506831304

modality_markers.safetensors CHANGED
Binary files a/modality_markers.safetensors and b/modality_markers.safetensors differ

modeling_xoron.py CHANGED
@@ -10189,15 +10189,31 @@ class XoronModel(XoronPreTrainedModel):
 
             # Handle vocab size mismatch for LLM component
             if comp_name == 'llm':
-                # Check if embed_tokens size differs
                 embed_key = 'model.embed_tokens.weight'
+                lm_head_key = 'lm_head.weight'
+
                 if embed_key in state_dict:
                     saved_vocab_size = state_dict[embed_key].shape[0]
+                    hidden_size = state_dict[embed_key].shape[1]
                     current_vocab_size = component.model.embed_tokens.weight.shape[0]
 
                     if saved_vocab_size != current_vocab_size:
                         print(f"  🔄 Resizing embeddings: {current_vocab_size} -> {saved_vocab_size}")
-
+                        # Manually resize embed_tokens
+                        new_embed = nn.Embedding(saved_vocab_size, hidden_size)
+                        new_embed.weight.data = state_dict[embed_key]
+                        component.model.embed_tokens = new_embed
+
+                        # Manually resize lm_head if present
+                        if lm_head_key in state_dict:
+                            new_lm_head = nn.Linear(hidden_size, saved_vocab_size, bias=False)
+                            new_lm_head.weight.data = state_dict[lm_head_key]
+                            component.lm_head = new_lm_head
+
+                        # Remove these keys from state_dict since we already loaded them
+                        del state_dict[embed_key]
+                        if lm_head_key in state_dict:
+                            del state_dict[lm_head_key]
 
             component.load_state_dict(state_dict, strict=False)
             print(f"  ✅ Loaded {comp_name}")
@@ -11105,15 +11121,31 @@ class XoronModel(XoronPreTrainedModel):
 
             # Handle vocab size mismatch for LLM component
             if comp_name == 'llm':
-                # Check if embed_tokens size differs
                 embed_key = 'model.embed_tokens.weight'
+                lm_head_key = 'lm_head.weight'
+
                 if embed_key in state_dict:
                     saved_vocab_size = state_dict[embed_key].shape[0]
+                    hidden_size = state_dict[embed_key].shape[1]
                     current_vocab_size = component.model.embed_tokens.weight.shape[0]
 
                     if saved_vocab_size != current_vocab_size:
                         print(f"  🔄 Resizing embeddings: {current_vocab_size} -> {saved_vocab_size}")
-
+                        # Manually resize embed_tokens
+                        new_embed = nn.Embedding(saved_vocab_size, hidden_size)
+                        new_embed.weight.data = state_dict[embed_key]
+                        component.model.embed_tokens = new_embed
+
+                        # Manually resize lm_head if present
+                        if lm_head_key in state_dict:
+                            new_lm_head = nn.Linear(hidden_size, saved_vocab_size, bias=False)
+                            new_lm_head.weight.data = state_dict[lm_head_key]
+                            component.lm_head = new_lm_head
+
+                        # Remove these keys from state_dict since we already loaded them
+                        del state_dict[embed_key]
+                        if lm_head_key in state_dict:
+                            del state_dict[lm_head_key]
 
             component.load_state_dict(state_dict, strict=False)
             print(f"  ✅ Loaded {comp_name}")

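The patch replaces a comment-only branch with an actual resize. The motivation: PyTorch's load_state_dict raises on tensor shape mismatches even with strict=False, so a checkpoint saved with a different vocabulary size cannot be loaded into a freshly built LLM component until embed_tokens and lm_head are rebuilt at the saved size and their keys removed from the state dict. A self-contained sketch of the same pattern (TinyLM and load_with_vocab_resize are hypothetical stand-ins, not this repo's API):

import torch
import torch.nn as nn

class TinyLM(nn.Module):
    # Hypothetical stand-in for the Xoron LLM component.
    def __init__(self, vocab_size=100, hidden_size=32):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

def load_with_vocab_resize(model, state_dict,
                           embed_key="embed_tokens.weight",
                           lm_head_key="lm_head.weight"):
    # Load a checkpoint whose vocab size may differ from the model's.
    if embed_key in state_dict:
        saved_vocab, hidden = state_dict[embed_key].shape
        if saved_vocab != model.embed_tokens.weight.shape[0]:
            # Rebuild embedding and head at the saved size, then pop the
            # keys so load_state_dict never sees the mismatched shapes.
            model.embed_tokens = nn.Embedding(saved_vocab, hidden)
            model.embed_tokens.weight.data = state_dict.pop(embed_key)
            if lm_head_key in state_dict:
                model.lm_head = nn.Linear(hidden, saved_vocab, bias=False)
                model.lm_head.weight.data = state_dict.pop(lm_head_key)
    model.load_state_dict(state_dict, strict=False)
    return model

# Checkpoint trained with a larger vocabulary than the fresh model.
ckpt = TinyLM(vocab_size=120).state_dict()
model = load_with_vocab_resize(TinyLM(vocab_size=100), ckpt)
assert model.embed_tokens.weight.shape[0] == 120
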
projector.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:899fc13cba54ecbf1eef6401108393ed0e9b8e4584e8d180947f87d4807c39dd
 size 52880664

trainer_state.json CHANGED
@@ -1,6 +1,6 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 12.
+  "best_metric": 12.373827934265137,
   "epoch": 1,
   "epochs_completed": 1,
   "global_step": 0,

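Since trainer_state.json is plain JSON, the restored best_metric can be sanity-checked against the commit title directly. A minimal read-back, assuming the checkpoint directory named in the file above exists locally:

import json

with open("/kaggle/working/xoron-final/trainer_state.json") as f:
    state = json.load(f)

# best_metric 12.373827934265137 rounds to the 12.3738 in the commit title.
print(state["best_metric"], state["epoch"], state["global_step"])
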
training_state.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:09c47de74291bae883f60d4986fe8dbc38a2c68de7574fdabc66ce46222ed711
 size 5143

video_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f70226e533706675adf13f72c46122854021d13fe388445bc4d6b7495fa64e3a
 size 1923089112

video_generator.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c68805a467c37a4b172786a99fb83589c8a4e40b7b7a40886176cf1fd2188dc5
 size 61574134

vision_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4ba3ed68a4082c90661c88d125f4dd6b40717652173dbdc31df0c8cc5fa6260a
 size 1000535480

waveform_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7416892f4786903c832800ef5baac11cf3a787979cf223afa39a00434b5c639f
 size 34681076