Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 1, loss 12.6406)
Browse files
- audio_decoder.safetensors +1 -1
- audio_encoder.safetensors +1 -1
- audio_projector.safetensors +1 -1
- cross_attention.safetensors +1 -1
- generator.safetensors +1 -1
- llm.safetensors +1 -1
- modality_markers.safetensors +0 -0
- modeling_xoron.py +26 -0
- projector.safetensors +1 -1
- streaming_state.json +2 -2
- trainer_state.json +1 -1
- training_state.pt +1 -1
- video_encoder.safetensors +1 -1
- video_generator.safetensors +1 -1
- vision_encoder.safetensors +1 -1
- waveform_decoder.safetensors +1 -1
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1458415836
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69fb66efd2fcf1cca60ed861cab7e732be1f5afd1da828c8e756ecdbeaba07e4
|
| 3 |
size 1458415836
|
audio_encoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 466150140
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12e4401d2925cec8a9b29973f98528943b9ccd70107fb8d90baacd11a897b051
|
| 3 |
size 466150140
|
audio_projector.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2099352
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d836099185cc6b6e01afdc72679fb120799bc0e3109a86779d04de093ee5b4f2
|
| 3 |
size 2099352
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8d2d5d278130bf1d488e55783917dd6f62c1c5d1c30467339dba224b2ac890c
|
| 3 |
size 174191400
|
generator.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 629440508
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e586a0ad33c4d788f5d915cda9ade7766844890c1fea0d91d514644c8b415f6
|
| 3 |
size 629440508
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506831304
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:076b2a9ee201e21b04fa3113d34fb156cba0ac55c6cfbcc8c0bd79d393f00aca
|
| 3 |
size 1506831304
|
modality_markers.safetensors
CHANGED
|
Binary files a/modality_markers.safetensors and b/modality_markers.safetensors differ
|
|
|
modeling_xoron.py
CHANGED
|
@@ -10186,6 +10186,19 @@ class XoronModel(XoronPreTrainedModel):
|
|
| 10186 |
if component is not None:
|
| 10187 |
with safe_open(comp_path, framework="pt") as f:
|
| 10188 |
state_dict = {k: f.get_tensor(k) for k in f.keys()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10189 |
component.load_state_dict(state_dict, strict=False)
|
| 10190 |
print(f" ✅ Loaded {comp_name}")
|
| 10191 |
|
|
@@ -11089,6 +11102,19 @@ class XoronModel(XoronPreTrainedModel):
|
|
| 11089 |
if component is not None:
|
| 11090 |
with safe_open(comp_path, framework="pt") as f:
|
| 11091 |
state_dict = {k: f.get_tensor(k) for k in f.keys()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11092 |
component.load_state_dict(state_dict, strict=False)
|
| 11093 |
print(f" ✅ Loaded {comp_name}")
|
| 11094 |
|
|
|
|
| 10186 |
if component is not None:
|
| 10187 |
with safe_open(comp_path, framework="pt") as f:
|
| 10188 |
state_dict = {k: f.get_tensor(k) for k in f.keys()}
|
| 10189 |
+
|
| 10190 |
+
# Handle vocab size mismatch for LLM component
|
| 10191 |
+
if comp_name == 'llm':
|
| 10192 |
+
# Check if embed_tokens size differs
|
| 10193 |
+
embed_key = 'model.embed_tokens.weight'
|
| 10194 |
+
if embed_key in state_dict:
|
| 10195 |
+
saved_vocab_size = state_dict[embed_key].shape[0]
|
| 10196 |
+
current_vocab_size = component.model.embed_tokens.weight.shape[0]
|
| 10197 |
+
|
| 10198 |
+
if saved_vocab_size != current_vocab_size:
|
| 10199 |
+
print(f" ๐ Resizing embeddings: {current_vocab_size} -> {saved_vocab_size}")
|
| 10200 |
+
component.resize_token_embeddings(saved_vocab_size)
|
| 10201 |
+
|
| 10202 |
component.load_state_dict(state_dict, strict=False)
|
| 10203 |
print(f" ✅ Loaded {comp_name}")
|
| 10204 |
|
|
|
|
| 11102 |
if component is not None:
|
| 11103 |
with safe_open(comp_path, framework="pt") as f:
|
| 11104 |
state_dict = {k: f.get_tensor(k) for k in f.keys()}
|
| 11105 |
+
|
| 11106 |
+
# Handle vocab size mismatch for LLM component
|
| 11107 |
+
if comp_name == 'llm':
|
| 11108 |
+
# Check if embed_tokens size differs
|
| 11109 |
+
embed_key = 'model.embed_tokens.weight'
|
| 11110 |
+
if embed_key in state_dict:
|
| 11111 |
+
saved_vocab_size = state_dict[embed_key].shape[0]
|
| 11112 |
+
current_vocab_size = component.model.embed_tokens.weight.shape[0]
|
| 11113 |
+
|
| 11114 |
+
if saved_vocab_size != current_vocab_size:
|
| 11115 |
+
print(f" ๐ Resizing embeddings: {current_vocab_size} -> {saved_vocab_size}")
|
| 11116 |
+
component.resize_token_embeddings(saved_vocab_size)
|
| 11117 |
+
|
| 11118 |
component.load_state_dict(state_dict, strict=False)
|
| 11119 |
print(f" ✅ Loaded {comp_name}")
|
| 11120 |
|
projector.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 52880664
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa7c80f806f46f1616d3acfe6048b56e597e36027b7700eefd9ca13e5b868da9
|
| 3 |
size 52880664
|
streaming_state.json
CHANGED
|
@@ -3,12 +3,12 @@
|
|
| 3 |
"unique_samples": 1,
|
| 4 |
"total_yields": 2,
|
| 5 |
"dataset_positions": {
|
| 6 |
-
"
|
| 7 |
},
|
| 8 |
"modality_positions": {
|
| 9 |
"text": {},
|
| 10 |
"image": {
|
| 11 |
-
"
|
| 12 |
},
|
| 13 |
"video": {},
|
| 14 |
"audio": {}
|
|
|
|
| 3 |
"unique_samples": 1,
|
| 4 |
"total_yields": 2,
|
| 5 |
"dataset_positions": {
|
| 6 |
+
"InstructPix2Pix": 1
|
| 7 |
},
|
| 8 |
"modality_positions": {
|
| 9 |
"text": {},
|
| 10 |
"image": {
|
| 11 |
+
"InstructPix2Pix": 1
|
| 12 |
},
|
| 13 |
"video": {},
|
| 14 |
"audio": {}
|
trainer_state.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric": 12.
|
| 4 |
"epoch": 1,
|
| 5 |
"epochs_completed": 1,
|
| 6 |
"global_step": 0,
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 12.640625,
|
| 4 |
"epoch": 1,
|
| 5 |
"epochs_completed": 1,
|
| 6 |
"global_step": 0,
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5143
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9e348eb62b15d8fe7418f5c527b62ddecd4e918c36b6ebce28a035479b9432e
|
| 3 |
size 5143
|
video_encoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1923089112
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f72e26338eb6855e72508d1a57a3d038d63c84ab92a9448307ad5b3430393bca
|
| 3 |
size 1923089112
|
video_generator.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 61574134
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1aa2bef313e63c8c8dbead2f22c6e044abd56883629f7504e54953e37e62f00b
|
| 3 |
size 61574134
|
vision_encoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1000535480
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc7028344414ab330bb1b2d321f2dd2e828f77a13da8b68c038a4790e170df2b
|
| 3 |
size 1000535480
|
waveform_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 34681076
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bff6c300b238aa47a1673e2ffd1f65c029916e9271cd0952b3d6e873b11837a
|
| 3 |
size 34681076
|