Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 4, loss 3.8692)
Browse files
- audio_decoder.safetensors +1 -1
- cross_attention.safetensors +1 -1
- llm.safetensors +1 -1
- streaming_state.json +26 -7
- trainer_state.json +6 -6
- training_state.pt +2 -2
- video_generator.safetensors +1 -1
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1458410612
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0644bb8cb74a2a1d0e055138e41ec52d65d83dca9bc9466cbdd8f388f1aa96b2
|
| 3 |
size 1458410612
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10a70bf7bf4edce737146b199b106166957aa843440edfc45831f1d6033b7e11
|
| 3 |
size 174191400
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506831304
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b78daf2a6be38a3c0753175dd705363f8a348dc24b7d7a6fb9539715c530f22e
|
| 3 |
size 1506831304
|
streaming_state.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
-
"unique_samples":
|
| 4 |
-
"total_yields":
|
| 5 |
"dataset_positions": {
|
| 6 |
"WebSight": 386,
|
| 7 |
"ScienceQA": 364,
|
|
@@ -75,7 +75,16 @@
|
|
| 75 |
"Synth-Debugging": 200,
|
| 76 |
"Tool-Calls-SingleTurn": 200,
|
| 77 |
"Tool-Calls-Multiturn": 200,
|
| 78 |
-
"OpenAssistant": 200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
},
|
| 80 |
"modality_positions": {
|
| 81 |
"text": {
|
|
@@ -121,13 +130,23 @@
|
|
| 121 |
"Football": 6,
|
| 122 |
"MagicBrush": 386
|
| 123 |
},
|
| 124 |
-
"video": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
"audio": {}
|
| 126 |
},
|
| 127 |
"modality_counts": {
|
| 128 |
"text": 0,
|
| 129 |
-
"image":
|
| 130 |
-
"video":
|
| 131 |
"audio": 0
|
| 132 |
},
|
| 133 |
"last_modality": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 26,
|
| 3 |
+
"unique_samples": 586,
|
| 4 |
+
"total_yields": 1172,
|
| 5 |
"dataset_positions": {
|
| 6 |
"WebSight": 386,
|
| 7 |
"ScienceQA": 364,
|
|
|
|
| 75 |
"Synth-Debugging": 200,
|
| 76 |
"Tool-Calls-SingleTurn": 200,
|
| 77 |
"Tool-Calls-Multiturn": 200,
|
| 78 |
+
"OpenAssistant": 200,
|
| 79 |
+
"T2V-Sora-Preferences-2": 200,
|
| 80 |
+
"T2V-Human-Preferences": 200,
|
| 81 |
+
"Sora-Alignment-Likert": 198,
|
| 82 |
+
"Sora-Style-Likert": 198,
|
| 83 |
+
"I2V-Preference-Seedance": 198,
|
| 84 |
+
"WebVid-10M": 200,
|
| 85 |
+
"Sora-Physics-Likert": 198,
|
| 86 |
+
"TIP-I2V": 200,
|
| 87 |
+
"Pexels-I2V-350k": 200
|
| 88 |
},
|
| 89 |
"modality_positions": {
|
| 90 |
"text": {
|
|
|
|
| 130 |
"Football": 6,
|
| 131 |
"MagicBrush": 386
|
| 132 |
},
|
| 133 |
+
"video": {
|
| 134 |
+
"T2V-Sora-Preferences-2": 200,
|
| 135 |
+
"T2V-Human-Preferences": 200,
|
| 136 |
+
"Sora-Alignment-Likert": 198,
|
| 137 |
+
"Sora-Style-Likert": 198,
|
| 138 |
+
"I2V-Preference-Seedance": 198,
|
| 139 |
+
"WebVid-10M": 200,
|
| 140 |
+
"Sora-Physics-Likert": 198,
|
| 141 |
+
"TIP-I2V": 200,
|
| 142 |
+
"Pexels-I2V-350k": 200
|
| 143 |
+
},
|
| 144 |
"audio": {}
|
| 145 |
},
|
| 146 |
"modality_counts": {
|
| 147 |
"text": 0,
|
| 148 |
+
"image": 0,
|
| 149 |
+
"video": 586,
|
| 150 |
"audio": 0
|
| 151 |
},
|
| 152 |
"last_modality": null
|
trainer_state.json
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric": 3.
|
| 4 |
"epoch": 4,
|
| 5 |
"epochs_completed": 4,
|
| 6 |
-
"global_step":
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
-
"max_steps":
|
| 12 |
"num_train_epochs": 4,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
|
@@ -17,16 +17,16 @@
|
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
| 19 |
"vision",
|
|
|
|
| 20 |
"llm",
|
| 21 |
"cross_attention",
|
| 22 |
-
"
|
| 23 |
"modality_markers"
|
| 24 |
],
|
| 25 |
"frozen_components": [
|
| 26 |
-
"video",
|
| 27 |
"audio",
|
| 28 |
"speech",
|
| 29 |
-
"
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
| 32 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 3.869171884744816,
|
| 4 |
"epoch": 4,
|
| 5 |
"epochs_completed": 4,
|
| 6 |
+
"global_step": 298,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
+
"max_steps": 298,
|
| 12 |
"num_train_epochs": 4,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
|
|
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
| 19 |
"vision",
|
| 20 |
+
"video",
|
| 21 |
"llm",
|
| 22 |
"cross_attention",
|
| 23 |
+
"video_generation",
|
| 24 |
"modality_markers"
|
| 25 |
],
|
| 26 |
"frozen_components": [
|
|
|
|
| 27 |
"audio",
|
| 28 |
"speech",
|
| 29 |
+
"image_generation"
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
| 32 |
"trial_params": null
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9b37a03cba59de5ddbc9ab88c301e76b8a0fa5bc81d6d471cbefe513d0699cf
|
| 3 |
+
size 724684421
|
video_generator.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 61574134
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4b113c1d2cf10b7fad0c03661c1093738604762583c5c8f0fb0c8c84bcdc6f4
|
| 3 |
size 61574134
|