Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 1, loss 3.3989)
Browse files- audio_decoder.safetensors +1 -1
- config.json +1 -1
- cross_attention.safetensors +1 -1
- llm.safetensors +1 -1
- streaming_state.json +15 -15
- trainer_state.json +10 -10
- training_state.pt +2 -2
- video_generator.safetensors +1 -1
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1458410612
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d817de2ba9f31539807a8d57d1ad5441f33794329008e0a6b9e01764b831f909
|
| 3 |
size 1458410612
|
config.json
CHANGED
|
@@ -49,7 +49,7 @@
|
|
| 49 |
"image_size_step": 32,
|
| 50 |
"video_min_size": 128,
|
| 51 |
"video_max_size": 320,
|
| 52 |
-
"video_base_size":
|
| 53 |
"video_size_step": 32,
|
| 54 |
"video_min_frames": 8,
|
| 55 |
"video_max_frames": 8,
|
|
|
|
| 49 |
"image_size_step": 32,
|
| 50 |
"video_min_size": 128,
|
| 51 |
"video_max_size": 320,
|
| 52 |
+
"video_base_size": 128,
|
| 53 |
"video_size_step": 32,
|
| 54 |
"video_min_frames": 8,
|
| 55 |
"video_max_frames": 8,
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6beff1e6cfb37ea461f112bf9d138ca007c01e24ac716b997a92000813aa8de5
|
| 3 |
size 174191400
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506832040
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b168f1e28965acb01ab0375c81614f3af6cd312b27c630633ce21c555d8ab3b5
|
| 3 |
size 1506832040
|
streaming_state.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
-
"unique_samples":
|
| 4 |
-
"total_yields":
|
| 5 |
"dataset_positions": {
|
| 6 |
"WebSight": 386,
|
| 7 |
"ScienceQA": 364,
|
|
@@ -76,15 +76,15 @@
|
|
| 76 |
"Tool-Calls-SingleTurn": 200,
|
| 77 |
"Tool-Calls-Multiturn": 200,
|
| 78 |
"OpenAssistant": 450,
|
| 79 |
-
"T2V-Sora-Preferences-2":
|
| 80 |
-
"T2V-Human-Preferences":
|
| 81 |
"Sora-Alignment-Likert": 198,
|
| 82 |
"Sora-Style-Likert": 198,
|
| 83 |
"I2V-Preference-Seedance": 198,
|
| 84 |
-
"WebVid-10M":
|
| 85 |
"Sora-Physics-Likert": 198,
|
| 86 |
-
"TIP-I2V":
|
| 87 |
-
"Pexels-I2V-350k":
|
| 88 |
"SmolTalk-OpenHermes": 250,
|
| 89 |
"SmolTalk-All": 250,
|
| 90 |
"Cosmopedia-AutoMath": 250,
|
|
@@ -157,22 +157,22 @@
|
|
| 157 |
"MagicBrush": 386
|
| 158 |
},
|
| 159 |
"video": {
|
| 160 |
-
"T2V-Sora-Preferences-2":
|
| 161 |
-
"T2V-Human-Preferences":
|
| 162 |
"Sora-Alignment-Likert": 198,
|
| 163 |
"Sora-Style-Likert": 198,
|
| 164 |
"I2V-Preference-Seedance": 198,
|
| 165 |
-
"WebVid-10M":
|
| 166 |
"Sora-Physics-Likert": 198,
|
| 167 |
-
"TIP-I2V":
|
| 168 |
-
"Pexels-I2V-350k":
|
| 169 |
},
|
| 170 |
"audio": {}
|
| 171 |
},
|
| 172 |
"modality_counts": {
|
| 173 |
-
"text":
|
| 174 |
"image": 0,
|
| 175 |
-
"video":
|
| 176 |
"audio": 0
|
| 177 |
},
|
| 178 |
"last_modality": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 85,
|
| 3 |
+
"unique_samples": 400,
|
| 4 |
+
"total_yields": 800,
|
| 5 |
"dataset_positions": {
|
| 6 |
"WebSight": 386,
|
| 7 |
"ScienceQA": 364,
|
|
|
|
| 76 |
"Tool-Calls-SingleTurn": 200,
|
| 77 |
"Tool-Calls-Multiturn": 200,
|
| 78 |
"OpenAssistant": 450,
|
| 79 |
+
"T2V-Sora-Preferences-2": 650,
|
| 80 |
+
"T2V-Human-Preferences": 650,
|
| 81 |
"Sora-Alignment-Likert": 198,
|
| 82 |
"Sora-Style-Likert": 198,
|
| 83 |
"I2V-Preference-Seedance": 198,
|
| 84 |
+
"WebVid-10M": 650,
|
| 85 |
"Sora-Physics-Likert": 198,
|
| 86 |
+
"TIP-I2V": 650,
|
| 87 |
+
"Pexels-I2V-350k": 650,
|
| 88 |
"SmolTalk-OpenHermes": 250,
|
| 89 |
"SmolTalk-All": 250,
|
| 90 |
"Cosmopedia-AutoMath": 250,
|
|
|
|
| 157 |
"MagicBrush": 386
|
| 158 |
},
|
| 159 |
"video": {
|
| 160 |
+
"T2V-Sora-Preferences-2": 650,
|
| 161 |
+
"T2V-Human-Preferences": 650,
|
| 162 |
"Sora-Alignment-Likert": 198,
|
| 163 |
"Sora-Style-Likert": 198,
|
| 164 |
"I2V-Preference-Seedance": 198,
|
| 165 |
+
"WebVid-10M": 650,
|
| 166 |
"Sora-Physics-Likert": 198,
|
| 167 |
+
"TIP-I2V": 650,
|
| 168 |
+
"Pexels-I2V-350k": 650
|
| 169 |
},
|
| 170 |
"audio": {}
|
| 171 |
},
|
| 172 |
"modality_counts": {
|
| 173 |
+
"text": 0,
|
| 174 |
"image": 0,
|
| 175 |
+
"video": 250,
|
| 176 |
"audio": 0
|
| 177 |
},
|
| 178 |
"last_modality": null
|
trainer_state.json
CHANGED
|
@@ -1,32 +1,32 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric":
|
| 4 |
-
"epoch":
|
| 5 |
-
"epochs_completed":
|
| 6 |
-
"global_step":
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
-
"max_steps":
|
| 12 |
-
"num_train_epochs":
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
| 15 |
"effective_batch_size": 16,
|
| 16 |
"learning_rate": 0.0001,
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
|
|
|
|
|
|
| 19 |
"llm",
|
| 20 |
"cross_attention",
|
|
|
|
| 21 |
"modality_markers"
|
| 22 |
],
|
| 23 |
"frozen_components": [
|
| 24 |
-
"vision",
|
| 25 |
-
"video",
|
| 26 |
"audio",
|
| 27 |
"speech",
|
| 28 |
-
"image_generation"
|
| 29 |
-
"video_generation"
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
| 32 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 3.398919365755515,
|
| 4 |
+
"epoch": 1,
|
| 5 |
+
"epochs_completed": 1,
|
| 6 |
+
"global_step": 31,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
+
"max_steps": 31,
|
| 12 |
+
"num_train_epochs": 1,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
| 15 |
"effective_batch_size": 16,
|
| 16 |
"learning_rate": 0.0001,
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
| 19 |
+
"vision",
|
| 20 |
+
"video",
|
| 21 |
"llm",
|
| 22 |
"cross_attention",
|
| 23 |
+
"video_generation",
|
| 24 |
"modality_markers"
|
| 25 |
],
|
| 26 |
"frozen_components": [
|
|
|
|
|
|
|
| 27 |
"audio",
|
| 28 |
"speech",
|
| 29 |
+
"image_generation"
|
|
|
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
| 32 |
"trial_params": null
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b224a38701068628ea2346719232695d255cff3500d63df4b888e5a94eab7ab4
|
| 3 |
+
size 3426643671
|
video_generator.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 61574134
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c17764963de9c77345a4b2ae8b508c3c4c7cb5bf15aa65b7c5239c3e8babc1ce
|
| 3 |
size 61574134
|