Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 1, loss 4.0192)
Browse files- audio_decoder.safetensors +1 -1
- cross_attention.safetensors +1 -1
- generator.safetensors +1 -1
- llm.safetensors +1 -1
- streaming_state.json +15 -15
- trainer_state.json +1 -1
- training_state.pt +2 -2
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1458415836
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cc0af3086d6987e71c00b7121394b8ac820d2276ff994014479d4fc2cf094bf
|
| 3 |
size 1458415836
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9034e718a6461ed04d5723c8ecf429d0daedb7fc49274a1fbd17b80bb9dd77b9
|
| 3 |
size 174191400
|
generator.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 629440508
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02673bab0a9e2949c2f3bfe0725ff77cd631fd89a559d86e9c886c99455a5e72
|
| 3 |
size 629440508
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506831304
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4ce3c75e0ec09d93c0580ed862f12191e3ea3bd53ab8906e0108cafd5d6fc18
|
| 3 |
size 1506831304
|
streaming_state.json
CHANGED
|
@@ -1,26 +1,26 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
-
"unique_samples":
|
| 4 |
-
"total_yields":
|
| 5 |
"dataset_positions": {
|
| 6 |
-
"WebSight":
|
| 7 |
-
"ScienceQA":
|
| 8 |
-
"InstructPix2Pix":
|
| 9 |
-
"Flickr8k":
|
| 10 |
-
"NewYorker":
|
| 11 |
"Football": 6,
|
| 12 |
-
"MagicBrush":
|
| 13 |
},
|
| 14 |
"modality_positions": {
|
| 15 |
"text": {},
|
| 16 |
"image": {
|
| 17 |
-
"WebSight":
|
| 18 |
-
"ScienceQA":
|
| 19 |
-
"InstructPix2Pix":
|
| 20 |
-
"Flickr8k":
|
| 21 |
-
"NewYorker":
|
| 22 |
"Football": 6,
|
| 23 |
-
"MagicBrush":
|
| 24 |
},
|
| 25 |
"video": {},
|
| 26 |
"audio": {}
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 5,
|
| 3 |
+
"unique_samples": 1100,
|
| 4 |
+
"total_yields": 2200,
|
| 5 |
"dataset_positions": {
|
| 6 |
+
"WebSight": 186,
|
| 7 |
+
"ScienceQA": 164,
|
| 8 |
+
"InstructPix2Pix": 186,
|
| 9 |
+
"Flickr8k": 186,
|
| 10 |
+
"NewYorker": 186,
|
| 11 |
"Football": 6,
|
| 12 |
+
"MagicBrush": 186
|
| 13 |
},
|
| 14 |
"modality_positions": {
|
| 15 |
"text": {},
|
| 16 |
"image": {
|
| 17 |
+
"WebSight": 186,
|
| 18 |
+
"ScienceQA": 164,
|
| 19 |
+
"InstructPix2Pix": 186,
|
| 20 |
+
"Flickr8k": 186,
|
| 21 |
+
"NewYorker": 186,
|
| 22 |
"Football": 6,
|
| 23 |
+
"MagicBrush": 186
|
| 24 |
},
|
| 25 |
"video": {},
|
| 26 |
"audio": {}
|
trainer_state.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric": 4.
|
| 4 |
"epoch": 1,
|
| 5 |
"epochs_completed": 1,
|
| 6 |
"global_step": 37,
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 4.019162586334472,
|
| 4 |
"epoch": 1,
|
| 5 |
"epochs_completed": 1,
|
| 6 |
"global_step": 37,
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cff88f1a8ee14094dfffadc0ac06d52480a2d90bd740252423ecf77cdef8f6cc
|
| 3 |
+
size 1419723549
|