Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
File size: 702 Bytes
c3b1f1c 7936332 c3b1f1c 7936332 c3b1f1c 39827f9 c3b1f1c 39827f9 c3b1f1c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | {
"best_model_checkpoint": "/kaggle/working/xoron-final",
"best_metric": 5.891898287038009,
"epoch": 4,
"epochs_completed": 4,
"global_step": 72,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 50,
"max_steps": 72,
"num_train_epochs": 4,
"total_flos": 0,
"train_batch_size": 1,
"effective_batch_size": 16,
"learning_rate": 0.0001,
"max_grad_norm": 1.0,
"trainable_components": [
"llm",
"cross_attention",
"modality_markers"
],
"frozen_components": [
"vision",
"video",
"audio",
"speech",
"image_generation",
"video_generation"
],
"trial_name": null,
"trial_params": null
} |