Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image-to-video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
| { | |
| "model_type": "xoron", | |
| "model_name": "Xoron-Dev-MultiMoE", | |
| "hidden_size": 1024, | |
| "num_layers": 12, | |
| "num_heads": 16, | |
| "intermediate_size": 2048, | |
| "vocab_size": 152200, | |
| "max_position_embeddings": 131072, | |
| "rms_norm_eps": 1e-06, | |
| "use_ring_attention": true, | |
| "ring_attention_chunk_size": 4096, | |
| "tie_word_embeddings": true, | |
| "use_moe": true, | |
| "num_experts": 8, | |
| "num_experts_per_tok": 2, | |
| "moe_layer_freq": 2, | |
| "use_shared_expert": true, | |
| "moe_capacity_factor": 1.25, | |
| "use_aux_lossless": true, | |
| "vision_model_name": "google/siglip-so400m-patch14-384", | |
| "freeze_vision": false, | |
| "num_vision_tokens": 64, | |
| "projector_type": "perceiver", | |
| "use_vision_dual_stream": true, | |
| "use_vision_titok": true, | |
| "num_vision_titok_tokens": 256, | |
| "num_vision_dual_stream_layers": 2, | |
| "use_video_3d_rope": true, | |
| "use_video_temporal_moe": true, | |
| "num_video_encoder_layers": 4, | |
| "num_video_experts": 4, | |
| "use_video_vidtok": true, | |
| "vidtok_latent_channels": 4, | |
| "vidtok_temporal_compression": 4, | |
| "vidtok_spatial_compression": 8, | |
| "vidtok_causal": true, | |
| "vidtok_use_fsq": false, | |
| "use_video_titok": true, | |
| "num_video_titok_tokens": 64, | |
| "num_video_titok_layers": 2, | |
| "num_video_titok_heads": 8, | |
| "video_titok_dropout": 0.1, | |
| "use_multi_scale": true, | |
| "use_continuous_scale": true, | |
| "image_min_size": 128, | |
| "image_max_size": 384, | |
| "image_base_size": 256, | |
| "image_size_step": 32, | |
| "video_min_size": 128, | |
| "video_max_size": 320, | |
| "video_base_size": 320, | |
| "video_size_step": 32, | |
| "video_min_frames": 8, | |
| "video_max_frames": 24, | |
| "video_base_frames": 16, | |
| "video_frame_step": 4, | |
| "multi_scale_strategy": "adaptive", | |
| "multi_scale_warmup_epochs": 3, | |
| "adaptive_scale_oom_penalty": 0.5, | |
| "adaptive_scale_success_boost": 0.1, | |
| "generation_supported_sizes": [ | |
| 192, | |
| 256, | |
| 320, | |
| 384 | |
| ], | |
| "generation_supported_frames": [ | |
| 8, | |
| 12, | |
| 16, | |
| 20, | |
| 24 | |
| ], | |
| "enable_generation": true, | |
| "generation_latent_channels": 4, | |
| "generation_base_channels": 128, | |
| "generation_inference_steps": 50, | |
| "generation_cfg_scale": 7.5, | |
| "generation_use_flow_matching": true, | |
| "generation_num_experts": 4, | |
| "generation_use_dual_stream": true, | |
| "generation_video_cfg_scale": 7.5, | |
| "generation_video_use_flow_matching": true, | |
| "generation_video_num_experts": 4, | |
| "generation_video_use_3d_rope": true, | |
| "generation_video_use_temporal_moe": true, | |
| "audio_sample_rate": 16000, | |
| "audio_n_mels": 80, | |
| "audio_max_length": 625, | |
| "audio_max_waveform_samples": 160000, | |
| "audio_num_speakers": 256, | |
| "use_raw_waveform": true, | |
| "audio_kv_lora_rank": 256, | |
| "audio_speaker_embed_dim": 256, | |
| "use_mas": true, | |
| "use_in_context_audio_prompting": true, | |
| "tokenizer_name": "Qwen/Qwen2.5-1.5B", | |
| "use_lora": true, | |
| "lora_r": 32, | |
| "lora_alpha": 64, | |
| "lora_dropout": 0.05, | |
| "lora_target_modules": [ | |
| "q_proj", | |
| "k_proj", | |
| "v_proj", | |
| "o_proj", | |
| "gate_proj", | |
| "up_proj", | |
| "down_proj" | |
| ], | |
| "train_lora_only": false, | |
| "use_rslora": true, | |
| "use_dora": false, | |
| "lora_plus_lr_ratio": 4.0, | |
| "use_cross_attention": true, | |
| "cross_attention_layers": 4, | |
| "cross_attention_heads": 8, | |
| "cross_attention_dropout": 0.1, | |
| "use_flash_attention": true, | |
| "output_dir": "./xoron-model", | |
| "has_audio_encoder": true, | |
| "has_audio_decoder": true, | |
| "has_waveform_decoder": true, | |
| "has_vision_encoder": true, | |
| "has_video_encoder": true, | |
| "has_generator": true, | |
| "has_video_generator": true, | |
| "has_cross_attention": true, | |
| "lora_applied": true, | |
| "architecture_version": 2, | |
| "auto_map": { | |
| "AutoConfig": "configuration_xoron.XoronConfig", | |
| "AutoModel": "modeling_xoron.XoronModel", | |
| "AutoModelForCausalLM": "modeling_xoron.XoronForCausalLM" | |
| } | |
| } |