Xoron-Dev-MultiMoe / config.json
Backup-bdg's picture
Update model weights after training (epoch 7, loss 5.3543)
39827f9 verified
{
"model_type": "xoron",
"model_name": "Xoron-Dev-MultiMoE",
"hidden_size": 1024,
"num_layers": 12,
"num_heads": 16,
"intermediate_size": 2048,
"vocab_size": 152200,
"max_position_embeddings": 131072,
"rms_norm_eps": 1e-06,
"use_ring_attention": true,
"ring_attention_chunk_size": 4096,
"tie_word_embeddings": true,
"use_moe": true,
"num_experts": 8,
"num_experts_per_tok": 2,
"moe_layer_freq": 2,
"use_shared_expert": true,
"moe_capacity_factor": 1.25,
"use_aux_lossless": true,
"vision_model_name": "google/siglip-so400m-patch14-384",
"freeze_vision": false,
"num_vision_tokens": 64,
"projector_type": "perceiver",
"use_vision_dual_stream": true,
"use_vision_titok": true,
"num_vision_titok_tokens": 256,
"num_vision_dual_stream_layers": 2,
"use_video_3d_rope": true,
"use_video_temporal_moe": true,
"num_video_encoder_layers": 4,
"num_video_experts": 4,
"use_video_vidtok": true,
"vidtok_latent_channels": 4,
"vidtok_temporal_compression": 4,
"vidtok_spatial_compression": 8,
"vidtok_causal": true,
"vidtok_use_fsq": false,
"use_video_titok": true,
"num_video_titok_tokens": 64,
"num_video_titok_layers": 2,
"num_video_titok_heads": 8,
"video_titok_dropout": 0.1,
"use_multi_scale": true,
"use_continuous_scale": true,
"image_min_size": 128,
"image_max_size": 384,
"image_base_size": 256,
"image_size_step": 32,
"video_min_size": 128,
"video_max_size": 320,
"video_base_size": 320,
"video_size_step": 32,
"video_min_frames": 8,
"video_max_frames": 8,
"video_base_frames": 16,
"video_frame_step": 4,
"multi_scale_strategy": "adaptive",
"multi_scale_warmup_epochs": 3,
"adaptive_scale_oom_penalty": 0.5,
"adaptive_scale_success_boost": 0.1,
"generation_supported_sizes": [
192,
256,
320,
384
],
"generation_supported_frames": [
8,
12,
16,
20,
24
],
"enable_generation": true,
"generation_latent_channels": 4,
"generation_base_channels": 128,
"generation_inference_steps": 50,
"generation_cfg_scale": 7.5,
"generation_use_flow_matching": true,
"generation_num_experts": 4,
"generation_use_dual_stream": true,
"generation_video_cfg_scale": 7.5,
"generation_video_use_flow_matching": true,
"generation_video_num_experts": 4,
"generation_video_use_3d_rope": true,
"generation_video_use_temporal_moe": true,
"audio_sample_rate": 16000,
"audio_n_mels": 80,
"audio_max_length": 625,
"audio_max_waveform_samples": 160000,
"audio_num_speakers": 256,
"use_raw_waveform": true,
"audio_kv_lora_rank": 256,
"audio_speaker_embed_dim": 256,
"use_mas": true,
"use_in_context_audio_prompting": true,
"tokenizer_name": "Qwen/Qwen2.5-1.5B",
"use_lora": true,
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
"train_lora_only": false,
"use_rslora": true,
"use_dora": false,
"lora_plus_lr_ratio": 4.0,
"use_cross_attention": true,
"cross_attention_layers": 4,
"cross_attention_heads": 8,
"cross_attention_dropout": 0.1,
"use_flash_attention": true,
"output_dir": "./xoron-model",
"has_audio_encoder": true,
"has_audio_decoder": true,
"has_waveform_decoder": true,
"has_vision_encoder": true,
"has_video_encoder": true,
"has_generator": true,
"has_video_generator": true,
"has_cross_attention": true,
"lora_applied": true,
"architecture_version": 2,
"auto_map": {
"AutoConfig": "configuration_xoron.XoronConfig",
"AutoModel": "modeling_xoron.XoronModel",
"AutoModelForCausalLM": "modeling_xoron.XoronForCausalLM"
}
}