Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
| """ | |
| Xoron Model Configuration for HuggingFace Transformers. | |
| This module provides a HuggingFace-compatible configuration class for the Xoron | |
| multimodal model. It inherits from PreTrainedConfig to enable: | |
| - Loading via AutoConfig | |
| - Saving/loading with save_pretrained/from_pretrained | |
| - Hub integration with push_to_hub | |
| Usage: | |
| from transformers import AutoConfig | |
| config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True) | |
| """ | |
| from transformers import PreTrainedConfig | |
| from typing import List ,Tuple ,Union | |
class XoronConfig(PreTrainedConfig):
    """
    Configuration class for Xoron-Dev multimodal model.

    This is a HuggingFace-compatible configuration that stores all the parameters
    needed to instantiate a XoronMultimodalModel. Every constructor argument is
    stored as an attribute of the same name; any extra keyword arguments are
    forwarded to ``PreTrainedConfig.__init__``.

    Args:
        model_name (`str`, *optional*, defaults to `"Xoron-Dev-MultiMoE"`):
            Name of the model.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        num_layers (`int`, *optional*, defaults to 12):
            Number of transformer layers.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP intermediate layer.
        vocab_size (`int`, *optional*, defaults to 151643):
            Vocabulary size (Qwen2.5 tokenizer).
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            Maximum sequence length (128K context).

    The remaining arguments are grouped by subsystem in the ``__init__``
    signature (MoE, vision, video, multi-scale, generation, audio, LoRA,
    cross-attention, component-presence flags).

    SOTA Features:
        - MLA (Multi-Head Latent Attention) for compressed KV cache
        - MoE with shared expert isolation (DeepSeek-style)
        - Ring Attention for distributed 128K+ context
        - YaRN/LongRoPE for superior long-context extrapolation
        - LoRA variants (rsLoRA, DoRA, LoRA+)
        - Perceiver Resampler for vision projection
        - Cross-attention for multimodal fusion
        - MoE-DiT with Flow Matching for image generation
        - 3D-RoPE + 3D Causal Transformers for video generation
        - TiTok-style 1D tokenization for vision encoding
        - VidTok-style 1D tokenization for video encoding
        - VideoTiTokTokenizer for efficient video token compression
        - Dual-stream attention for symmetric processing
        - Conformer audio encoder/decoder
        - FP16-native numerical stability
        - Multi-scale training for variable resolution handling
    """

    model_type = "xoron"

    def __init__(
        self,
        # --- Core language model ---
        model_name: str = "Xoron-Dev-MultiMoE",
        hidden_size: int = 1024,
        num_layers: int = 12,
        num_heads: int = 16,
        intermediate_size: int = 2048,
        vocab_size: int = 151643,
        max_position_embeddings: int = 131072,
        rms_norm_eps: float = 1e-6,
        use_ring_attention: bool = True,
        ring_attention_chunk_size: int = 4096,
        tie_word_embeddings: bool = True,
        # --- Mixture of Experts ---
        use_moe: bool = True,
        num_experts: int = 8,
        num_experts_per_tok: int = 2,
        moe_layer_freq: int = 2,
        use_shared_expert: bool = True,
        moe_capacity_factor: float = 1.25,
        use_aux_lossless: bool = True,
        # --- Vision encoder ---
        vision_model_name: str = "google/siglip-so400m-patch14-384",
        freeze_vision: bool = False,
        num_vision_tokens: int = 64,
        projector_type: str = "perceiver",
        use_vision_dual_stream: bool = True,
        use_vision_titok: bool = True,
        num_vision_titok_tokens: int = 256,
        num_vision_dual_stream_layers: int = 2,
        # --- Video encoder ---
        use_video_3d_rope: bool = True,
        use_video_temporal_moe: bool = True,
        num_video_encoder_layers: int = 4,
        num_video_experts: int = 4,
        use_video_vidtok: bool = True,
        vidtok_latent_channels: int = 4,
        vidtok_temporal_compression: int = 4,
        vidtok_spatial_compression: int = 8,
        vidtok_causal: bool = True,
        vidtok_use_fsq: bool = False,
        use_video_titok: bool = True,
        num_video_titok_tokens: int = 64,
        num_video_titok_layers: int = 2,
        num_video_titok_heads: int = 8,
        video_titok_dropout: float = 0.1,
        # --- Multi-scale image/video sizing ---
        use_multi_scale: bool = True,
        use_continuous_scale: bool = True,
        image_min_size: int = 128,
        image_max_size: int = 384,
        image_base_size: int = 256,
        image_size_step: int = 32,
        video_min_size: int = 128,
        video_max_size: int = 320,
        video_base_size: int = 192,
        video_size_step: int = 32,
        video_min_frames: int = 8,
        video_max_frames: int = 24,
        video_base_frames: int = 16,
        video_frame_step: int = 4,
        multi_scale_strategy: str = "adaptive",
        multi_scale_warmup_epochs: int = 3,
        adaptive_scale_oom_penalty: float = 0.5,
        adaptive_scale_success_boost: float = 0.1,
        # --- Image / video generation ---
        generation_supported_sizes: Union[List[int], Tuple[int, ...]] = (192, 256, 320, 384),
        generation_supported_frames: Union[List[int], Tuple[int, ...]] = (8, 12, 16, 20, 24),
        enable_generation: bool = True,
        generation_latent_channels: int = 4,
        generation_base_channels: int = 128,
        generation_inference_steps: int = 50,
        generation_cfg_scale: float = 7.5,
        generation_use_flow_matching: bool = True,
        generation_num_experts: int = 4,
        generation_use_dual_stream: bool = True,
        generation_video_cfg_scale: float = 7.5,
        generation_video_use_flow_matching: bool = True,
        generation_video_num_experts: int = 4,
        generation_video_use_3d_rope: bool = True,
        generation_video_use_temporal_moe: bool = True,
        # --- Audio ---
        audio_sample_rate: int = 16000,
        audio_n_mels: int = 80,
        audio_max_length: int = 625,
        audio_max_waveform_samples: int = 160000,
        audio_num_speakers: int = 256,
        use_raw_waveform: bool = True,
        audio_kv_lora_rank: int = 256,
        audio_speaker_embed_dim: int = 256,
        use_mas: bool = True,
        use_in_context_audio_prompting: bool = True,
        # --- Tokenizer ---
        tokenizer_name: str = "Qwen/Qwen2.5-1.5B",
        # --- LoRA ---
        use_lora: bool = True,
        lora_r: int = 32,
        lora_alpha: int = 64,
        lora_dropout: float = 0.05,
        lora_target_modules: Union[List[str], Tuple[str, ...]] = (
            'q_proj', 'k_proj', 'v_proj', 'o_proj',
            'gate_proj', 'up_proj', 'down_proj',
        ),
        train_lora_only: bool = False,
        use_rslora: bool = True,
        use_dora: bool = False,
        lora_plus_lr_ratio: float = 4.0,
        # --- Cross-attention / attention backend ---
        use_cross_attention: bool = True,
        cross_attention_layers: int = 4,
        cross_attention_heads: int = 8,
        cross_attention_dropout: float = 0.1,
        use_flash_attention: bool = True,
        # --- Component presence flags ---
        has_audio_encoder: bool = True,
        has_audio_decoder: bool = True,
        has_waveform_decoder: bool = True,
        has_vision_encoder: bool = True,
        has_video_encoder: bool = True,
        has_generator: bool = True,
        has_video_generator: bool = True,
        has_cross_attention: bool = True,
        lora_applied: bool = False,
        architecture_version: int = 2,
        output_dir: str = "./xoron-model",
        modality_dropout_prob: float = 0.0,
        **kwargs,
    ):
        """
        Store every argument as a same-named attribute on the config.

        Unrecognised keyword arguments are passed through to
        ``PreTrainedConfig.__init__``. The three sequence-valued options
        (``generation_supported_sizes``, ``generation_supported_frames``,
        ``lora_target_modules``) are stored as lists: a list is kept as-is,
        anything else is copied into a new list.
        """
        # Base-class initialisation runs first; the explicit assignments below
        # therefore take precedence over anything the base class set.
        super().__init__(**kwargs)

        self.modality_dropout_prob = modality_dropout_prob
        self.model_name = model_name

        # Core language model
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.intermediate_size = intermediate_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.use_ring_attention = use_ring_attention
        self.ring_attention_chunk_size = ring_attention_chunk_size
        self.tie_word_embeddings = tie_word_embeddings

        # Mixture of Experts
        self.use_moe = use_moe
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.use_shared_expert = use_shared_expert
        self.moe_capacity_factor = moe_capacity_factor
        self.use_aux_lossless = use_aux_lossless

        # Vision encoder
        self.vision_model_name = vision_model_name
        self.freeze_vision = freeze_vision
        self.num_vision_tokens = num_vision_tokens
        self.projector_type = projector_type
        self.use_vision_dual_stream = use_vision_dual_stream
        self.use_vision_titok = use_vision_titok
        self.num_vision_titok_tokens = num_vision_titok_tokens
        self.num_vision_dual_stream_layers = num_vision_dual_stream_layers

        # Video encoder
        self.use_video_3d_rope = use_video_3d_rope
        self.use_video_temporal_moe = use_video_temporal_moe
        self.num_video_encoder_layers = num_video_encoder_layers
        self.num_video_experts = num_video_experts
        self.use_video_vidtok = use_video_vidtok
        self.vidtok_latent_channels = vidtok_latent_channels
        self.vidtok_temporal_compression = vidtok_temporal_compression
        self.vidtok_spatial_compression = vidtok_spatial_compression
        self.vidtok_causal = vidtok_causal
        self.vidtok_use_fsq = vidtok_use_fsq
        self.use_video_titok = use_video_titok
        self.num_video_titok_tokens = num_video_titok_tokens
        self.num_video_titok_layers = num_video_titok_layers
        self.num_video_titok_heads = num_video_titok_heads
        self.video_titok_dropout = video_titok_dropout

        # Multi-scale image/video sizing
        self.use_multi_scale = use_multi_scale
        self.use_continuous_scale = use_continuous_scale
        self.image_min_size = image_min_size
        self.image_max_size = image_max_size
        self.image_base_size = image_base_size
        self.image_size_step = image_size_step
        self.video_min_size = video_min_size
        self.video_max_size = video_max_size
        self.video_base_size = video_base_size
        self.video_size_step = video_size_step
        self.video_min_frames = video_min_frames
        self.video_max_frames = video_max_frames
        self.video_base_frames = video_base_frames
        self.video_frame_step = video_frame_step
        self.multi_scale_strategy = multi_scale_strategy
        self.multi_scale_warmup_epochs = multi_scale_warmup_epochs
        self.adaptive_scale_oom_penalty = adaptive_scale_oom_penalty
        self.adaptive_scale_success_boost = adaptive_scale_success_boost

        # Image / video generation (sequence options normalised to lists)
        self.generation_supported_sizes = (
            generation_supported_sizes
            if isinstance(generation_supported_sizes, list)
            else list(generation_supported_sizes)
        )
        self.generation_supported_frames = (
            generation_supported_frames
            if isinstance(generation_supported_frames, list)
            else list(generation_supported_frames)
        )
        self.enable_generation = enable_generation
        self.generation_latent_channels = generation_latent_channels
        self.generation_base_channels = generation_base_channels
        self.generation_inference_steps = generation_inference_steps
        self.generation_cfg_scale = generation_cfg_scale
        self.generation_use_flow_matching = generation_use_flow_matching
        self.generation_num_experts = generation_num_experts
        self.generation_use_dual_stream = generation_use_dual_stream
        self.generation_video_cfg_scale = generation_video_cfg_scale
        self.generation_video_use_flow_matching = generation_video_use_flow_matching
        self.generation_video_num_experts = generation_video_num_experts
        self.generation_video_use_3d_rope = generation_video_use_3d_rope
        self.generation_video_use_temporal_moe = generation_video_use_temporal_moe

        # Audio
        self.audio_sample_rate = audio_sample_rate
        self.audio_n_mels = audio_n_mels
        self.audio_max_length = audio_max_length
        self.audio_max_waveform_samples = audio_max_waveform_samples
        self.audio_num_speakers = audio_num_speakers
        self.use_raw_waveform = use_raw_waveform
        self.audio_kv_lora_rank = audio_kv_lora_rank
        self.audio_speaker_embed_dim = audio_speaker_embed_dim
        self.use_mas = use_mas
        self.use_in_context_audio_prompting = use_in_context_audio_prompting

        # Tokenizer
        self.tokenizer_name = tokenizer_name

        # LoRA
        self.use_lora = use_lora
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.lora_target_modules = (
            lora_target_modules
            if isinstance(lora_target_modules, list)
            else list(lora_target_modules)
        )
        self.train_lora_only = train_lora_only
        self.use_rslora = use_rslora
        self.use_dora = use_dora
        self.lora_plus_lr_ratio = lora_plus_lr_ratio

        # Cross-attention / attention backend
        self.use_cross_attention = use_cross_attention
        self.cross_attention_layers = cross_attention_layers
        self.cross_attention_heads = cross_attention_heads
        self.cross_attention_dropout = cross_attention_dropout
        self.use_flash_attention = use_flash_attention

        # Component presence flags
        self.has_audio_encoder = has_audio_encoder
        self.has_audio_decoder = has_audio_decoder
        self.has_waveform_decoder = has_waveform_decoder
        self.has_vision_encoder = has_vision_encoder
        self.has_video_encoder = has_video_encoder
        self.has_generator = has_generator
        self.has_video_generator = has_video_generator
        self.has_cross_attention = has_cross_attention
        self.lora_applied = lora_applied
        self.architecture_version = architecture_version
        self.output_dir = output_dir

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "XoronConfig":
        """
        Load a config, ignoring JSON keys that are not ``__init__`` parameters.

        This enables loading configs from newer/different versions gracefully.

        Resolution order:
            1. If the argument is a directory, read ``config.json`` inside it;
               otherwise treat the argument itself as the path to a JSON file.
            2. If that file exists, keep only the keys that are named
               parameters of ``cls.__init__``, overlay ``kwargs`` on top, and
               construct the config directly.
            3. Otherwise delegate to the base class ``from_pretrained``. If
               that raises, log a warning and return ``cls(**kwargs)``.

        Args:
            pretrained_model_name_or_path: Directory containing ``config.json``,
                a path to a JSON config file, or any identifier understood by
                the base class loader.
            **kwargs: Overrides applied on top of the loaded values.

        Returns:
            A ``cls`` instance.

        Raises:
            OSError, json.JSONDecodeError: If a local config file exists but
                cannot be read or parsed (these are not caught).
        """
        import inspect
        import json
        import logging
        import os

        if os.path.isdir(pretrained_model_name_or_path):
            config_path = os.path.join(pretrained_model_name_or_path, "config.json")
        else:
            config_path = pretrained_model_name_or_path

        if os.path.exists(config_path):
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(config_path, "r", encoding="utf-8") as f:
                config_dict = json.load(f)

            # Only named __init__ parameters are accepted. `self` and the
            # **kwargs catch-all appear in the signature but are not real
            # config keys; passing a "self" key through would raise TypeError.
            # NOTE: keys that the base class would accept via **kwargs are
            # dropped here as well, since they are not named parameters.
            valid_keys = set(inspect.signature(cls.__init__).parameters) - {"self", "kwargs"}
            filtered_config = {k: v for k, v in config_dict.items() if k in valid_keys}
            filtered_config.update(kwargs)
            return cls(**filtered_config)

        # No local file: defer to the base-class loader.
        try:
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except Exception as exc:
            # Best-effort fallback kept for backward compatibility, but made
            # visible so a mistyped path/repo id does not silently yield a
            # default configuration.
            logging.getLogger(__name__).warning(
                "Could not load XoronConfig from %r (%s: %s); falling back to "
                "a config built from keyword arguments only.",
                pretrained_model_name_or_path,
                type(exc).__name__,
                exc,
            )
            return cls(**kwargs)