Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 1, loss 12.6258)
- audio_decoder.safetensors +1 -1
- audio_encoder.safetensors +1 -1
- audio_projector.safetensors +1 -1
- config.json +35 -97
- configuration_xoron.py +367 -0
- cross_attention.safetensors +1 -1
- generator.safetensors +1 -1
- llm.safetensors +1 -1
- modality_markers.safetensors +0 -0
- modeling_xoron.py +539 -0
- projector.safetensors +1 -1
- streaming_state.json +8 -24
- trainer_state.json +6 -6
- training_state.pt +2 -2
- video_encoder.safetensors +2 -2
- video_generator.safetensors +2 -2
- vision_encoder.safetensors +1 -1
- waveform_decoder.safetensors +1 -1
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f0f29b6f9c83c81bcdc351798dafef1e6afccd6fd74a4651ec02a829fef90157
 size 1458415836
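
Each of the weight diffs in this commit only touches a Git LFS pointer file: the pointer records the SHA-256 digest ("oid") and byte size of the real file. Below is a minimal sketch (not part of the repository) for checking a downloaded weight file against its pointer; the file name and the call at the end use the new pointer values from this commit.

import hashlib
import os

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the file's byte size and SHA-256 digest match the LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks so multi-GB weight files don't need to fit in memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

print(verify_lfs_pointer(
    "audio_decoder.safetensors",
    "f0f29b6f9c83c81bcdc351798dafef1e6afccd6fd74a4651ec02a829fef90157",
    1458415836,
))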
audio_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c8ee92bfbfd53017d729cdadfdabbafce57057461aca0234c481ca122cd8485e
 size 466150140
audio_projector.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:65558731e70800c2dca684217b1d7c5a7e09baf669f7f5c3173b344c159a8c45
 size 2099352
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "model_type": "xoron",
   "model_name": "Xoron-Dev-MultiMoE",
   "hidden_size": 1024,
   "num_layers": 12,
@@ -29,116 +30,47 @@
   "use_video_temporal_moe": true,
   "num_video_encoder_layers": 4,
   "num_video_experts": 4,
+  "use_video_vidtok": true,
+  "vidtok_latent_channels": 4,
+  "vidtok_temporal_compression": 4,
+  "vidtok_spatial_compression": 8,
+  "vidtok_causal": true,
+  "vidtok_use_fsq": false,
+  "use_video_titok": true,
+  "num_video_titok_tokens": 64,
+  "num_video_titok_layers": 2,
+  "num_video_titok_heads": 8,
+  "video_titok_dropout": 0.1,
   "use_multi_scale": true,
-  "
-    [128, 128],
-    [192, 192],
-    [256, 256],
-    [320, 320],
-    [384, 384],
-    [448, 448],
-    [512, 512]
-  ],
-  "image_scale_probs": [0.05, 0.1, 0.3, 0.25, 0.15, 0.1, 0.05],
+  "use_continuous_scale": true,
   "image_min_size": 128,
-  "image_max_size":
+  "image_max_size": 384,
   "image_base_size": 256,
-  "
-    [128, 128],
-    [192, 192],
-    [256, 256],
-    [320, 320],
-    [384, 384]
-  ],
-  "video_scale_probs": [0.1, 0.2, 0.35, 0.25, 0.1],
+  "image_size_step": 32,
   "video_min_size": 128,
-  "video_max_size":
-  "video_base_size":
-  "
-    8, 12, 16, 20, 24, 32
-  ],
-  "video_frame_scale_probs": [0.1, 0.15, 0.3, 0.2, 0.15, 0.1],
+  "video_max_size": 320,
+  "video_base_size": 192,
+  "video_size_step": 32,
   "video_min_frames": 8,
-  "video_max_frames":
+  "video_max_frames": 24,
   "video_base_frames": 16,
-  "
-  "
+  "video_frame_step": 4,
+  "multi_scale_strategy": "adaptive",
+  "multi_scale_warmup_epochs": 3,
+  "adaptive_scale_oom_penalty": 0.5,
+  "adaptive_scale_success_boost": 0.1,
   "generation_supported_sizes": [
+    192,
     256,
     320,
-    384,
-    448,
-    512
+    384
   ],
   "generation_supported_frames": [
     8,
     12,
     16,
     20,
-    24,
-    32
+    24
   ],
   "enable_generation": true,
   "generation_latent_channels": 4,
@@ -155,7 +87,8 @@
   "generation_video_use_temporal_moe": true,
   "audio_sample_rate": 16000,
   "audio_n_mels": 80,
-  "audio_max_length":
+  "audio_max_length": 625,
+  "audio_max_waveform_samples": 160000,
   "audio_num_speakers": 256,
   "use_raw_waveform": true,
   "audio_kv_lora_rank": 256,
@@ -195,5 +128,10 @@
   "has_video_generator": true,
   "has_cross_attention": true,
   "lora_applied": true,
-  "architecture_version": 2
+  "architecture_version": 2,
+  "auto_map": {
+    "AutoConfig": "configuration_xoron.XoronConfig",
+    "AutoModel": "modeling_xoron.XoronModel",
+    "AutoModelForCausalLM": "modeling_xoron.XoronForCausalLM"
+  }
 }
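
The new auto_map block is what lets transformers resolve the custom classes from this repository's own Python files when trust_remote_code=True is passed. A minimal sketch of loading the updated config, mirroring the usage shown in configuration_xoron.py below; "your-repo/xoron-model" is a placeholder repo id.

from transformers import AutoConfig

config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
print(config.model_type)                   # "xoron"
print(config.image_max_size)               # 384 after this commit
print(config.generation_supported_sizes)   # [192, 256, 320, 384]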
configuration_xoron.py ADDED
@@ -0,0 +1,367 @@
"""
Xoron Model Configuration for HuggingFace Transformers.

This module provides a HuggingFace-compatible configuration class for the Xoron
multimodal model. It inherits from PretrainedConfig to enable:
- Loading via AutoConfig
- Saving/loading with save_pretrained/from_pretrained
- Hub integration with push_to_hub

Usage:
    from transformers import AutoConfig
    config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
"""

from transformers import PretrainedConfig
from typing import List, Tuple, Union


class XoronConfig(PretrainedConfig):
    """
    Configuration class for the Xoron-Dev multimodal model.

    This is a HuggingFace-compatible configuration that stores all the parameters
    needed to instantiate a XoronMultimodalModel.

    Args:
        model_name (`str`, *optional*, defaults to `"Xoron-Dev-MultiMoE"`):
            Name of the model.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        num_layers (`int`, *optional*, defaults to 12):
            Number of transformer layers.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP intermediate layer.
        vocab_size (`int`, *optional*, defaults to 151643):
            Vocabulary size (Qwen2.5 tokenizer).
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            Maximum sequence length (128K context).

    SOTA Features:
        - MLA (Multi-Head Latent Attention) for compressed KV cache
        - MoE with shared expert isolation (DeepSeek-style)
        - Ring Attention for distributed 128K+ context
        - YaRN/LongRoPE for superior long-context extrapolation
        - LoRA variants (rsLoRA, DoRA, LoRA+)
        - Perceiver Resampler for vision projection
        - Cross-attention for multimodal fusion
        - MoE-DiT with Flow Matching for image generation
        - 3D-RoPE + 3D Causal Transformers for video generation
        - TiTok-style 1D tokenization for vision encoding
        - VidTok-style 1D tokenization for video encoding
        - VideoTiTokTokenizer for efficient video token compression
        - Dual-stream attention for symmetric processing
        - Conformer audio encoder/decoder
        - FP16-native numerical stability
        - Multi-scale training for variable resolution handling
    """

    model_type = "xoron"

    def __init__(
        self,
        # Model identification
        model_name: str = "Xoron-Dev-MultiMoE",

        # LLM architecture
        hidden_size: int = 1024,
        num_layers: int = 12,
        num_heads: int = 16,
        intermediate_size: int = 2048,
        vocab_size: int = 151643,
        max_position_embeddings: int = 131072,
        rms_norm_eps: float = 1e-6,

        # Ring Attention
        use_ring_attention: bool = True,
        ring_attention_chunk_size: int = 4096,

        # Tie word embeddings
        tie_word_embeddings: bool = True,

        # MoE configuration
        use_moe: bool = True,
        num_experts: int = 8,
        num_experts_per_tok: int = 2,
        moe_layer_freq: int = 2,
        use_shared_expert: bool = True,
        moe_capacity_factor: float = 1.25,
        use_aux_lossless: bool = True,

        # Vision configuration
        vision_model_name: str = "google/siglip-so400m-patch14-384",
        freeze_vision: bool = False,
        num_vision_tokens: int = 64,
        projector_type: str = "perceiver",

        # Vision encoder SOTA features
        use_vision_dual_stream: bool = True,
        use_vision_titok: bool = True,
        num_vision_titok_tokens: int = 256,
        num_vision_dual_stream_layers: int = 2,

        # Video encoder SOTA features
        use_video_3d_rope: bool = True,
        use_video_temporal_moe: bool = True,
        num_video_encoder_layers: int = 4,
        num_video_experts: int = 4,
        use_video_vidtok: bool = True,
        vidtok_latent_channels: int = 4,
        vidtok_temporal_compression: int = 4,
        vidtok_spatial_compression: int = 8,
        vidtok_causal: bool = True,
        vidtok_use_fsq: bool = False,

        # VideoTiTokTokenizer configuration (SOTA: TiTok-style 1D tokenization for video)
        use_video_titok: bool = True,
        num_video_titok_tokens: int = 64,
        num_video_titok_layers: int = 2,
        num_video_titok_heads: int = 8,
        video_titok_dropout: float = 0.1,

        # Continuous-scale training configuration
        use_multi_scale: bool = True,
        use_continuous_scale: bool = True,
        image_min_size: int = 128,
        image_max_size: int = 384,
        image_base_size: int = 256,
        image_size_step: int = 32,
        video_min_size: int = 128,
        video_max_size: int = 320,
        video_base_size: int = 192,
        video_size_step: int = 32,
        video_min_frames: int = 8,
        video_max_frames: int = 24,
        video_base_frames: int = 16,
        video_frame_step: int = 4,
        multi_scale_strategy: str = "adaptive",
        multi_scale_warmup_epochs: int = 3,
        adaptive_scale_oom_penalty: float = 0.5,
        adaptive_scale_success_boost: float = 0.1,
        generation_supported_sizes: Union[List[int], Tuple[int, ...]] = (192, 256, 320, 384),
        generation_supported_frames: Union[List[int], Tuple[int, ...]] = (8, 12, 16, 20, 24),

        # Image generation configuration
        enable_generation: bool = True,
        generation_latent_channels: int = 4,
        generation_base_channels: int = 128,
        generation_inference_steps: int = 50,
        generation_cfg_scale: float = 7.5,
        generation_use_flow_matching: bool = True,
        generation_num_experts: int = 4,
        generation_use_dual_stream: bool = True,

        # Video generation configuration
        generation_video_cfg_scale: float = 7.5,
        generation_video_use_flow_matching: bool = True,
        generation_video_num_experts: int = 4,
        generation_video_use_3d_rope: bool = True,
        generation_video_use_temporal_moe: bool = True,

        # Audio configuration
        audio_sample_rate: int = 16000,
        audio_n_mels: int = 80,
        audio_max_length: int = 625,  # Max mel frames (10 seconds at 16 kHz with hop=256)
        audio_max_waveform_samples: int = 160000,  # Max raw waveform (10 seconds at 16 kHz)
        audio_num_speakers: int = 256,
        use_raw_waveform: bool = True,
        audio_kv_lora_rank: int = 256,
        audio_speaker_embed_dim: int = 256,
        use_mas: bool = True,
        use_in_context_audio_prompting: bool = True,

        # Tokenizer configuration
        tokenizer_name: str = "Qwen/Qwen2.5-1.5B",

        # LoRA configuration
        use_lora: bool = True,
        lora_r: int = 32,
        lora_alpha: int = 64,
        lora_dropout: float = 0.05,
        lora_target_modules: Union[List[str], Tuple[str, ...]] = (
            'q_proj', 'k_proj', 'v_proj', 'o_proj',
            'gate_proj', 'up_proj', 'down_proj',
        ),
        train_lora_only: bool = False,
        use_rslora: bool = True,
        use_dora: bool = False,
        lora_plus_lr_ratio: float = 4.0,

        # Cross-attention configuration
        use_cross_attention: bool = True,
        cross_attention_layers: int = 4,
        cross_attention_heads: int = 8,
        cross_attention_dropout: float = 0.1,

        # Flash Attention configuration
        use_flash_attention: bool = True,

        # Architecture flags (set during save to track what components exist)
        has_audio_encoder: bool = True,
        has_audio_decoder: bool = True,
        has_waveform_decoder: bool = True,
        has_vision_encoder: bool = True,
        has_video_encoder: bool = True,
        has_generator: bool = True,
        has_video_generator: bool = True,
        has_cross_attention: bool = True,
        lora_applied: bool = False,
        architecture_version: int = 2,

        # Output path (used during training)
        output_dir: str = "./xoron-model",

        **kwargs,
    ):
        # Call parent init
        super().__init__(**kwargs)

        # Model identification
        self.model_name = model_name

        # LLM architecture
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.intermediate_size = intermediate_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps

        # Ring Attention
        self.use_ring_attention = use_ring_attention
        self.ring_attention_chunk_size = ring_attention_chunk_size

        # Tie word embeddings
        self.tie_word_embeddings = tie_word_embeddings

        # MoE configuration
        self.use_moe = use_moe
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.use_shared_expert = use_shared_expert
        self.moe_capacity_factor = moe_capacity_factor
        self.use_aux_lossless = use_aux_lossless

        # Vision configuration
        self.vision_model_name = vision_model_name
        self.freeze_vision = freeze_vision
        self.num_vision_tokens = num_vision_tokens
        self.projector_type = projector_type

        # Vision encoder SOTA features
        self.use_vision_dual_stream = use_vision_dual_stream
        self.use_vision_titok = use_vision_titok
        self.num_vision_titok_tokens = num_vision_titok_tokens
        self.num_vision_dual_stream_layers = num_vision_dual_stream_layers

        # Video encoder SOTA features
        self.use_video_3d_rope = use_video_3d_rope
        self.use_video_temporal_moe = use_video_temporal_moe
        self.num_video_encoder_layers = num_video_encoder_layers
        self.num_video_experts = num_video_experts
        self.use_video_vidtok = use_video_vidtok
        self.vidtok_latent_channels = vidtok_latent_channels
        self.vidtok_temporal_compression = vidtok_temporal_compression
        self.vidtok_spatial_compression = vidtok_spatial_compression
        self.vidtok_causal = vidtok_causal
        self.vidtok_use_fsq = vidtok_use_fsq

        # VideoTiTokTokenizer configuration
        self.use_video_titok = use_video_titok
        self.num_video_titok_tokens = num_video_titok_tokens
        self.num_video_titok_layers = num_video_titok_layers
        self.num_video_titok_heads = num_video_titok_heads
        self.video_titok_dropout = video_titok_dropout

        # Continuous-scale training configuration
        self.use_multi_scale = use_multi_scale
        self.use_continuous_scale = use_continuous_scale
        self.image_min_size = image_min_size
        self.image_max_size = image_max_size
        self.image_base_size = image_base_size
        self.image_size_step = image_size_step
        self.video_min_size = video_min_size
        self.video_max_size = video_max_size
        self.video_base_size = video_base_size
        self.video_size_step = video_size_step
        self.video_min_frames = video_min_frames
        self.video_max_frames = video_max_frames
        self.video_base_frames = video_base_frames
        self.video_frame_step = video_frame_step
        self.multi_scale_strategy = multi_scale_strategy
        self.multi_scale_warmup_epochs = multi_scale_warmup_epochs
        self.adaptive_scale_oom_penalty = adaptive_scale_oom_penalty
        self.adaptive_scale_success_boost = adaptive_scale_success_boost
        self.generation_supported_sizes = list(generation_supported_sizes) if not isinstance(generation_supported_sizes, list) else generation_supported_sizes
        self.generation_supported_frames = list(generation_supported_frames) if not isinstance(generation_supported_frames, list) else generation_supported_frames

        # Image generation configuration
        self.enable_generation = enable_generation
        self.generation_latent_channels = generation_latent_channels
        self.generation_base_channels = generation_base_channels
        self.generation_inference_steps = generation_inference_steps
        self.generation_cfg_scale = generation_cfg_scale
        self.generation_use_flow_matching = generation_use_flow_matching
        self.generation_num_experts = generation_num_experts
        self.generation_use_dual_stream = generation_use_dual_stream

        # Video generation configuration
        self.generation_video_cfg_scale = generation_video_cfg_scale
        self.generation_video_use_flow_matching = generation_video_use_flow_matching
        self.generation_video_num_experts = generation_video_num_experts
        self.generation_video_use_3d_rope = generation_video_use_3d_rope
        self.generation_video_use_temporal_moe = generation_video_use_temporal_moe

        # Audio configuration
        self.audio_sample_rate = audio_sample_rate
        self.audio_n_mels = audio_n_mels
        self.audio_max_length = audio_max_length
        self.audio_max_waveform_samples = audio_max_waveform_samples
        self.audio_num_speakers = audio_num_speakers
        self.use_raw_waveform = use_raw_waveform
        self.audio_kv_lora_rank = audio_kv_lora_rank
        self.audio_speaker_embed_dim = audio_speaker_embed_dim
        self.use_mas = use_mas
        self.use_in_context_audio_prompting = use_in_context_audio_prompting

        # Tokenizer configuration
        self.tokenizer_name = tokenizer_name

        # LoRA configuration
        self.use_lora = use_lora
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.lora_target_modules = list(lora_target_modules) if not isinstance(lora_target_modules, list) else lora_target_modules
        self.train_lora_only = train_lora_only
        self.use_rslora = use_rslora
        self.use_dora = use_dora
        self.lora_plus_lr_ratio = lora_plus_lr_ratio

        # Cross-attention configuration
        self.use_cross_attention = use_cross_attention
        self.cross_attention_layers = cross_attention_layers
        self.cross_attention_heads = cross_attention_heads
        self.cross_attention_dropout = cross_attention_dropout

        # Flash Attention configuration
        self.use_flash_attention = use_flash_attention

        # Architecture flags
        self.has_audio_encoder = has_audio_encoder
        self.has_audio_decoder = has_audio_decoder
        self.has_waveform_decoder = has_waveform_decoder
        self.has_vision_encoder = has_vision_encoder
        self.has_video_encoder = has_video_encoder
        self.has_generator = has_generator
        self.has_video_generator = has_video_generator
        self.has_cross_attention = has_cross_attention
        self.lora_applied = lora_applied
        self.architecture_version = architecture_version

        # Output path
        self.output_dir = output_dir
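
Because XoronConfig forwards unknown keyword arguments to PretrainedConfig, it can also be constructed and round-tripped locally. A minimal sketch, assuming configuration_xoron.py is on the import path; "./xoron-config" is an arbitrary scratch directory and the overrides are illustrative.

from configuration_xoron import XoronConfig

config = XoronConfig(hidden_size=1024, num_layers=12, video_max_frames=24)
config.save_pretrained("./xoron-config")        # writes config.json with model_type "xoron"
reloaded = XoronConfig.from_pretrained("./xoron-config")
assert reloaded.video_max_frames == 24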
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:de3fa394f8f7b43e6b69cc072d57ebd48d0829237b8db79c488b535322fcbe6e
 size 174191400
generator.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8ea1e8367259016a8f378aeb772561bd6388e2e50035ae69ba5ce3f9a0b7a47b
 size 629440508
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cc371a972f367b91db376d5e6270cb218da9a8680c4f6172cc171e181464759a
 size 1506831304
modality_markers.safetensors CHANGED
Binary files a/modality_markers.safetensors and b/modality_markers.safetensors differ
modeling_xoron.py ADDED
@@ -0,0 +1,539 @@
"""
Xoron Model for HuggingFace Transformers.

This module provides a HuggingFace-compatible model class for the Xoron
multimodal model. It inherits from PreTrainedModel to enable:
- Loading via AutoModel
- Saving/loading with save_pretrained/from_pretrained
- Hub integration with push_to_hub
- trust_remote_code support

Usage:
    from transformers import AutoModel
    model = AutoModel.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
"""

import os
import sys
import json
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Dict, List, Union, Tuple
from transformers import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast

# Import configuration - handle both package and standalone imports
try:
    from .configuration_xoron import XoronConfig
except ImportError:
    from configuration_xoron import XoronConfig

logger = logging.getLogger(__name__)

# FP16 safe max value
MAX_HIDDEN = 10000.0


def safe_clamp_tensor(x: torch.Tensor, max_val: float = MAX_HIDDEN) -> torch.Tensor:
    """Clamp tensor values for FP16 safety."""
    if x is None or x.numel() == 0:
        return x
    x = torch.nan_to_num(x, nan=0.0, posinf=max_val, neginf=-max_val)
    return x.clamp(-max_val, max_val)


class XoronPreTrainedModel(PreTrainedModel):
    """
    Base class for Xoron models providing HuggingFace integration.

    This is the base class that provides weight initialization and
    a simple interface for loading pretrained models.
    """

    config_class = XoronConfig
    base_model_prefix = "xoron"
    supports_gradient_checkpointing = True
    _no_split_modules = ["XoronMultimodalModel"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize the weights."""
        std = 0.02
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class XoronModel(XoronPreTrainedModel):
    """
    Xoron Multimodal Model for HuggingFace.

    This is a wrapper around the internal XoronMultimodalModel that provides
    HuggingFace compatibility for loading via AutoModel with trust_remote_code=True.

    The model supports:
    - Image/video understanding (SigLIP encoder)
    - Text generation (MoE LLM)
    - Image/video generation (MobileDiffusion)
    - Voice understanding and generation (ASR/TTS)
    - Cross-attention for multimodal fusion
    - LoRA support for efficient fine-tuning

    Example:
        >>> from transformers import AutoModel, AutoConfig
        >>> config = AutoConfig.from_pretrained("your-repo/xoron", trust_remote_code=True)
        >>> model = AutoModel.from_pretrained("your-repo/xoron", trust_remote_code=True)
        >>> # Forward pass
        >>> outputs = model(input_ids=input_ids, pixel_values=images)
    """

    def __init__(self, config: XoronConfig):
        super().__init__(config)
        self.config = config

        # Import the internal model - this handles all the actual implementation.
        # We use lazy import to avoid circular dependencies.
        self._internal_model = None
        self._internal_config = None

    def _ensure_internal_model(self):
        """Lazily initialize the internal model."""
        if self._internal_model is None:
            # Convert HF config to internal config.
            # Try importing from the Xoron-Dev package (if installed)
            # or from the local directory structure.
            try:
                from config.model_config import XoronConfig as InternalConfig
            except ImportError:
                try:
                    # Alternative import path for when running from the HuggingFace Hub:
                    # add the directory containing this file to sys.path and retry.
                    current_dir = os.path.dirname(os.path.abspath(__file__))
                    if current_dir not in sys.path:
                        sys.path.insert(0, current_dir)
                    from config.model_config import XoronConfig as InternalConfig
                except ImportError:
                    raise ImportError(
                        "Could not import XoronConfig from config.model_config. "
                        "Please install the Xoron-Dev package first:\n"
                        "  pip install git+https://github.com/nigfuapp-web/Xoron-Dev.git@beta\n"
                        "Or clone the repository and install locally:\n"
                        "  git clone -b beta https://github.com/nigfuapp-web/Xoron-Dev.git\n"
                        "  cd Xoron-Dev && pip install -e ."
                    )

            # Create internal config from HF config
            config_dict = {k: v for k, v in self.config.to_dict().items()
                           if not k.startswith('_') and k not in ['transformers_version', 'model_type', 'torch_dtype', 'auto_map']}

            # Handle tuple conversions
            if 'lora_target_modules' in config_dict and isinstance(config_dict['lora_target_modules'], list):
                config_dict['lora_target_modules'] = tuple(config_dict['lora_target_modules'])
            if 'generation_supported_sizes' in config_dict and isinstance(config_dict['generation_supported_sizes'], list):
                config_dict['generation_supported_sizes'] = tuple(config_dict['generation_supported_sizes'])
            if 'generation_supported_frames' in config_dict and isinstance(config_dict['generation_supported_frames'], list):
                config_dict['generation_supported_frames'] = tuple(config_dict['generation_supported_frames'])

            self._internal_config = InternalConfig.from_dict(config_dict)

            # Import and create internal model
            try:
                from models.xoron import XoronMultimodalModel
            except ImportError:
                raise ImportError(
                    "Could not import XoronMultimodalModel from models.xoron. "
                    "Please install the Xoron-Dev package first:\n"
                    "  pip install git+https://github.com/nigfuapp-web/Xoron-Dev.git@beta\n"
                    "Or clone the repository and install locally:\n"
                    "  git clone -b beta https://github.com/nigfuapp-web/Xoron-Dev.git\n"
                    "  cd Xoron-Dev && pip install -e ."
                )

            self._internal_model = XoronMultimodalModel(self._internal_config)

    @property
    def internal_model(self):
        """Get the internal XoronMultimodalModel."""
        self._ensure_internal_model()
        return self._internal_model

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        video_frames: Optional[torch.Tensor] = None,
        audio_features: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """
        Forward pass for the Xoron multimodal model.

        Args:
            input_ids: Input token IDs of shape (batch_size, seq_len)
            attention_mask: Attention mask of shape (batch_size, seq_len)
            pixel_values: Image inputs of shape (batch_size, channels, height, width)
            video_frames: Video inputs of shape (batch_size, num_frames, channels, height, width)
            audio_features: Audio inputs (mel spectrogram or raw waveform)
            labels: Labels for language modeling loss
            output_attentions: Whether to return attention weights
            output_hidden_states: Whether to return hidden states
            return_dict: Whether to return a ModelOutput object

        Returns:
            CausalLMOutputWithPast containing loss, logits, and optionally hidden states
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Ensure internal model is initialized
        self._ensure_internal_model()

        # Call internal model forward
        outputs = self._internal_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            video_frames=video_frames,
            audio_features=audio_features,
            labels=labels,
        )

        if not return_dict:
            return (outputs.get('loss'), outputs.get('logits'), outputs.get('hidden_states'))

        return CausalLMOutputWithPast(
            loss=outputs.get('loss'),
            logits=outputs.get('logits'),
            past_key_values=None,
            hidden_states=outputs.get('hidden_states') if output_hidden_states else None,
            attentions=None,
        )

    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        video_frames: Optional[torch.Tensor] = None,
        audio_features: Optional[torch.Tensor] = None,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 50,
        do_sample: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        """
        Generate text given inputs.

        Args:
            input_ids: Input token IDs
            pixel_values: Image inputs
            video_frames: Video inputs
            audio_features: Audio inputs
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Nucleus sampling parameter
            top_k: Top-k sampling parameter
            do_sample: Whether to use sampling

        Returns:
            Generated token IDs
        """
        self._ensure_internal_model()

        # Use the internal model's generate method if available
        if hasattr(self._internal_model, 'generate'):
            return self._internal_model.generate(
                input_ids=input_ids,
                pixel_values=pixel_values,
                video_frames=video_frames,
                audio_features=audio_features,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                do_sample=do_sample,
                **kwargs,
            )

        # Fall back to basic autoregressive generation
        return self._basic_generate(
            input_ids=input_ids,
            pixel_values=pixel_values,
            video_frames=video_frames,
            audio_features=audio_features,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=do_sample,
        )

    def _basic_generate(
        self,
        input_ids: torch.Tensor,
        pixel_values: Optional[torch.Tensor] = None,
        video_frames: Optional[torch.Tensor] = None,
        audio_features: Optional[torch.Tensor] = None,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 50,
        do_sample: bool = True,
    ) -> torch.Tensor:
        """Basic autoregressive generation."""
        generated = input_ids.clone()

        for _ in range(max_new_tokens):
            # Multimodal inputs are only passed on the first step (while the
            # sequence still has its original length).
            outputs = self.forward(
                input_ids=generated,
                pixel_values=pixel_values if generated.shape[1] == input_ids.shape[1] else None,
                video_frames=video_frames if generated.shape[1] == input_ids.shape[1] else None,
                audio_features=audio_features if generated.shape[1] == input_ids.shape[1] else None,
            )

            logits = outputs.logits[:, -1, :]

            if do_sample:
                # Apply temperature
                logits = logits / temperature

                # Apply top-k: mask everything below the k-th largest logit
                if top_k > 0:
                    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
                    logits[indices_to_remove] = float('-inf')

                # Apply top-p (nucleus sampling): keep the smallest set of tokens
                # whose cumulative probability exceeds top_p
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                    logits[indices_to_remove] = float('-inf')

                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)

            generated = torch.cat([generated, next_token], dim=1)

            # Check for EOS token
            if hasattr(self.config, 'eos_token_id') and self.config.eos_token_id is not None:
                if (next_token == self.config.eos_token_id).all():
                    break

        return generated

    def generate_image(
        self,
        prompt_embeds: torch.Tensor,
        height: int = 256,
        width: int = 256,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        **kwargs,
    ) -> torch.Tensor:
        """Generate an image from text embeddings."""
        self._ensure_internal_model()
        if hasattr(self._internal_model, 'generate_image'):
            return self._internal_model.generate_image(
                prompt_embeds=prompt_embeds,
                height=height,
                width=width,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                **kwargs,
            )
        raise NotImplementedError("Image generation not available")

    def generate_video(
        self,
        prompt_embeds: torch.Tensor,
        num_frames: int = 16,
        height: int = 256,
        width: int = 256,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        **kwargs,
    ) -> torch.Tensor:
        """Generate a video from text embeddings."""
        self._ensure_internal_model()
        if hasattr(self._internal_model, 'generate_video'):
            return self._internal_model.generate_video(
                prompt_embeds=prompt_embeds,
                num_frames=num_frames,
                height=height,
                width=width,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                **kwargs,
            )
        raise NotImplementedError("Video generation not available")

    def generate_audio(
        self,
        text_embeds: torch.Tensor,
        speaker_embedding: Optional[torch.Tensor] = None,
        max_length: int = 1000,
        **kwargs,
    ) -> torch.Tensor:
        """Generate audio from text embeddings (TTS)."""
        self._ensure_internal_model()
        if hasattr(self._internal_model, 'generate_audio'):
            return self._internal_model.generate_audio(
                text_embeds=text_embeds,
                speaker_embedding=speaker_embedding,
                max_length=max_length,
                **kwargs,
            )
        raise NotImplementedError("Audio generation not available")

    def encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Encode an image to embeddings."""
        self._ensure_internal_model()
        return self._internal_model.encode_image(pixel_values)

    def encode_video(self, video_frames: torch.Tensor) -> torch.Tensor:
        """Encode a video to embeddings."""
        self._ensure_internal_model()
        return self._internal_model.encode_video(video_frames)

    def encode_audio(self, audio_features: torch.Tensor) -> torch.Tensor:
        """Encode audio to embeddings."""
        self._ensure_internal_model()
        return self._internal_model.encode_audio(audio_features)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Load a pretrained model from the HuggingFace Hub or a local path.

        This method handles loading the model weights from the component files
        created by the save_pretrained method.
        """
        # First load config and create model shell
        config = kwargs.pop('config', None)
        if config is None:
            config = XoronConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        model = cls(config)

        # Now load the actual weights
        model._ensure_internal_model()

        # Check if this is a local path or the HF Hub
        if os.path.isdir(pretrained_model_name_or_path):
            model_path = pretrained_model_name_or_path
        else:
            # Download from the HuggingFace Hub
            from huggingface_hub import snapshot_download
            model_path = snapshot_download(
                repo_id=pretrained_model_name_or_path,
                allow_patterns=["*.safetensors", "*.json", "*.py"],
            )

        # Load weights into the internal model
        model._internal_model.load_pretrained(model_path)

        return model

    def save_pretrained(
        self,
        save_directory: str,
        is_main_process: bool = True,
        state_dict: Optional[Dict] = None,
        save_function=None,
        push_to_hub: bool = False,
        max_shard_size: str = "2GB",
        safe_serialization: bool = True,
        **kwargs,
    ):
        """
        Save the model to a directory in HuggingFace format.

        This saves both the model weights and the custom code files
        needed for trust_remote_code loading.
        """
        os.makedirs(save_directory, exist_ok=True)

        # Save config
        self.config.save_pretrained(save_directory)

        # Save internal model weights
        if self._internal_model is not None:
            self._internal_model.save_pretrained(save_directory)

        # Copy custom code files for trust_remote_code
        import shutil
        current_dir = os.path.dirname(os.path.abspath(__file__))

        # Files to copy
        files_to_copy = [
            'configuration_xoron.py',
            'modeling_xoron.py',
        ]

        for filename in files_to_copy:
            src = os.path.join(current_dir, filename)
            dst = os.path.join(save_directory, filename)
            if os.path.exists(src):
                shutil.copy2(src, dst)

        # Update config.json with auto_map for trust_remote_code
        # (kept in sync with the register_for_auto_class calls below)
        config_path = os.path.join(save_directory, 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                config_dict = json.load(f)

            config_dict['auto_map'] = {
                'AutoConfig': 'configuration_xoron.XoronConfig',
                'AutoModel': 'modeling_xoron.XoronModel',
                'AutoModelForCausalLM': 'modeling_xoron.XoronForCausalLM',
            }
            config_dict['model_type'] = 'xoron'

            with open(config_path, 'w') as f:
                json.dump(config_dict, f, indent=2)

        if push_to_hub:
            self.push_to_hub(save_directory, **kwargs)


class XoronForCausalLM(XoronModel):
    """
    Xoron model with a causal language modeling head.

    This is an alias for XoronModel that provides compatibility
    with AutoModelForCausalLM.
    """
    pass


# Register for AutoClass - these will be called when the model is loaded
# with trust_remote_code=True
XoronConfig.register_for_auto_class()
XoronModel.register_for_auto_class("AutoModel")
XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
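
A minimal end-to-end sketch of loading and running the model, mirroring the docstring example above. The repo id is a placeholder, the image tensor is a dummy at image_base_size, and the Xoron-Dev package must be installed first (see the ImportError messages in _ensure_internal_model).

import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")  # tokenizer_name from the config

inputs = tokenizer("Describe this image:", return_tensors="pt")
pixel_values = torch.randn(1, 3, 256, 256)  # dummy image; a real one would be preprocessed
generated = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=pixel_values,
    max_new_tokens=32,
)
print(tokenizer.decode(generated[0]))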
projector.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a028b9db9aa5779cd30534d17b166024b305a69dddeb947a4b577d4ba431b0cf
 size 52880664
streaming_state.json CHANGED
@@ -1,38 +1,22 @@
 {
   "epoch": 1,
-  "unique_samples":
-  "total_yields":
+  "unique_samples": 1,
+  "total_yields": 2,
   "dataset_positions": {
-    "T2V-Sora-Preferences-2": 50,
-    "Sora-Physics-Likert": 50,
-    "Sora-Style-Likert": 50,
-    "Sora-Alignment-Likert": 50,
-    "WebVid-10M": 50,
-    "T2V-Human-Preferences": 50,
-    "TIP-I2V": 50,
-    "I2V-Preference-Seedance": 50,
-    "Pexels-I2V-350k": 50
+    "Flickr8k": 1
   },
   "modality_positions": {
     "text": {},
-    "image": {
-      "T2V-Sora-Preferences-2": 50,
-      "Sora-Physics-Likert": 50,
-      "Sora-Style-Likert": 50,
-      "Sora-Alignment-Likert": 50,
-      "WebVid-10M": 50,
-      "T2V-Human-Preferences": 50,
-      "TIP-I2V": 50,
-      "I2V-Preference-Seedance": 50,
-      "Pexels-I2V-350k": 50
+    "image": {
+      "Flickr8k": 1
     },
+    "video": {},
     "audio": {}
   },
   "modality_counts": {
     "text": 0,
-    "image":
-    "video":
+    "image": 1,
+    "video": 0,
     "audio": 0
   },
   "last_modality": null
 }
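
The streaming state records how many samples have been yielded per dataset and per modality, so a streaming loader can fast-forward on resume instead of replaying the whole epoch. A sketch of how such a file might be consumed; the skip-ahead helper is an assumption for illustration, not the repository's actual loader.

import json
from itertools import islice

with open("streaming_state.json") as f:
    state = json.load(f)

def resume(dataset_name, sample_iter, positions=state["dataset_positions"]):
    """Skip the samples already yielded for this dataset in a previous run."""
    already_seen = positions.get(dataset_name, 0)  # e.g. {"Flickr8k": 1}
    return islice(sample_iter, already_seen, None)

samples = resume("Flickr8k", iter(range(10)))
print(next(samples))  # 1 -> resumes after the first sample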
trainer_state.json CHANGED
@@ -1,14 +1,14 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric":
+  "best_metric": 12.625781536102295,
   "epoch": 1,
   "epochs_completed": 1,
-  "global_step":
+  "global_step": 0,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps":
+  "max_steps": 0,
   "num_train_epochs": 1,
   "total_flos": 0,
   "train_batch_size": 1,
@@ -17,16 +17,16 @@
   "max_grad_norm": 1.0,
   "trainable_components": [
     "vision",
-    "video",
     "llm",
     "cross_attention",
-    "
+    "image_generation",
     "modality_markers"
   ],
   "frozen_components": [
+    "video",
     "audio",
     "speech",
-    "
+    "video_generation"
   ],
   "trial_name": null,
   "trial_params": null
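
This commit moves "video" and "video_generation" into the frozen list while "image_generation" becomes trainable. A sketch of what such a component split typically implies in PyTorch; the substring matching against named_children() is an assumption for illustration, not the repository's actual trainer logic.

def apply_component_freeze(model, trainable, frozen):
    """Set requires_grad per top-level module based on the component lists."""
    for name, module in model.named_children():
        requires_grad = any(key in name for key in trainable)
        if any(key in name for key in frozen):
            requires_grad = False
        for p in module.parameters():
            p.requires_grad = requires_grad

# Matching this commit's trainer_state.json:
# apply_component_freeze(
#     model,
#     trainable=["vision", "llm", "cross_attention", "image_generation", "modality_markers"],
#     frozen=["video", "audio", "speech", "video_generation"],
# )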
training_state.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a9cb126a887fe8972de925a303bc7c7957ee5f7688b418e9511729631dfffeb2
+size 5143
video_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0fafb9b809d825639bd20d8efc0c8b62ca224c6c82764f80988ac5dc994d3b44
+size 1923089112
video_generator.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0a6b0b9aa6db134da1489ab57b9e73a3f53089362c7d14a08eb6883785ea47f9
+size 61574134
vision_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:aeec0eab24a37b010516e3011104a586ca6ac9cdef6485ac687d14105fce96cd
 size 1000535480
waveform_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0f00704edbaf072a99297b4d03330bc7f2c1fae0a45f3ada55e558fa6979a179
 size 34681076