Backup-bdg committed on
Commit
5ee35eb
·
verified ·
1 Parent(s): 9234f2c

Update model weights after training (epoch 1, loss 12.6258)

Browse files
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9f369795e1f9e71b138a0dd3705d8549661085d292fdbf729982ba0e4f2b6ab
3
  size 1458415836
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f29b6f9c83c81bcdc351798dafef1e6afccd6fd74a4651ec02a829fef90157
3
  size 1458415836
audio_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1170b76158b117e913232d6a55dd51638c9e72f91bec4e85f1ce866cfe3b7744
3
  size 466150140
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8ee92bfbfd53017d729cdadfdabbafce57057461aca0234c481ca122cd8485e
3
  size 466150140
audio_projector.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df1d4100644c839f5d6f31ff19fc787fa0cb2416b3fdec70955df4331ed7902a
3
  size 2099352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65558731e70800c2dca684217b1d7c5a7e09baf669f7f5c3173b344c159a8c45
3
  size 2099352
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "model_name": "Xoron-Dev-MultiMoE",
3
  "hidden_size": 1024,
4
  "num_layers": 12,
@@ -29,116 +30,47 @@
29
  "use_video_temporal_moe": true,
30
  "num_video_encoder_layers": 4,
31
  "num_video_experts": 4,
 
 
 
 
 
 
 
 
 
 
 
32
  "use_multi_scale": true,
33
- "image_scales": [
34
- [
35
- 128,
36
- 128
37
- ],
38
- [
39
- 192,
40
- 192
41
- ],
42
- [
43
- 256,
44
- 256
45
- ],
46
- [
47
- 320,
48
- 320
49
- ],
50
- [
51
- 384,
52
- 384
53
- ],
54
- [
55
- 448,
56
- 448
57
- ],
58
- [
59
- 512,
60
- 512
61
- ]
62
- ],
63
- "image_scale_probs": [
64
- 0.05,
65
- 0.1,
66
- 0.3,
67
- 0.25,
68
- 0.15,
69
- 0.1,
70
- 0.05
71
- ],
72
  "image_min_size": 128,
73
- "image_max_size": 512,
74
  "image_base_size": 256,
75
- "video_scales": [
76
- [
77
- 128,
78
- 128
79
- ],
80
- [
81
- 192,
82
- 192
83
- ],
84
- [
85
- 256,
86
- 256
87
- ],
88
- [
89
- 320,
90
- 320
91
- ],
92
- [
93
- 384,
94
- 384
95
- ]
96
- ],
97
- "video_scale_probs": [
98
- 0.1,
99
- 0.2,
100
- 0.35,
101
- 0.25,
102
- 0.1
103
- ],
104
  "video_min_size": 128,
105
- "video_max_size": 384,
106
- "video_base_size": 256,
107
- "video_frame_scales": [
108
- 8,
109
- 12,
110
- 16,
111
- 20,
112
- 24,
113
- 32
114
- ],
115
- "video_frame_scale_probs": [
116
- 0.1,
117
- 0.15,
118
- 0.3,
119
- 0.2,
120
- 0.15,
121
- 0.1
122
- ],
123
  "video_min_frames": 8,
124
- "video_max_frames": 32,
125
  "video_base_frames": 16,
126
- "multi_scale_strategy": "random",
127
- "multi_scale_warmup_epochs": 5,
 
 
 
128
  "generation_supported_sizes": [
 
129
  256,
130
  320,
131
- 384,
132
- 448,
133
- 512
134
  ],
135
  "generation_supported_frames": [
136
  8,
137
  12,
138
  16,
139
  20,
140
- 24,
141
- 32
142
  ],
143
  "enable_generation": true,
144
  "generation_latent_channels": 4,
@@ -155,7 +87,8 @@
155
  "generation_video_use_temporal_moe": true,
156
  "audio_sample_rate": 16000,
157
  "audio_n_mels": 80,
158
- "audio_max_length": 1000,
 
159
  "audio_num_speakers": 256,
160
  "use_raw_waveform": true,
161
  "audio_kv_lora_rank": 256,
@@ -195,5 +128,10 @@
195
  "has_video_generator": true,
196
  "has_cross_attention": true,
197
  "lora_applied": true,
198
- "architecture_version": 2
 
 
 
 
 
199
  }
 
1
  {
2
+ "model_type": "xoron",
3
  "model_name": "Xoron-Dev-MultiMoE",
4
  "hidden_size": 1024,
5
  "num_layers": 12,
 
30
  "use_video_temporal_moe": true,
31
  "num_video_encoder_layers": 4,
32
  "num_video_experts": 4,
33
+ "use_video_vidtok": true,
34
+ "vidtok_latent_channels": 4,
35
+ "vidtok_temporal_compression": 4,
36
+ "vidtok_spatial_compression": 8,
37
+ "vidtok_causal": true,
38
+ "vidtok_use_fsq": false,
39
+ "use_video_titok": true,
40
+ "num_video_titok_tokens": 64,
41
+ "num_video_titok_layers": 2,
42
+ "num_video_titok_heads": 8,
43
+ "video_titok_dropout": 0.1,
44
  "use_multi_scale": true,
45
+ "use_continuous_scale": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  "image_min_size": 128,
47
+ "image_max_size": 384,
48
  "image_base_size": 256,
49
+ "image_size_step": 32,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  "video_min_size": 128,
51
+ "video_max_size": 320,
52
+ "video_base_size": 192,
53
+ "video_size_step": 32,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  "video_min_frames": 8,
55
+ "video_max_frames": 24,
56
  "video_base_frames": 16,
57
+ "video_frame_step": 4,
58
+ "multi_scale_strategy": "adaptive",
59
+ "multi_scale_warmup_epochs": 3,
60
+ "adaptive_scale_oom_penalty": 0.5,
61
+ "adaptive_scale_success_boost": 0.1,
62
  "generation_supported_sizes": [
63
+ 192,
64
  256,
65
  320,
66
+ 384
 
 
67
  ],
68
  "generation_supported_frames": [
69
  8,
70
  12,
71
  16,
72
  20,
73
+ 24
 
74
  ],
75
  "enable_generation": true,
76
  "generation_latent_channels": 4,
 
87
  "generation_video_use_temporal_moe": true,
88
  "audio_sample_rate": 16000,
89
  "audio_n_mels": 80,
90
+ "audio_max_length": 625,
91
+ "audio_max_waveform_samples": 160000,
92
  "audio_num_speakers": 256,
93
  "use_raw_waveform": true,
94
  "audio_kv_lora_rank": 256,
 
128
  "has_video_generator": true,
129
  "has_cross_attention": true,
130
  "lora_applied": true,
131
+ "architecture_version": 2,
132
+ "auto_map": {
133
+ "AutoConfig": "configuration_xoron.XoronConfig",
134
+ "AutoModel": "modeling_xoron.XoronModel",
135
+ "AutoModelForCausalLM": "modeling_xoron.XoronForCausalLM"
136
+ }
137
  }
configuration_xoron.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Xoron Model Configuration for HuggingFace Transformers.
3
+
4
+ This module provides a HuggingFace-compatible configuration class for the Xoron
5
+ multimodal model. It inherits from PreTrainedConfig to enable:
6
+ - Loading via AutoConfig
7
+ - Saving/loading with save_pretrained/from_pretrained
8
+ - Hub integration with push_to_hub
9
+
10
+ Usage:
11
+ from transformers import AutoConfig
12
+ config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
13
+ """
14
+
15
+ from transformers import PreTrainedConfig
16
+ from typing import List, Tuple, Union
17
+
18
+
19
+ class XoronConfig(PreTrainedConfig):
20
+ """
21
+ Configuration class for Xoron-Dev multimodal model.
22
+
23
+ This is a HuggingFace-compatible configuration that stores all the parameters
24
+ needed to instantiate a XoronMultimodalModel.
25
+
26
+ Args:
27
+ model_name (`str`, *optional*, defaults to `"Xoron-Dev-MultiMoE"`):
28
+ Name of the model.
29
+ hidden_size (`int`, *optional*, defaults to 1024):
30
+ Dimension of the hidden representations.
31
+ num_layers (`int`, *optional*, defaults to 12):
32
+ Number of transformer layers.
33
+ num_heads (`int`, *optional*, defaults to 16):
34
+ Number of attention heads.
35
+ intermediate_size (`int`, *optional*, defaults to 2048):
36
+ Dimension of the MLP intermediate layer.
37
+ vocab_size (`int`, *optional*, defaults to 151643):
38
+ Vocabulary size (Qwen2.5 tokenizer).
39
+ max_position_embeddings (`int`, *optional*, defaults to 131072):
40
+ Maximum sequence length (128K context).
41
+
42
+ SOTA Features:
43
+ - MLA (Multi-Head Latent Attention) for compressed KV cache
44
+ - MoE with shared expert isolation (DeepSeek-style)
45
+ - Ring Attention for distributed 128K+ context
46
+ - YaRN/LongRoPE for superior long-context extrapolation
47
+ - LoRA variants (rsLoRA, DoRA, LoRA+)
48
+ - Perceiver Resampler for vision projection
49
+ - Cross-attention for multimodal fusion
50
+ - MoE-DiT with Flow Matching for image generation
51
+ - 3D-RoPE + 3D Causal Transformers for video generation
52
+ - TiTok-style 1D tokenization for vision encoding
53
+ - VidTok-style 1D tokenization for video encoding
54
+ - VideoTiTokTokenizer for efficient video token compression
55
+ - Dual-stream attention for symmetric processing
56
+ - Conformer audio encoder/decoder
57
+ - FP16-native numerical stability
58
+ - Multi-scale training for variable resolution handling
59
+ """
60
+
61
+ model_type = "xoron"
62
+
63
+ def __init__(
64
+ self,
65
+ # Model identification
66
+ model_name: str = "Xoron-Dev-MultiMoE",
67
+
68
+ # LLM Architecture
69
+ hidden_size: int = 1024,
70
+ num_layers: int = 12,
71
+ num_heads: int = 16,
72
+ intermediate_size: int = 2048,
73
+ vocab_size: int = 151643,
74
+ max_position_embeddings: int = 131072,
75
+ rms_norm_eps: float = 1e-6,
76
+
77
+ # Ring Attention
78
+ use_ring_attention: bool = True,
79
+ ring_attention_chunk_size: int = 4096,
80
+
81
+ # Tie word embeddings
82
+ tie_word_embeddings: bool = True,
83
+
84
+ # MoE Configuration
85
+ use_moe: bool = True,
86
+ num_experts: int = 8,
87
+ num_experts_per_tok: int = 2,
88
+ moe_layer_freq: int = 2,
89
+ use_shared_expert: bool = True,
90
+ moe_capacity_factor: float = 1.25,
91
+ use_aux_lossless: bool = True,
92
+
93
+ # Vision Configuration
94
+ vision_model_name: str = "google/siglip-so400m-patch14-384",
95
+ freeze_vision: bool = False,
96
+ num_vision_tokens: int = 64,
97
+ projector_type: str = "perceiver",
98
+
99
+ # Vision Encoder SOTA Features
100
+ use_vision_dual_stream: bool = True,
101
+ use_vision_titok: bool = True,
102
+ num_vision_titok_tokens: int = 256,
103
+ num_vision_dual_stream_layers: int = 2,
104
+
105
+ # Video Encoder SOTA Features
106
+ use_video_3d_rope: bool = True,
107
+ use_video_temporal_moe: bool = True,
108
+ num_video_encoder_layers: int = 4,
109
+ num_video_experts: int = 4,
110
+ use_video_vidtok: bool = True,
111
+ vidtok_latent_channels: int = 4,
112
+ vidtok_temporal_compression: int = 4,
113
+ vidtok_spatial_compression: int = 8,
114
+ vidtok_causal: bool = True,
115
+ vidtok_use_fsq: bool = False,
116
+
117
+ # VideoTiTokTokenizer Configuration (SOTA: TiTok-style 1D tokenization for video)
118
+ use_video_titok: bool = True,
119
+ num_video_titok_tokens: int = 64,
120
+ num_video_titok_layers: int = 2,
121
+ num_video_titok_heads: int = 8,
122
+ video_titok_dropout: float = 0.1,
123
+
124
+ # Continuous-Scale Training Configuration
125
+ use_multi_scale: bool = True,
126
+ use_continuous_scale: bool = True,
127
+ image_min_size: int = 128,
128
+ image_max_size: int = 384,
129
+ image_base_size: int = 256,
130
+ image_size_step: int = 32,
131
+ video_min_size: int = 128,
132
+ video_max_size: int = 320,
133
+ video_base_size: int = 192,
134
+ video_size_step: int = 32,
135
+ video_min_frames: int = 8,
136
+ video_max_frames: int = 24,
137
+ video_base_frames: int = 16,
138
+ video_frame_step: int = 4,
139
+ multi_scale_strategy: str = "adaptive",
140
+ multi_scale_warmup_epochs: int = 3,
141
+ adaptive_scale_oom_penalty: float = 0.5,
142
+ adaptive_scale_success_boost: float = 0.1,
143
+ generation_supported_sizes: Union[List[int], Tuple[int, ...]] = (192, 256, 320, 384),
144
+ generation_supported_frames: Union[List[int], Tuple[int, ...]] = (8, 12, 16, 20, 24),
145
+
146
+ # Image Generation Configuration
147
+ enable_generation: bool = True,
148
+ generation_latent_channels: int = 4,
149
+ generation_base_channels: int = 128,
150
+ generation_inference_steps: int = 50,
151
+ generation_cfg_scale: float = 7.5,
152
+ generation_use_flow_matching: bool = True,
153
+ generation_num_experts: int = 4,
154
+ generation_use_dual_stream: bool = True,
155
+
156
+ # Video Generation Configuration
157
+ generation_video_cfg_scale: float = 7.5,
158
+ generation_video_use_flow_matching: bool = True,
159
+ generation_video_num_experts: int = 4,
160
+ generation_video_use_3d_rope: bool = True,
161
+ generation_video_use_temporal_moe: bool = True,
162
+
163
+ # Audio Configuration
164
+ audio_sample_rate: int = 16000,
165
+ audio_n_mels: int = 80,
166
+ audio_max_length: int = 625, # Max mel frames (10 seconds at 16kHz with hop=256)
167
+ audio_max_waveform_samples: int = 160000, # Max raw waveform (10 seconds at 16kHz)
168
+ audio_num_speakers: int = 256,
169
+ use_raw_waveform: bool = True,
170
+ audio_kv_lora_rank: int = 256,
171
+ audio_speaker_embed_dim: int = 256,
172
+ use_mas: bool = True,
173
+ use_in_context_audio_prompting: bool = True,
174
+
175
+ # Tokenizer Configuration
176
+ tokenizer_name: str = "Qwen/Qwen2.5-1.5B",
177
+
178
+ # LoRA Configuration
179
+ use_lora: bool = True,
180
+ lora_r: int = 32,
181
+ lora_alpha: int = 64,
182
+ lora_dropout: float = 0.05,
183
+ lora_target_modules: Union[List[str], Tuple[str, ...]] = (
184
+ 'q_proj', 'k_proj', 'v_proj', 'o_proj',
185
+ 'gate_proj', 'up_proj', 'down_proj',
186
+ ),
187
+ train_lora_only: bool = False,
188
+ use_rslora: bool = True,
189
+ use_dora: bool = False,
190
+ lora_plus_lr_ratio: float = 4.0,
191
+
192
+ # Cross-Attention Configuration
193
+ use_cross_attention: bool = True,
194
+ cross_attention_layers: int = 4,
195
+ cross_attention_heads: int = 8,
196
+ cross_attention_dropout: float = 0.1,
197
+
198
+ # Flash Attention Configuration
199
+ use_flash_attention: bool = True,
200
+
201
+ # Architecture flags (set during save to track what components exist)
202
+ has_audio_encoder: bool = True,
203
+ has_audio_decoder: bool = True,
204
+ has_waveform_decoder: bool = True,
205
+ has_vision_encoder: bool = True,
206
+ has_video_encoder: bool = True,
207
+ has_generator: bool = True,
208
+ has_video_generator: bool = True,
209
+ has_cross_attention: bool = True,
210
+ lora_applied: bool = False,
211
+ architecture_version: int = 2,
212
+
213
+ # Output path (used during training)
214
+ output_dir: str = "./xoron-model",
215
+
216
+ **kwargs,
217
+ ):
218
+ # Call parent init
219
+ super().__init__(**kwargs)
220
+
221
+ # Model identification
222
+ self.model_name = model_name
223
+
224
+ # LLM Architecture
225
+ self.hidden_size = hidden_size
226
+ self.num_layers = num_layers
227
+ self.num_heads = num_heads
228
+ self.intermediate_size = intermediate_size
229
+ self.vocab_size = vocab_size
230
+ self.max_position_embeddings = max_position_embeddings
231
+ self.rms_norm_eps = rms_norm_eps
232
+
233
+ # Ring Attention
234
+ self.use_ring_attention = use_ring_attention
235
+ self.ring_attention_chunk_size = ring_attention_chunk_size
236
+
237
+ # Tie word embeddings
238
+ self.tie_word_embeddings = tie_word_embeddings
239
+
240
+ # MoE Configuration
241
+ self.use_moe = use_moe
242
+ self.num_experts = num_experts
243
+ self.num_experts_per_tok = num_experts_per_tok
244
+ self.moe_layer_freq = moe_layer_freq
245
+ self.use_shared_expert = use_shared_expert
246
+ self.moe_capacity_factor = moe_capacity_factor
247
+ self.use_aux_lossless = use_aux_lossless
248
+
249
+ # Vision Configuration
250
+ self.vision_model_name = vision_model_name
251
+ self.freeze_vision = freeze_vision
252
+ self.num_vision_tokens = num_vision_tokens
253
+ self.projector_type = projector_type
254
+
255
+ # Vision Encoder SOTA Features
256
+ self.use_vision_dual_stream = use_vision_dual_stream
257
+ self.use_vision_titok = use_vision_titok
258
+ self.num_vision_titok_tokens = num_vision_titok_tokens
259
+ self.num_vision_dual_stream_layers = num_vision_dual_stream_layers
260
+
261
+ # Video Encoder SOTA Features
262
+ self.use_video_3d_rope = use_video_3d_rope
263
+ self.use_video_temporal_moe = use_video_temporal_moe
264
+ self.num_video_encoder_layers = num_video_encoder_layers
265
+ self.num_video_experts = num_video_experts
266
+ self.use_video_vidtok = use_video_vidtok
267
+ self.vidtok_latent_channels = vidtok_latent_channels
268
+ self.vidtok_temporal_compression = vidtok_temporal_compression
269
+ self.vidtok_spatial_compression = vidtok_spatial_compression
270
+ self.vidtok_causal = vidtok_causal
271
+ self.vidtok_use_fsq = vidtok_use_fsq
272
+
273
+ # VideoTiTokTokenizer Configuration
274
+ self.use_video_titok = use_video_titok
275
+ self.num_video_titok_tokens = num_video_titok_tokens
276
+ self.num_video_titok_layers = num_video_titok_layers
277
+ self.num_video_titok_heads = num_video_titok_heads
278
+ self.video_titok_dropout = video_titok_dropout
279
+
280
+ # Continuous-Scale Training Configuration
281
+ self.use_multi_scale = use_multi_scale
282
+ self.use_continuous_scale = use_continuous_scale
283
+ self.image_min_size = image_min_size
284
+ self.image_max_size = image_max_size
285
+ self.image_base_size = image_base_size
286
+ self.image_size_step = image_size_step
287
+ self.video_min_size = video_min_size
288
+ self.video_max_size = video_max_size
289
+ self.video_base_size = video_base_size
290
+ self.video_size_step = video_size_step
291
+ self.video_min_frames = video_min_frames
292
+ self.video_max_frames = video_max_frames
293
+ self.video_base_frames = video_base_frames
294
+ self.video_frame_step = video_frame_step
295
+ self.multi_scale_strategy = multi_scale_strategy
296
+ self.multi_scale_warmup_epochs = multi_scale_warmup_epochs
297
+ self.adaptive_scale_oom_penalty = adaptive_scale_oom_penalty
298
+ self.adaptive_scale_success_boost = adaptive_scale_success_boost
299
+ self.generation_supported_sizes = list(generation_supported_sizes) if not isinstance(generation_supported_sizes, list) else generation_supported_sizes
300
+ self.generation_supported_frames = list(generation_supported_frames) if not isinstance(generation_supported_frames, list) else generation_supported_frames
301
+
302
+ # Image Generation Configuration
303
+ self.enable_generation = enable_generation
304
+ self.generation_latent_channels = generation_latent_channels
305
+ self.generation_base_channels = generation_base_channels
306
+ self.generation_inference_steps = generation_inference_steps
307
+ self.generation_cfg_scale = generation_cfg_scale
308
+ self.generation_use_flow_matching = generation_use_flow_matching
309
+ self.generation_num_experts = generation_num_experts
310
+ self.generation_use_dual_stream = generation_use_dual_stream
311
+
312
+ # Video Generation Configuration
313
+ self.generation_video_cfg_scale = generation_video_cfg_scale
314
+ self.generation_video_use_flow_matching = generation_video_use_flow_matching
315
+ self.generation_video_num_experts = generation_video_num_experts
316
+ self.generation_video_use_3d_rope = generation_video_use_3d_rope
317
+ self.generation_video_use_temporal_moe = generation_video_use_temporal_moe
318
+
319
+ # Audio Configuration
320
+ self.audio_sample_rate = audio_sample_rate
321
+ self.audio_n_mels = audio_n_mels
322
+ self.audio_max_length = audio_max_length
323
+ self.audio_max_waveform_samples = audio_max_waveform_samples
324
+ self.audio_num_speakers = audio_num_speakers
325
+ self.use_raw_waveform = use_raw_waveform
326
+ self.audio_kv_lora_rank = audio_kv_lora_rank
327
+ self.audio_speaker_embed_dim = audio_speaker_embed_dim
328
+ self.use_mas = use_mas
329
+ self.use_in_context_audio_prompting = use_in_context_audio_prompting
330
+
331
+ # Tokenizer Configuration
332
+ self.tokenizer_name = tokenizer_name
333
+
334
+ # LoRA Configuration
335
+ self.use_lora = use_lora
336
+ self.lora_r = lora_r
337
+ self.lora_alpha = lora_alpha
338
+ self.lora_dropout = lora_dropout
339
+ self.lora_target_modules = list(lora_target_modules) if not isinstance(lora_target_modules, list) else lora_target_modules
340
+ self.train_lora_only = train_lora_only
341
+ self.use_rslora = use_rslora
342
+ self.use_dora = use_dora
343
+ self.lora_plus_lr_ratio = lora_plus_lr_ratio
344
+
345
+ # Cross-Attention Configuration
346
+ self.use_cross_attention = use_cross_attention
347
+ self.cross_attention_layers = cross_attention_layers
348
+ self.cross_attention_heads = cross_attention_heads
349
+ self.cross_attention_dropout = cross_attention_dropout
350
+
351
+ # Flash Attention Configuration
352
+ self.use_flash_attention = use_flash_attention
353
+
354
+ # Architecture flags
355
+ self.has_audio_encoder = has_audio_encoder
356
+ self.has_audio_decoder = has_audio_decoder
357
+ self.has_waveform_decoder = has_waveform_decoder
358
+ self.has_vision_encoder = has_vision_encoder
359
+ self.has_video_encoder = has_video_encoder
360
+ self.has_generator = has_generator
361
+ self.has_video_generator = has_video_generator
362
+ self.has_cross_attention = has_cross_attention
363
+ self.lora_applied = lora_applied
364
+ self.architecture_version = architecture_version
365
+
366
+ # Output path
367
+ self.output_dir = output_dir
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00ccf22a5f8b57898225721289813d8ed505792fb72da5cee876361dce723121
3
  size 174191400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de3fa394f8f7b43e6b69cc072d57ebd48d0829237b8db79c488b535322fcbe6e
3
  size 174191400
generator.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b89dbdfd553807dafa1575d850aa47b60edbc4cb851bf8fa2e414e658c3e169
3
  size 629440508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ea1e8367259016a8f378aeb772561bd6388e2e50035ae69ba5ce3f9a0b7a47b
3
  size 629440508
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c66c683c33908f29d47b1a83b7a463d695da78c4e745e57f26ffc2633b8dd164
3
  size 1506831304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc371a972f367b91db376d5e6270cb218da9a8680c4f6172cc171e181464759a
3
  size 1506831304
modality_markers.safetensors CHANGED
Binary files a/modality_markers.safetensors and b/modality_markers.safetensors differ
 
modeling_xoron.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Xoron Model for HuggingFace Transformers.
3
+
4
+ This module provides a HuggingFace-compatible model class for the Xoron
5
+ multimodal model. It inherits from PreTrainedModel to enable:
6
+ - Loading via AutoModel
7
+ - Saving/loading with save_pretrained/from_pretrained
8
+ - Hub integration with push_to_hub
9
+ - trust_remote_code support
10
+
11
+ Usage:
12
+ from transformers import AutoModel
13
+ model = AutoModel.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
14
+ """
15
+
16
+ import os
17
+ import sys
18
+ import json
19
+ import logging
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from typing import Optional, Dict, List, Union, Tuple
24
+ from transformers import PreTrainedModel
25
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
26
+
27
+ # Import configuration - handle both package and standalone imports
28
+ try:
29
+ from .configuration_xoron import XoronConfig
30
+ except ImportError:
31
+ from configuration_xoron import XoronConfig
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # FP16 safe max value
36
+ MAX_HIDDEN = 10000.0
37
+
38
+
39
+ def safe_clamp_tensor(x: torch.Tensor, max_val: float = MAX_HIDDEN) -> torch.Tensor:
40
+ """Clamp tensor values for FP16 safety."""
41
+ if x is None or x.numel() == 0:
42
+ return x
43
+ x = torch.nan_to_num(x, nan=0.0, posinf=max_val, neginf=-max_val)
44
+ return x.clamp(-max_val, max_val)
45
+
46
+
47
+ class XoronPreTrainedModel(PreTrainedModel):
48
+ """
49
+ Base class for Xoron models providing HuggingFace integration.
50
+
51
+ This is the base class that provides weight initialization and
52
+ a simple interface for loading pretrained models.
53
+ """
54
+
55
+ config_class = XoronConfig
56
+ base_model_prefix = "xoron"
57
+ supports_gradient_checkpointing = True
58
+ _no_split_modules = ["XoronMultimodalModel"]
59
+ _skip_keys_device_placement = "past_key_values"
60
+
61
+ def _init_weights(self, module):
62
+ """Initialize the weights."""
63
+ std = 0.02
64
+ if isinstance(module, nn.Linear):
65
+ module.weight.data.normal_(mean=0.0, std=std)
66
+ if module.bias is not None:
67
+ module.bias.data.zero_()
68
+ elif isinstance(module, nn.Embedding):
69
+ module.weight.data.normal_(mean=0.0, std=std)
70
+ if module.padding_idx is not None:
71
+ module.weight.data[module.padding_idx].zero_()
72
+ elif isinstance(module, nn.LayerNorm):
73
+ module.bias.data.zero_()
74
+ module.weight.data.fill_(1.0)
75
+
76
+
77
+ class XoronModel(XoronPreTrainedModel):
78
+ """
79
+ Xoron Multimodal Model for HuggingFace.
80
+
81
+ This is a wrapper around the internal XoronMultimodalModel that provides
82
+ HuggingFace compatibility for loading via AutoModel with trust_remote_code=True.
83
+
84
+ The model supports:
85
+ - Image/video understanding (SigLIP encoder)
86
+ - Text generation (MoE LLM)
87
+ - Image/video generation (MobileDiffusion)
88
+ - Voice understanding and generation (ASR/TTS)
89
+ - Cross-attention for multimodal fusion
90
+ - LoRA support for efficient fine-tuning
91
+
92
+ Example:
93
+ >>> from transformers import AutoModel, AutoConfig
94
+ >>> config = AutoConfig.from_pretrained("your-repo/xoron", trust_remote_code=True)
95
+ >>> model = AutoModel.from_pretrained("your-repo/xoron", trust_remote_code=True)
96
+ >>> # Forward pass
97
+ >>> outputs = model(input_ids=input_ids, pixel_values=images)
98
+ """
99
+
100
+ def __init__(self, config: XoronConfig):
101
+ super().__init__(config)
102
+ self.config = config
103
+
104
+ # Import the internal model - this handles all the actual implementation
105
+ # We use lazy import to avoid circular dependencies
106
+ self._internal_model = None
107
+ self._internal_config = None
108
+
109
+ def _ensure_internal_model(self):
110
+ """Lazily initialize the internal model."""
111
+ if self._internal_model is None:
112
+ # Convert HF config to internal config
113
+ # Try importing from the Xoron-Dev package (if installed)
114
+ # or from the local directory structure
115
+ try:
116
+ from config.model_config import XoronConfig as InternalConfig
117
+ except ImportError:
118
+ try:
119
+ # Try alternative import path for when running from HuggingFace Hub
120
+ import importlib.util
121
+ import sys
122
+
123
+ # Get the directory containing this file
124
+ current_dir = os.path.dirname(os.path.abspath(__file__))
125
+
126
+ # Add to path if not already there
127
+ if current_dir not in sys.path:
128
+ sys.path.insert(0, current_dir)
129
+
130
+ # Try importing again
131
+ from config.model_config import XoronConfig as InternalConfig
132
+ except ImportError:
133
+ raise ImportError(
134
+ "Could not import XoronConfig from config.model_config. "
135
+ "Please install the Xoron-Dev package first:\n"
136
+ " pip install git+https://github.com/nigfuapp-web/Xoron-Dev.git@beta\n"
137
+ "Or clone the repository and install locally:\n"
138
+ " git clone -b beta https://github.com/nigfuapp-web/Xoron-Dev.git\n"
139
+ " cd Xoron-Dev && pip install -e ."
140
+ )
141
+
142
+ # Create internal config from HF config
143
+ config_dict = {k: v for k, v in self.config.to_dict().items()
144
+ if not k.startswith('_') and k not in ['transformers_version', 'model_type', 'torch_dtype', 'auto_map']}
145
+
146
+ # Handle tuple conversions
147
+ if 'lora_target_modules' in config_dict and isinstance(config_dict['lora_target_modules'], list):
148
+ config_dict['lora_target_modules'] = tuple(config_dict['lora_target_modules'])
149
+ if 'generation_supported_sizes' in config_dict and isinstance(config_dict['generation_supported_sizes'], list):
150
+ config_dict['generation_supported_sizes'] = tuple(config_dict['generation_supported_sizes'])
151
+ if 'generation_supported_frames' in config_dict and isinstance(config_dict['generation_supported_frames'], list):
152
+ config_dict['generation_supported_frames'] = tuple(config_dict['generation_supported_frames'])
153
+
154
+ self._internal_config = InternalConfig.from_dict(config_dict)
155
+
156
+ # Import and create internal model
157
+ try:
158
+ from models.xoron import XoronMultimodalModel
159
+ except ImportError:
160
+ raise ImportError(
161
+ "Could not import XoronMultimodalModel from models.xoron. "
162
+ "Please install the Xoron-Dev package first:\n"
163
+ " pip install git+https://github.com/nigfuapp-web/Xoron-Dev.git@beta\n"
164
+ "Or clone the repository and install locally:\n"
165
+ " git clone -b beta https://github.com/nigfuapp-web/Xoron-Dev.git\n"
166
+ " cd Xoron-Dev && pip install -e ."
167
+ )
168
+
169
+ self._internal_model = XoronMultimodalModel(self._internal_config)
170
+
171
+ @property
172
+ def internal_model(self):
173
+ """Get the internal XoronMultimodalModel."""
174
+ self._ensure_internal_model()
175
+ return self._internal_model
176
+
177
+ def forward(
178
+ self,
179
+ input_ids: torch.Tensor,
180
+ attention_mask: Optional[torch.Tensor] = None,
181
+ pixel_values: Optional[torch.Tensor] = None,
182
+ video_frames: Optional[torch.Tensor] = None,
183
+ audio_features: Optional[torch.Tensor] = None,
184
+ labels: Optional[torch.Tensor] = None,
185
+ output_attentions: Optional[bool] = None,
186
+ output_hidden_states: Optional[bool] = None,
187
+ return_dict: Optional[bool] = None,
188
+ **kwargs,
189
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
190
+ """
191
+ Forward pass for the Xoron multimodal model.
192
+
193
+ Args:
194
+ input_ids: Input token IDs of shape (batch_size, seq_len)
195
+ attention_mask: Attention mask of shape (batch_size, seq_len)
196
+ pixel_values: Image inputs of shape (batch_size, channels, height, width)
197
+ video_frames: Video inputs of shape (batch_size, num_frames, channels, height, width)
198
+ audio_features: Audio inputs (mel spectrogram or raw waveform)
199
+ labels: Labels for language modeling loss
200
+ output_attentions: Whether to return attention weights
201
+ output_hidden_states: Whether to return hidden states
202
+ return_dict: Whether to return a ModelOutput object
203
+
204
+ Returns:
205
+ CausalLMOutputWithPast containing loss, logits, and optionally hidden states
206
+ """
207
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
208
+
209
+ # Ensure internal model is initialized
210
+ self._ensure_internal_model()
211
+
212
+ # Call internal model forward
213
+ outputs = self._internal_model(
214
+ input_ids=input_ids,
215
+ attention_mask=attention_mask,
216
+ pixel_values=pixel_values,
217
+ video_frames=video_frames,
218
+ audio_features=audio_features,
219
+ labels=labels,
220
+ )
221
+
222
+ if not return_dict:
223
+ return (outputs.get('loss'), outputs.get('logits'), outputs.get('hidden_states'))
224
+
225
+ return CausalLMOutputWithPast(
226
+ loss=outputs.get('loss'),
227
+ logits=outputs.get('logits'),
228
+ past_key_values=None,
229
+ hidden_states=outputs.get('hidden_states') if output_hidden_states else None,
230
+ attentions=None,
231
+ )
232
+
233
+ def generate(
234
+ self,
235
+ input_ids: Optional[torch.Tensor] = None,
236
+ pixel_values: Optional[torch.Tensor] = None,
237
+ video_frames: Optional[torch.Tensor] = None,
238
+ audio_features: Optional[torch.Tensor] = None,
239
+ max_new_tokens: int = 512,
240
+ temperature: float = 0.7,
241
+ top_p: float = 0.9,
242
+ top_k: int = 50,
243
+ do_sample: bool = True,
244
+ **kwargs,
245
+ ) -> torch.Tensor:
246
+ """
247
+ Generate text given inputs.
248
+
249
+ Args:
250
+ input_ids: Input token IDs
251
+ pixel_values: Image inputs
252
+ video_frames: Video inputs
253
+ audio_features: Audio inputs
254
+ max_new_tokens: Maximum tokens to generate
255
+ temperature: Sampling temperature
256
+ top_p: Nucleus sampling parameter
257
+ top_k: Top-k sampling parameter
258
+ do_sample: Whether to use sampling
259
+
260
+ Returns:
261
+ Generated token IDs
262
+ """
263
+ self._ensure_internal_model()
264
+
265
+ # Use internal model's generate method if available
266
+ if hasattr(self._internal_model, 'generate'):
267
+ return self._internal_model.generate(
268
+ input_ids=input_ids,
269
+ pixel_values=pixel_values,
270
+ video_frames=video_frames,
271
+ audio_features=audio_features,
272
+ max_new_tokens=max_new_tokens,
273
+ temperature=temperature,
274
+ top_p=top_p,
275
+ top_k=top_k,
276
+ do_sample=do_sample,
277
+ **kwargs,
278
+ )
279
+
280
+ # Fallback to basic autoregressive generation
281
+ return self._basic_generate(
282
+ input_ids=input_ids,
283
+ pixel_values=pixel_values,
284
+ video_frames=video_frames,
285
+ audio_features=audio_features,
286
+ max_new_tokens=max_new_tokens,
287
+ temperature=temperature,
288
+ top_p=top_p,
289
+ top_k=top_k,
290
+ do_sample=do_sample,
291
+ )
292
+
293
+ def _basic_generate(
294
+ self,
295
+ input_ids: torch.Tensor,
296
+ pixel_values: Optional[torch.Tensor] = None,
297
+ video_frames: Optional[torch.Tensor] = None,
298
+ audio_features: Optional[torch.Tensor] = None,
299
+ max_new_tokens: int = 512,
300
+ temperature: float = 0.7,
301
+ top_p: float = 0.9,
302
+ top_k: int = 50,
303
+ do_sample: bool = True,
304
+ ) -> torch.Tensor:
305
+ """Basic autoregressive generation."""
306
+ generated = input_ids.clone()
307
+
308
+ for _ in range(max_new_tokens):
309
+ outputs = self.forward(
310
+ input_ids=generated,
311
+ pixel_values=pixel_values if generated.shape[1] == input_ids.shape[1] else None,
312
+ video_frames=video_frames if generated.shape[1] == input_ids.shape[1] else None,
313
+ audio_features=audio_features if generated.shape[1] == input_ids.shape[1] else None,
314
+ )
315
+
316
+ logits = outputs.logits[:, -1, :]
317
+
318
+ if do_sample:
319
+ # Apply temperature
320
+ logits = logits / temperature
321
+
322
+ # Apply top-k
323
+ if top_k > 0:
324
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
325
+ logits[indices_to_remove] = float('-inf')
326
+
327
+ # Apply top-p (nucleus sampling)
328
+ if top_p < 1.0:
329
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
330
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
331
+ sorted_indices_to_remove = cumulative_probs > top_p
332
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
333
+ sorted_indices_to_remove[..., 0] = 0
334
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
335
+ logits[indices_to_remove] = float('-inf')
336
+
337
+ probs = F.softmax(logits, dim=-1)
338
+ next_token = torch.multinomial(probs, num_samples=1)
339
+ else:
340
+ next_token = torch.argmax(logits, dim=-1, keepdim=True)
341
+
342
+ generated = torch.cat([generated, next_token], dim=1)
343
+
344
+ # Check for EOS token
345
+ if hasattr(self.config, 'eos_token_id') and self.config.eos_token_id is not None:
346
+ if (next_token == self.config.eos_token_id).all():
347
+ break
348
+
349
+ return generated
350
+
351
+ def generate_image(
352
+ self,
353
+ prompt_embeds: torch.Tensor,
354
+ height: int = 256,
355
+ width: int = 256,
356
+ num_inference_steps: int = 50,
357
+ guidance_scale: float = 7.5,
358
+ **kwargs,
359
+ ) -> torch.Tensor:
360
+ """Generate image from text embeddings."""
361
+ self._ensure_internal_model()
362
+ if hasattr(self._internal_model, 'generate_image'):
363
+ return self._internal_model.generate_image(
364
+ prompt_embeds=prompt_embeds,
365
+ height=height,
366
+ width=width,
367
+ num_inference_steps=num_inference_steps,
368
+ guidance_scale=guidance_scale,
369
+ **kwargs,
370
+ )
371
+ raise NotImplementedError("Image generation not available")
372
+
373
+ def generate_video(
374
+ self,
375
+ prompt_embeds: torch.Tensor,
376
+ num_frames: int = 16,
377
+ height: int = 256,
378
+ width: int = 256,
379
+ num_inference_steps: int = 50,
380
+ guidance_scale: float = 7.5,
381
+ **kwargs,
382
+ ) -> torch.Tensor:
383
+ """Generate video from text embeddings."""
384
+ self._ensure_internal_model()
385
+ if hasattr(self._internal_model, 'generate_video'):
386
+ return self._internal_model.generate_video(
387
+ prompt_embeds=prompt_embeds,
388
+ num_frames=num_frames,
389
+ height=height,
390
+ width=width,
391
+ num_inference_steps=num_inference_steps,
392
+ guidance_scale=guidance_scale,
393
+ **kwargs,
394
+ )
395
+ raise NotImplementedError("Video generation not available")
396
+
397
+ def generate_audio(
398
+ self,
399
+ text_embeds: torch.Tensor,
400
+ speaker_embedding: Optional[torch.Tensor] = None,
401
+ max_length: int = 1000,
402
+ **kwargs,
403
+ ) -> torch.Tensor:
404
+ """Generate audio from text embeddings (TTS)."""
405
+ self._ensure_internal_model()
406
+ if hasattr(self._internal_model, 'generate_audio'):
407
+ return self._internal_model.generate_audio(
408
+ text_embeds=text_embeds,
409
+ speaker_embedding=speaker_embedding,
410
+ max_length=max_length,
411
+ **kwargs,
412
+ )
413
+ raise NotImplementedError("Audio generation not available")
414
+
415
+ def encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
416
+ """Encode image to embeddings."""
417
+ self._ensure_internal_model()
418
+ return self._internal_model.encode_image(pixel_values)
419
+
420
+ def encode_video(self, video_frames: torch.Tensor) -> torch.Tensor:
421
+ """Encode video to embeddings."""
422
+ self._ensure_internal_model()
423
+ return self._internal_model.encode_video(video_frames)
424
+
425
+ def encode_audio(self, audio_features: torch.Tensor) -> torch.Tensor:
426
+ """Encode audio to embeddings."""
427
+ self._ensure_internal_model()
428
+ return self._internal_model.encode_audio(audio_features)
429
+
430
+ @classmethod
431
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
432
+ """
433
+ Load pretrained model from HuggingFace Hub or local path.
434
+
435
+ This method handles loading the model weights from component files
436
+ created by the save_pretrained method.
437
+ """
438
+ # First load config and create model shell
439
+ config = kwargs.pop('config', None)
440
+ if config is None:
441
+ config = XoronConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
442
+
443
+ model = cls(config)
444
+
445
+ # Now load the actual weights
446
+ model._ensure_internal_model()
447
+
448
+ # Check if this is a local path or HF hub
449
+ if os.path.isdir(pretrained_model_name_or_path):
450
+ model_path = pretrained_model_name_or_path
451
+ else:
452
+ # Download from HuggingFace Hub
453
+ from huggingface_hub import snapshot_download
454
+ model_path = snapshot_download(
455
+ repo_id=pretrained_model_name_or_path,
456
+ allow_patterns=["*.safetensors", "*.json", "*.py"],
457
+ )
458
+
459
+ # Load weights into internal model
460
+ model._internal_model.load_pretrained(model_path)
461
+
462
+ return model
463
+
464
+ def save_pretrained(
465
+ self,
466
+ save_directory: str,
467
+ is_main_process: bool = True,
468
+ state_dict: Optional[Dict] = None,
469
+ save_function = None,
470
+ push_to_hub: bool = False,
471
+ max_shard_size: str = "2GB",
472
+ safe_serialization: bool = True,
473
+ **kwargs,
474
+ ):
475
+ """
476
+ Save model to directory in HuggingFace format.
477
+
478
+ This saves both the model weights and the custom code files
479
+ needed for trust_remote_code loading.
480
+ """
481
+ os.makedirs(save_directory, exist_ok=True)
482
+
483
+ # Save config
484
+ self.config.save_pretrained(save_directory)
485
+
486
+ # Save internal model weights
487
+ if self._internal_model is not None:
488
+ self._internal_model.save_pretrained(save_directory)
489
+
490
+ # Copy custom code files for trust_remote_code
491
+ import shutil
492
+ current_dir = os.path.dirname(os.path.abspath(__file__))
493
+
494
+ # Files to copy
495
+ files_to_copy = [
496
+ 'configuration_xoron.py',
497
+ 'modeling_xoron.py',
498
+ ]
499
+
500
+ for filename in files_to_copy:
501
+ src = os.path.join(current_dir, filename)
502
+ dst = os.path.join(save_directory, filename)
503
+ if os.path.exists(src):
504
+ shutil.copy2(src, dst)
505
+
506
+ # Update config.json with auto_map for trust_remote_code
507
+ config_path = os.path.join(save_directory, 'config.json')
508
+ if os.path.exists(config_path):
509
+ with open(config_path, 'r') as f:
510
+ config_dict = json.load(f)
511
+
512
+ config_dict['auto_map'] = {
513
+ 'AutoConfig': 'configuration_xoron.XoronConfig',
514
+ 'AutoModel': 'modeling_xoron.XoronModel',
515
+ }
516
+ config_dict['model_type'] = 'xoron'
517
+
518
+ with open(config_path, 'w') as f:
519
+ json.dump(config_dict, f, indent=2)
520
+
521
+ if push_to_hub:
522
+ self.push_to_hub(save_directory, **kwargs)
523
+
524
+
525
+ class XoronForCausalLM(XoronModel):
526
+ """
527
+ Xoron model with a causal language modeling head.
528
+
529
+ This is an alias for XoronModel that provides compatibility
530
+ with AutoModelForCausalLM.
531
+ """
532
+ pass
533
+
534
+
535
+ # Register for AutoClass - these will be called when the model is loaded
536
+ # with trust_remote_code=True
537
+ XoronConfig.register_for_auto_class()
538
+ XoronModel.register_for_auto_class("AutoModel")
539
+ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
projector.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2090fa0134d9992f2aeb1e1a4267c262f234d7347012ff344b40facf54bb9180
3
  size 52880664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a028b9db9aa5779cd30534d17b166024b305a69dddeb947a4b577d4ba431b0cf
3
  size 52880664
streaming_state.json CHANGED
@@ -1,38 +1,22 @@
1
  {
2
  "epoch": 1,
3
- "unique_samples": 600,
4
- "total_yields": 1200,
5
  "dataset_positions": {
6
- "T2V-Sora-Preferences-2": 50,
7
- "Sora-Physics-Likert": 50,
8
- "Sora-Style-Likert": 50,
9
- "Sora-Alignment-Likert": 50,
10
- "WebVid-10M": 50,
11
- "T2V-Human-Preferences": 50,
12
- "TIP-I2V": 50,
13
- "I2V-Preference-Seedance": 50,
14
- "Pexels-I2V-350k": 50
15
  },
16
  "modality_positions": {
17
  "text": {},
18
- "image": {},
19
- "video": {
20
- "T2V-Sora-Preferences-2": 50,
21
- "Sora-Physics-Likert": 50,
22
- "Sora-Style-Likert": 50,
23
- "Sora-Alignment-Likert": 50,
24
- "WebVid-10M": 50,
25
- "T2V-Human-Preferences": 50,
26
- "TIP-I2V": 50,
27
- "I2V-Preference-Seedance": 50,
28
- "Pexels-I2V-350k": 50
29
  },
 
30
  "audio": {}
31
  },
32
  "modality_counts": {
33
  "text": 0,
34
- "image": 0,
35
- "video": 600,
36
  "audio": 0
37
  },
38
  "last_modality": null
 
1
  {
2
  "epoch": 1,
3
+ "unique_samples": 1,
4
+ "total_yields": 2,
5
  "dataset_positions": {
6
+ "Flickr8k": 1
 
 
 
 
 
 
 
 
7
  },
8
  "modality_positions": {
9
  "text": {},
10
+ "image": {
11
+ "Flickr8k": 1
 
 
 
 
 
 
 
 
 
12
  },
13
+ "video": {},
14
  "audio": {}
15
  },
16
  "modality_counts": {
17
  "text": 0,
18
+ "image": 1,
19
+ "video": 0,
20
  "audio": 0
21
  },
22
  "last_modality": null
trainer_state.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
- "best_metric": 5.7690568415215235,
4
  "epoch": 1,
5
  "epochs_completed": 1,
6
- "global_step": 75,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
- "max_steps": 75,
12
  "num_train_epochs": 1,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
@@ -17,16 +17,16 @@
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
19
  "vision",
20
- "video",
21
  "llm",
22
  "cross_attention",
23
- "video_generation",
24
  "modality_markers"
25
  ],
26
  "frozen_components": [
 
27
  "audio",
28
  "speech",
29
- "image_generation"
30
  ],
31
  "trial_name": null,
32
  "trial_params": null
 
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
+ "best_metric": 12.625781536102295,
4
  "epoch": 1,
5
  "epochs_completed": 1,
6
+ "global_step": 0,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
+ "max_steps": 0,
12
  "num_train_epochs": 1,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
 
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
19
  "vision",
 
20
  "llm",
21
  "cross_attention",
22
+ "image_generation",
23
  "modality_markers"
24
  ],
25
  "frozen_components": [
26
+ "video",
27
  "audio",
28
  "speech",
29
+ "video_generation"
30
  ],
31
  "trial_name": null,
32
  "trial_params": null
training_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:647593a02fec5fd433976f9437423cf476ebbde58661768128bb3a260d7df19c
3
- size 781492737
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9cb126a887fe8972de925a303bc7c7957ee5f7688b418e9511729631dfffeb2
3
+ size 5143
video_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c948b6061b0ecb4072916f2235ae4f82e70de6afaec7c92e062eab816f39d009
3
- size 1701439560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fafb9b809d825639bd20d8efc0c8b62ca224c6c82764f80988ac5dc994d3b44
3
+ size 1923089112
video_generator.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0247a6cd33b7379b130129a2fd83fa1277367cc63057bd6f78996139e1a8a74
3
- size 47250054
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a6b0b9aa6db134da1489ab57b9e73a3f53089362c7d14a08eb6883785ea47f9
3
+ size 61574134
vision_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ec8ec4f98fc89e96a37bb96634ca6de303cc7ebadb0d4e706e3727f947ecd15
3
  size 1000535480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeec0eab24a37b010516e3011104a586ca6ac9cdef6485ac687d14105fce96cd
3
  size 1000535480
waveform_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42452429ed067aab616714ccfc0b1304166da6c21a8e7d8265815451701b6ed4
3
  size 34681076
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f00704edbaf072a99297b4d03330bc7f2c1fae0a45f3ada55e558fa6979a179
3
  size 34681076