Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 7, loss 4.8695)
Browse files
- audio_decoder.safetensors +1 -1
- cross_attention.safetensors +1 -1
- llm.safetensors +1 -1
- model.safetensors.index.json +1 -1
- modeling_xoron.py +53 -0
- streaming_state.json +5 -5
- trainer_state.json +6 -6
- training_state.pt +1 -1
- video_encoder.safetensors +2 -2
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1458410612
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be283b159de0c4a206d250a8791de6c6dd88188cbf4bca57c4ff4f1b0b83ebf7
|
| 3 |
size 1458410612
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:879ba97c8a30d794181570c76bd10ecbb10fb84fabcb10047d7f5d7f944cc707
|
| 3 |
size 174191400
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506832040
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b534cad0e5014cdd5984e8a4bd04771ffc7e701c12cea42b5467e4d051224d9
|
| 3 |
size 1506832040
|
model.safetensors.index.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
-
"total_size":
|
| 4 |
"format": "components"
|
| 5 |
},
|
| 6 |
"weight_map": {
|
|
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
+
"total_size": 7309254542,
|
| 4 |
"format": "components"
|
| 5 |
},
|
| 6 |
"weight_map": {
|
modeling_xoron.py
CHANGED
|
@@ -3824,6 +3824,42 @@ class VideoTiTokTokenizer(nn.Module):
|
|
| 3824 |
print(f" Temporal tokens: {self.num_temporal_tokens}, Content tokens: {self.num_content_tokens}")
|
| 3825 |
print(f" Layers: {num_layers}, Heads: {num_heads}")
|
| 3826 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3827 |
def _add_3d_pos_encoding(self, x: torch.Tensor, num_frames: int, patches_per_frame: int) -> torch.Tensor:
|
| 3828 |
"""Add 3D positional encoding (temporal + spatial)."""
|
| 3829 |
B, seq_len, D = x.shape
|
|
@@ -4023,6 +4059,23 @@ class VideoEncoder(nn.Module):
|
|
| 4023 |
|
| 4024 |
print(f" 🎬 Video encoder: max {max_frames} frames (multi-scale enabled)")
|
| 4025 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4026 |
def _extract_frame_features(self, frames: torch.Tensor) -> torch.Tensor:
|
| 4027 |
"""Extract per-frame features using vision encoder."""
|
| 4028 |
batch_size, num_frames = frames.shape[:2]
|
|
|
|
| 3824 |
print(f" Temporal tokens: {self.num_temporal_tokens}, Content tokens: {self.num_content_tokens}")
|
| 3825 |
print(f" Layers: {num_layers}, Heads: {num_heads}")
|
| 3826 |
|
| 3827 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
    """Checkpoint-loading hook that adapts dynamic frame/token counts.

    When a checkpoint was saved with a different ``max_frames``,
    ``num_temporal_tokens`` or ``num_content_tokens`` than this module,
    the corresponding learned embeddings are linearly interpolated along
    their length dimension before the standard load proceeds, instead of
    failing with a shape mismatch.

    Args:
        state_dict: Mapping of parameter names to tensors being loaded.
            Resized tensors are written back into it in place.
        prefix: Key prefix for this module's parameters.
        local_metadata, strict, missing_keys, unexpected_keys, error_msgs:
            Standard ``nn.Module._load_from_state_dict`` arguments,
            forwarded unchanged to ``super()``.
    """
    def _resize_length(tensor, target_len):
        # [1, L, D] -> [1, D, L] so F.interpolate resizes over L, then back.
        resized = F.interpolate(tensor.transpose(1, 2), size=target_len,
                                mode='linear', align_corners=False)
        return resized.transpose(1, 2)

    # 1. temporal_pos: [1, max_frames, 1, hidden_size] — squeeze/unsqueeze the
    #    singleton dim so the shared [1, L, D] helper applies.
    t_pos_key = prefix + 'temporal_pos'
    if t_pos_key in state_dict:
        ckpt_pos = state_dict[t_pos_key]
        if ckpt_pos.shape != self.temporal_pos.shape:
            print(f" ⚠️ VideoTiTokTokenizer: Interpolating {t_pos_key} from {ckpt_pos.shape[1]} to {self.max_frames} frames.")
            state_dict[t_pos_key] = _resize_length(ckpt_pos.squeeze(2), self.max_frames).unsqueeze(2)

    # 2./3. temporal_queries and content_queries share shape [1, L, hidden_size];
    #       handle both with one loop instead of two copy-pasted branches.
    for name, target_len in (('temporal_queries', self.num_temporal_tokens),
                             ('content_queries', self.num_content_tokens)):
        key = prefix + name
        if key in state_dict:
            ckpt_query = state_dict[key]
            if ckpt_query.shape != getattr(self, name).shape:
                print(f" ⚠️ VideoTiTokTokenizer: Interpolating {key} from {ckpt_query.shape[1]} to {target_len} tokens.")
                state_dict[key] = _resize_length(ckpt_query, target_len)

    super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
|
| 3862 |
+
|
| 3863 |
def _add_3d_pos_encoding(self, x: torch.Tensor, num_frames: int, patches_per_frame: int) -> torch.Tensor:
|
| 3864 |
"""Add 3D positional encoding (temporal + spatial)."""
|
| 3865 |
B, seq_len, D = x.shape
|
|
|
|
| 4059 |
|
| 4060 |
print(f" 🎬 Video encoder: max {max_frames} frames (multi-scale enabled)")
|
| 4061 |
|
| 4062 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
    """Checkpoint-loading hook that tolerates dynamic frame counts.

    If the checkpoint's ``frame_pos_embed`` was saved for a different
    number of frames than this encoder's ``max_frames``, it is linearly
    interpolated to the current size before delegating to the standard
    ``nn.Module`` loading logic.
    """
    pos_key = prefix + 'frame_pos_embed'
    saved = state_dict.get(pos_key)
    if saved is not None and saved.shape != self.frame_pos_embed.shape:
        print(f" ⚠️ VideoEncoder: Interpolating {pos_key} from {saved.shape[1]} to {self.max_frames} frames.")
        # F.interpolate resizes the last dim, so go [1, T, D] -> [1, D, T],
        # stretch T to max_frames, then restore the original layout.
        stretched = F.interpolate(saved.transpose(1, 2), size=self.max_frames,
                                  mode='linear', align_corners=False)
        state_dict[pos_key] = stretched.transpose(1, 2)

    super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
|
| 4078 |
+
|
| 4079 |
def _extract_frame_features(self, frames: torch.Tensor) -> torch.Tensor:
|
| 4080 |
"""Extract per-frame features using vision encoder."""
|
| 4081 |
batch_size, num_frames = frames.shape[:2]
|
streaming_state.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
-
"unique_samples":
|
| 4 |
-
"total_yields":
|
| 5 |
"dataset_positions": {
|
| 6 |
"WebSight": 386,
|
| 7 |
"ScienceQA": 364,
|
|
@@ -30,7 +30,7 @@
|
|
| 30 |
"NoRobots": 450,
|
| 31 |
"Synth-LanguageSetup": 200,
|
| 32 |
"Function-Calling-ChatML": 200,
|
| 33 |
-
"Synth-CoT":
|
| 34 |
"Python-Code-18k": 200,
|
| 35 |
"Code-Feedback": 200,
|
| 36 |
"HumanEval-CPP": 164,
|
|
@@ -148,7 +148,7 @@
|
|
| 148 |
"audio": {}
|
| 149 |
},
|
| 150 |
"modality_counts": {
|
| 151 |
-
"text":
|
| 152 |
"image": 0,
|
| 153 |
"video": 0,
|
| 154 |
"audio": 0
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 48,
|
| 3 |
+
"unique_samples": 50,
|
| 4 |
+
"total_yields": 100,
|
| 5 |
"dataset_positions": {
|
| 6 |
"WebSight": 386,
|
| 7 |
"ScienceQA": 364,
|
|
|
|
| 30 |
"NoRobots": 450,
|
| 31 |
"Synth-LanguageSetup": 200,
|
| 32 |
"Function-Calling-ChatML": 200,
|
| 33 |
+
"Synth-CoT": 550,
|
| 34 |
"Python-Code-18k": 200,
|
| 35 |
"Code-Feedback": 200,
|
| 36 |
"HumanEval-CPP": 164,
|
|
|
|
| 148 |
"audio": {}
|
| 149 |
},
|
| 150 |
"modality_counts": {
|
| 151 |
+
"text": 50,
|
| 152 |
"image": 0,
|
| 153 |
"video": 0,
|
| 154 |
"audio": 0
|
trainer_state.json
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric":
|
| 4 |
-
"epoch":
|
| 5 |
-
"epochs_completed":
|
| 6 |
-
"global_step":
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
-
"max_steps":
|
| 12 |
-
"num_train_epochs":
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
| 15 |
"effective_batch_size": 16,
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 4.869536457061767,
|
| 4 |
+
"epoch": 7,
|
| 5 |
+
"epochs_completed": 7,
|
| 6 |
+
"global_step": 42,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
+
"max_steps": 42,
|
| 12 |
+
"num_train_epochs": 7,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
| 15 |
"effective_batch_size": 16,
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1514911851
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b7335b590f20d3da7af0b586bde304e2566d2306489e64d38993d1cd20df627
|
| 3 |
size 1514911851
|
video_encoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f79301c42561645d3000aa5d61769c0dd5089c6120c49a3fdca5a7eb5af9d2c9
|
| 3 |
+
size 1922978512
|