Backup-bdg committed on
Commit f5d4f0d · verified · 1 Parent(s): f9dcb77

Update model weights after training (epoch 7, loss 4.8695)

audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c225077ec0e29909d0f390011f666158ae658fa3385cf8032280f5203da09cae
+oid sha256:be283b159de0c4a206d250a8791de6c6dd88188cbf4bca57c4ff4f1b0b83ebf7
 size 1458410612
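The weight shards are tracked with Git LFS, so each diff above and below only swaps the pointer's sha256 oid (and, for the video encoder, the size). A minimal sketch, not part of the commit, for checking a locally pulled shard against its pointer; the helper name and local path are assumptions for illustration:

# Verify a pulled LFS file against the sha256 oid and size from its pointer.
import hashlib
import os

def verify_lfs_file(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the file's byte size and sha256 digest match the pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Values taken from the new audio_decoder.safetensors pointer above.
print(verify_lfs_file(
    "audio_decoder.safetensors",
    "be283b159de0c4a206d250a8791de6c6dd88188cbf4bca57c4ff4f1b0b83ebf7",
    1458410612,
))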
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c5dc29d69984df0e49cf508c56c03b7a18a7a49baf89a414fa3128513d753e7e
+oid sha256:879ba97c8a30d794181570c76bd10ecbb10fb84fabcb10047d7f5d7f944cc707
 size 174191400
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5de86313a868d4108f814a3debd9d1ed31dc72281458ef9c7824b9a4398ce28f
+oid sha256:0b534cad0e5014cdd5984e8a4bd04771ffc7e701c12cea42b5467e4d051224d9
 size 1506832040
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 7309365134,
+    "total_size": 7309254542,
     "format": "components"
   },
   "weight_map": {
modeling_xoron.py CHANGED
@@ -3824,6 +3824,42 @@ class VideoTiTokTokenizer(nn.Module):
         print(f" Temporal tokens: {self.num_temporal_tokens}, Content tokens: {self.num_content_tokens}")
         print(f" Layers: {num_layers}, Heads: {num_heads}")
 
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        """Production-grade hook to handle dynamic frame counts and token counts when loading checkpoints."""
+
+        # 1. Handle temporal_pos: [1, max_frames, 1, hidden_size]
+        t_pos_key = prefix + 'temporal_pos'
+        if t_pos_key in state_dict:
+            ckpt_pos = state_dict[t_pos_key]
+            if ckpt_pos.shape != self.temporal_pos.shape:
+                print(f" ⚠️ VideoTiTokTokenizer: Interpolating {t_pos_key} from {ckpt_pos.shape[1]} to {self.max_frames} frames.")
+                # [1, T, 1, D] -> [1, D, T]
+                ckpt_pos = ckpt_pos.squeeze(2).transpose(1, 2)
+                resized = F.interpolate(ckpt_pos, size=self.max_frames, mode='linear', align_corners=False)
+                state_dict[t_pos_key] = resized.transpose(1, 2).unsqueeze(2)
+
+        # 2. Handle temporal_queries: [1, num_temporal_tokens, hidden_size]
+        t_query_key = prefix + 'temporal_queries'
+        if t_query_key in state_dict:
+            ckpt_query = state_dict[t_query_key]
+            if ckpt_query.shape != self.temporal_queries.shape:
+                print(f" ⚠️ VideoTiTokTokenizer: Interpolating {t_query_key} from {ckpt_query.shape[1]} to {self.num_temporal_tokens} tokens.")
+                ckpt_query = ckpt_query.transpose(1, 2)
+                resized = F.interpolate(ckpt_query, size=self.num_temporal_tokens, mode='linear', align_corners=False)
+                state_dict[t_query_key] = resized.transpose(1, 2)
+
+        # 3. Handle content_queries: [1, num_content_tokens, hidden_size]
+        c_query_key = prefix + 'content_queries'
+        if c_query_key in state_dict:
+            ckpt_query = state_dict[c_query_key]
+            if ckpt_query.shape != self.content_queries.shape:
+                print(f" ⚠️ VideoTiTokTokenizer: Interpolating {c_query_key} from {ckpt_query.shape[1]} to {self.num_content_tokens} tokens.")
+                ckpt_query = ckpt_query.transpose(1, 2)
+                resized = F.interpolate(ckpt_query, size=self.num_content_tokens, mode='linear', align_corners=False)
+                state_dict[c_query_key] = resized.transpose(1, 2)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+
     def _add_3d_pos_encoding(self, x: torch.Tensor, num_frames: int, patches_per_frame: int) -> torch.Tensor:
         """Add 3D positional encoding (temporal + spatial)."""
         B, seq_len, D = x.shape
@@ -4023,6 +4059,23 @@ class VideoEncoder(nn.Module):
 
         print(f" 🎬 Video encoder: max {max_frames} frames (multi-scale enabled)")
 
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        """Production-grade hook to handle dynamic frame counts when loading checkpoints.
+        Interpolates temporal embeddings if the checkpoint frames differ from max_frames.
+        """
+        # Handle frame_pos_embed
+        embed_key = prefix + 'frame_pos_embed'
+        if embed_key in state_dict:
+            ckpt_embed = state_dict[embed_key]
+            if ckpt_embed.shape != self.frame_pos_embed.shape:
+                print(f" ⚠️ VideoEncoder: Interpolating {embed_key} from {ckpt_embed.shape[1]} to {self.max_frames} frames.")
+                # [1, T, D] -> [1, D, T] for interpolation
+                ckpt_embed = ckpt_embed.transpose(1, 2)
+                resized = F.interpolate(ckpt_embed, size=self.max_frames, mode='linear', align_corners=False)
+                state_dict[embed_key] = resized.transpose(1, 2)  # Back to [1, T, D]
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+
     def _extract_frame_features(self, frames: torch.Tensor) -> torch.Tensor:
         """Extract per-frame features using vision encoder."""
         batch_size, num_frames = frames.shape[:2]
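Both new _load_from_state_dict overrides use the same resize trick: move the sequence axis to the last position, linearly interpolate it to the model's current length, then move it back, so a checkpoint saved with a different max_frames or token count still loads. A minimal standalone sketch of that step; the frame counts 16 → 32 and hidden size 512 are illustrative assumptions, not values from the model:

# Resize a [1, T, D] positional embedding along the frame axis, mirroring the
# VideoEncoder.frame_pos_embed path in the hook above.
import torch
import torch.nn.functional as F

ckpt_embed = torch.randn(1, 16, 512)   # embedding as stored in the checkpoint: [1, T_ckpt, D]
target_frames = 32                     # the model's current max_frames

resized = F.interpolate(
    ckpt_embed.transpose(1, 2),        # [1, D, T_ckpt]: interpolate over the last dim
    size=target_frames,
    mode='linear',
    align_corners=False,
).transpose(1, 2)                      # back to [1, T_new, D]

print(resized.shape)                   # torch.Size([1, 32, 512])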
streaming_state.json CHANGED
@@ -1,7 +1,7 @@
 {
-  "epoch": 35,
-  "unique_samples": 400,
-  "total_yields": 800,
+  "epoch": 48,
+  "unique_samples": 50,
+  "total_yields": 100,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
@@ -30,7 +30,7 @@
     "NoRobots": 450,
     "Synth-LanguageSetup": 200,
     "Function-Calling-ChatML": 200,
-    "Synth-CoT": 200,
+    "Synth-CoT": 550,
     "Python-Code-18k": 200,
     "Code-Feedback": 200,
     "HumanEval-CPP": 164,
@@ -148,7 +148,7 @@
     "audio": {}
   },
   "modality_counts": {
-    "text": 400,
+    "text": 50,
     "image": 0,
     "video": 0,
     "audio": 0
trainer_state.json CHANGED
@@ -1,15 +1,15 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 6.958861378133297,
-  "epoch": 5,
-  "epochs_completed": 5,
-  "global_step": 250,
+  "best_metric": 4.869536457061767,
+  "epoch": 7,
+  "epochs_completed": 7,
+  "global_step": 42,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps": 250,
-  "num_train_epochs": 5,
+  "max_steps": 42,
+  "num_train_epochs": 7,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
training_state.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a751ecf22021470154d58846b700d04286522c14cda7393ece31f907eff5a2c7
+oid sha256:5b7335b590f20d3da7af0b586bde304e2566d2306489e64d38993d1cd20df627
 size 1514911851
video_encoder.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f70226e533706675adf13f72c46122854021d13fe388445bc4d6b7495fa64e3a
-size 1923089112
+oid sha256:f79301c42561645d3000aa5d61769c0dd5089c6120c49a3fdca5a7eb5af9d2c9
+size 1922978512