Update model weights after training (epoch 1, loss 3.3989)

Files changed (8) hide show

audio_decoder.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4a240617216d0644ac615eed664398b69c732420dba3c0121a5c1344652fb7fa
 size 1458410612

 version https://git-lfs.github.com/spec/v1
+oid sha256:d817de2ba9f31539807a8d57d1ad5441f33794329008e0a6b9e01764b831f909
 size 1458410612

config.json CHANGED Viewed

@@ -49,7 +49,7 @@
   "image_size_step": 32,
   "video_min_size": 128,
   "video_max_size": 320,
-  "video_base_size": 320,
   "video_size_step": 32,
   "video_min_frames": 8,
   "video_max_frames": 8,

   "image_size_step": 32,
   "video_min_size": 128,
   "video_max_size": 320,
+  "video_base_size": 128,
   "video_size_step": 32,
   "video_min_frames": 8,
   "video_max_frames": 8,

cross_attention.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8d5b0a5a4040b026f16479ff817aadab4e42a2281750c7728f70aba9fd988a1f
 size 174191400

 version https://git-lfs.github.com/spec/v1
+oid sha256:6beff1e6cfb37ea461f112bf9d138ca007c01e24ac716b997a92000813aa8de5
 size 174191400

llm.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eeebe91f597a9aba32227820d24ec97ec34c5680090cd85ad3006eeefb812081
 size 1506832040

 version https://git-lfs.github.com/spec/v1
+oid sha256:b168f1e28965acb01ab0375c81614f3af6cd312b27c630633ce21c555d8ab3b5
 size 1506832040

streaming_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "epoch": 84,
-  "unique_samples": 150,
-  "total_yields": 300,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
@@ -76,15 +76,15 @@
     "Tool-Calls-SingleTurn": 200,
     "Tool-Calls-Multiturn": 200,
     "OpenAssistant": 450,
-    "T2V-Sora-Preferences-2": 600,
-    "T2V-Human-Preferences": 600,
     "Sora-Alignment-Likert": 198,
     "Sora-Style-Likert": 198,
     "I2V-Preference-Seedance": 198,
-    "WebVid-10M": 600,
     "Sora-Physics-Likert": 198,
-    "TIP-I2V": 600,
-    "Pexels-I2V-350k": 600,
     "SmolTalk-OpenHermes": 250,
     "SmolTalk-All": 250,
     "Cosmopedia-AutoMath": 250,
@@ -157,22 +157,22 @@
       "MagicBrush": 386
     },
     "video": {
-      "T2V-Sora-Preferences-2": 600,
-      "T2V-Human-Preferences": 600,
       "Sora-Alignment-Likert": 198,
       "Sora-Style-Likert": 198,
       "I2V-Preference-Seedance": 198,
-      "WebVid-10M": 600,
       "Sora-Physics-Likert": 198,
-      "TIP-I2V": 600,
-      "Pexels-I2V-350k": 600
     },
     "audio": {}
   },
   "modality_counts": {
-    "text": 150,
     "image": 0,
-    "video": 0,
     "audio": 0
   },
   "last_modality": null

 {
+  "epoch": 85,
+  "unique_samples": 400,
+  "total_yields": 800,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
     "Tool-Calls-SingleTurn": 200,
     "Tool-Calls-Multiturn": 200,
     "OpenAssistant": 450,
+    "T2V-Sora-Preferences-2": 650,
+    "T2V-Human-Preferences": 650,
     "Sora-Alignment-Likert": 198,
     "Sora-Style-Likert": 198,
     "I2V-Preference-Seedance": 198,
+    "WebVid-10M": 650,
     "Sora-Physics-Likert": 198,
+    "TIP-I2V": 650,
+    "Pexels-I2V-350k": 650,
     "SmolTalk-OpenHermes": 250,
     "SmolTalk-All": 250,
     "Cosmopedia-AutoMath": 250,
       "MagicBrush": 386
     },
     "video": {
+      "T2V-Sora-Preferences-2": 650,
+      "T2V-Human-Preferences": 650,
       "Sora-Alignment-Likert": 198,
       "Sora-Style-Likert": 198,
       "I2V-Preference-Seedance": 198,
+      "WebVid-10M": 650,
       "Sora-Physics-Likert": 198,
+      "TIP-I2V": 650,
+      "Pexels-I2V-350k": 650
     },
     "audio": {}
   },
   "modality_counts": {
+    "text": 0,
     "image": 0,
+    "video": 250,
     "audio": 0
   },
   "last_modality": null

trainer_state.json CHANGED Viewed

@@ -1,32 +1,32 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 7.186485166748365,
-  "epoch": 7,
-  "epochs_completed": 7,
-  "global_step": 126,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps": 126,
-  "num_train_epochs": 7,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
     "llm",
     "cross_attention",
     "modality_markers"
   ],
   "frozen_components": [
-    "vision",
-    "video",
     "audio",
     "speech",
-    "image_generation",
-    "video_generation"
   ],
   "trial_name": null,
   "trial_params": null

 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
+  "best_metric": 3.398919365755515,
+  "epoch": 1,
+  "epochs_completed": 1,
+  "global_step": 31,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
+  "max_steps": 31,
+  "num_train_epochs": 1,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
+    "vision",
+    "video",
     "llm",
     "cross_attention",
+    "video_generation",
     "modality_markers"
   ],
   "frozen_components": [
     "audio",
     "speech",
+    "image_generation"
   ],
   "trial_name": null,
   "trial_params": null

training_state.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6733079c08d0e2c4e1bec055b3d45e16f552e09e2b8027d98d7dba03554b4300
-size 1514911851

 version https://git-lfs.github.com/spec/v1
+oid sha256:b224a38701068628ea2346719232695d255cff3500d63df4b888e5a94eab7ab4
+size 3426643671

video_generator.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:70eaa4447488bf781bc930d6054b5439f3a21b610f298399b947cf89d457a101
 size 61574134

 version https://git-lfs.github.com/spec/v1
+oid sha256:c17764963de9c77345a4b2ae8b508c3c4c7cb5bf15aa65b7c5239c3e8babc1ce
 size 61574134