Update model weights after training (epoch 5, loss 6.5644)

Files changed (6) hide show

audio_decoder.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5908d239f6fbfc57829c2cbb1d27f6be5dd66124afde0e57fa61a908465aaeb2
 size 1458410612

 version https://git-lfs.github.com/spec/v1
+oid sha256:e161c50803ebcf38666c109dde1baef4dc92fa5db9967fbd8e72f2b5af392b76
 size 1458410612

cross_attention.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15d8ec9d811b147a55801291ea133030f28ab124bb05ce6e6695198c9ba9983b
 size 174191400

 version https://git-lfs.github.com/spec/v1
+oid sha256:0ba284496e9f5135658519fcb742fe092211dfd5df2aa73ac4e3effa31fb5319
 size 174191400

llm.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b686364d4e646df1bbeb0448c37d3a4da241b0c2e22cb8aec05244d3fa2f5aa7
 size 1506832040

 version https://git-lfs.github.com/spec/v1
+oid sha256:050b46cae242ff76f36def2a463e491fd494e2a8f5ba239229e8cd25851300e5
 size 1506832040

streaming_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "epoch": 62,
-  "unique_samples": 250,
-  "total_yields": 500,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
@@ -86,7 +86,15 @@
     "TIP-I2V": 600,
     "Pexels-I2V-350k": 600,
     "SmolTalk-OpenHermes": 250,
-    "SmolTalk-All": 250
   },
   "modality_positions": {
     "text": {
@@ -123,7 +131,15 @@
       "Tool-Calls-Multiturn": 200,
       "OpenAssistant": 450,
       "SmolTalk-OpenHermes": 250,
-      "SmolTalk-All": 250
     },
     "image": {
       "WebSight": 386,
@@ -148,9 +164,9 @@
     "audio": {}
   },
   "modality_counts": {
-    "text": 0,
     "image": 0,
-    "video": 250,
     "audio": 0
   },
   "last_modality": null

 {
+  "epoch": 71,
+  "unique_samples": 400,
+  "total_yields": 800,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
     "TIP-I2V": 600,
     "Pexels-I2V-350k": 600,
     "SmolTalk-OpenHermes": 250,
+    "SmolTalk-All": 250,
+    "Cosmopedia-AutoMath": 250,
+    "OpenMathInstruct-1": 250,
+    "NuminaMath-CoT": 250,
+    "UltraData-Math-Conv": 250,
+    "Cosmopedia-KhanAcademy": 250,
+    "NuminaMath-TIR": 250,
+    "UltraData-Math-QA": 250,
+    "Cosmopedia-OpenStax": 250
   },
   "modality_positions": {
     "text": {
       "Tool-Calls-Multiturn": 200,
       "OpenAssistant": 450,
       "SmolTalk-OpenHermes": 250,
+      "SmolTalk-All": 250,
+      "Cosmopedia-AutoMath": 250,
+      "OpenMathInstruct-1": 250,
+      "NuminaMath-CoT": 250,
+      "UltraData-Math-Conv": 250,
+      "Cosmopedia-KhanAcademy": 250,
+      "NuminaMath-TIR": 250,
+      "UltraData-Math-QA": 250,
+      "Cosmopedia-OpenStax": 250
     },
     "image": {
       "WebSight": 386,
     "audio": {}
   },
   "modality_counts": {
+    "text": 400,
     "image": 0,
+    "video": 0,
     "audio": 0
   },
   "last_modality": null

trainer_state.json CHANGED Viewed

@@ -1,32 +1,32 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 2.7717735348048738,
-  "epoch": 6,
-  "epochs_completed": 6,
-  "global_step": 186,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps": 186,
-  "num_train_epochs": 6,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
-    "vision",
-    "video",
     "llm",
     "cross_attention",
-    "video_generation",
     "modality_markers"
   ],
   "frozen_components": [
     "audio",
     "speech",
-    "image_generation"
   ],
   "trial_name": null,
   "trial_params": null

 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
+  "best_metric": 6.564389287829399,
+  "epoch": 5,
+  "epochs_completed": 5,
+  "global_step": 250,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
+  "max_steps": 250,
+  "num_train_epochs": 5,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
     "llm",
     "cross_attention",
     "modality_markers"
   ],
   "frozen_components": [
+    "vision",
+    "video",
     "audio",
     "speech",
+    "image_generation",
+    "video_generation"
   ],
   "trial_name": null,
   "trial_params": null

training_state.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2b27f88bfa4b75b4f8d0e5a76bd83c68286060a451b136de1eeb07a923ed70a
-size 3426643671

 version https://git-lfs.github.com/spec/v1
+oid sha256:b617b2a3ab7ba08cac1c55c6a02085d0d226885f2a225d7626553579ac8029ab
+size 1514911851