Update model weights after training (epoch 1, loss 5.4568)

Browse files

Files changed (8) hide show

audio_decoder.safetensors +1 -1
cross_attention.safetensors +1 -1
generator.safetensors +1 -1
llm.safetensors +1 -1
modeling_xoron.py +10 -4
streaming_state.json +20 -104
trainer_state.json +5 -5
training_state.pt +2 -2

audio_decoder.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3226fee536a749a40aab83f5afa949808d778485026f933161c2d0a6b66c03f9
 size 1458415836

 version https://git-lfs.github.com/spec/v1
+oid sha256:e5cf2dc3522c1e490afa6006d181588c79ab8bcbc3f7cebba953c82ce31fb9ce
 size 1458415836

cross_attention.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c1dd70b1b4136042c3241058967ff7fb8423547263fe302498c3cc9f2ab00703
 size 174191400

 version https://git-lfs.github.com/spec/v1
+oid sha256:a855a5462cdb45b14eb557d70c448f69e8d7b4f48219beed964c65c20f4a78c6
 size 174191400

generator.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3c01fd06b809de1bc14f78c1fd5f2f2cac625db3b22f0b696d532e7442aee71a
 size 629440508

 version https://git-lfs.github.com/spec/v1
+oid sha256:98ed4a2b16c382e396eb0b9f421ec2a11f4a292179f4716f590e83cb011de934
 size 629440508

llm.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c84ad6f98c7c9d20394a4a356dd6f56d27ee8ada70d3a891c1e8e557df3280dd
 size 1506831304

 version https://git-lfs.github.com/spec/v1
+oid sha256:4fb4e556e9f5e85b31c0b5c926823e4dbbb13ba1a273b166f1c8fb0bec85b258
 size 1506831304

modeling_xoron.py CHANGED Viewed

@@ -5770,8 +5770,11 @@ class AudioDecoder(nn.Module):
         energy_pred = F.softplus(self.energy_predictor(x))
         # Determine output length
         if target_length is not None:
-            mel_length = target_length
         else:
             mel_length = int(durations.sum(dim=1).max().item())
             mel_length = max(16, min(mel_length, self.max_audio_length))
@@ -9634,7 +9637,8 @@ class XoronMultimodalModel(nn.Module):
     def generate_speech(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
         """Generate speech (mel-spectrogram) from text (TTS)."""
         text_embeds = self.get_text_embeddings(input_ids, attention_mask)
-        mel, durations = self.audio_decoder(text_embeds)
         return mel, durations
     @torch.no_grad()
@@ -9667,7 +9671,8 @@ class XoronMultimodalModel(nn.Module):
         # Generate intermediate features through audio decoder
         # This gives us the linguistic/prosodic representation
-        mel, durations, _ = self.audio_decoder(
             text_embeds,
             speaker_embedding=speaker_embedding,
         )
@@ -9748,7 +9753,8 @@ class XoronMultimodalModel(nn.Module):
         # 4. Speak - convert text response to audio
         if response_embeds is not None:
-            mel, durations, _ = self.audio_decoder(
                 response_embeds,
                 speaker_embedding=speaker_embedding,
             )

         energy_pred = F.softplus(self.energy_predictor(x))
         # Determine output length
+        # IMPORTANT: in training mode BatchNorm1d needs more than one value per channel,
+        # i.e. batch_size * length > 1. Enforce a minimum length of 2 so that a batch of
+        # one cannot raise "Expected more than 1 value per channel when training".
+        MIN_MEL_LENGTH = 2
         if target_length is not None:
+            mel_length = max(MIN_MEL_LENGTH, target_length)
         else:
             mel_length = int(durations.sum(dim=1).max().item())
             mel_length = max(16, min(mel_length, self.max_audio_length))
     def generate_speech(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
         """Generate speech (mel-spectrogram) from text (TTS)."""
         text_embeds = self.get_text_embeddings(input_ids, attention_mask)
+        # AudioDecoder.forward() returns 4 values: (mel, durations, alignment, extras)
+        mel, durations, _, _ = self.audio_decoder(text_embeds)
         return mel, durations
     @torch.no_grad()
         # Generate intermediate features through audio decoder
         # This gives us the linguistic/prosodic representation
+        # AudioDecoder.forward() returns 4 values: (mel, durations, alignment, extras)
+        mel, durations, _, _ = self.audio_decoder(
             text_embeds,
             speaker_embedding=speaker_embedding,
         )
         # 4. Speak - convert text response to audio
         if response_embeds is not None:
+            # AudioDecoder.forward() returns 4 values: (mel, durations, alignment, extras)
+            mel, durations, _, _ = self.audio_decoder(
                 response_embeds,
                 speaker_embedding=speaker_embedding,
             )

streaming_state.json CHANGED Viewed

@@ -1,117 +1,33 @@
 {
   "epoch": 1,
-  "unique_samples": 3260,
-  "total_yields": 6520,
   "dataset_positions": {
-    "Synth-SelfCorrection": 50,
-    "Synth-Documents": 50,
-    "Synth-ShellTimeout": 50,
-    "Jupyter-Code": 50,
-    "HumanEval-JavaScript": 50,
-    "Synth-DesktopSetup": 50,
-    "UltraChat": 50,
-    "HumanEval-Python": 50,
-    "Dolly-15k": 50,
-    "Synth-ShellExecution": 50,
-    "Midjourney-Prompts": 50,
-    "Synth-PythonScripts": 50,
-    "Synth-Issues": 50,
-    "Synth-Monitoring": 50,
-    "Synth-KnowledgeCutoff": 50,
-    "Synth-Uncertainty": 50,
-    "Swift-Code-RLVR": 50,
-    "HumanEval-CPP": 50,
-    "Synth-CoT": 50,
-    "Synth-Debugging": 50,
-    "Swift-Code-Edit": 10,
-    "SD-Prompts-2M": 50,
-    "Synth-WebserverSetup": 50,
-    "Synth-SSHSetup": 50,
-    "File-Operations-Medium": 50,
-    "Python-Code-18k": 50,
-    "Synth-RepoContext": 50,
-    "Synth-IDK": 50,
-    "WildChat": 50,
-    "Synth-FIM": 50,
-    "Synth-GroundedResponse": 50,
-    "Synth-AptInstall": 50,
-    "Golang-Coder": 50,
-    "HumanEval-Java": 50,
-    "AgentInstruct": 50,
-    "Function-Calling-ChatML": 50,
-    "Synth-Downloads": 50,
-    "Synth-MultiStepExecution": 50,
-    "Synth-RetrievalGrounded": 50,
-    "Pythonic-Function-Calling": 50,
-    "OpenOrca": 50,
-    "Synth-Citation": 50,
-    "Golang-QA-2k": 50,
-    "Synth-APIGen": 50,
-    "CodeParrot-Clean": 50,
-    "Synth-Jupyter": 50,
-    "Synth-ShellErrors": 50,
-    "NoRobots": 50,
-    "Synth-Docker": 50,
-    "Glaive-Code-Assistant": 50,
-    "Synth-Diffs": 50,
-    "ShareGPT-Clean": 50,
-    "Code-Feedback": 50,
-    "Conversation-Summarization": 50,
-    "SD-Prompts": 50,
-    "Synth-LanguageSetup": 50,
-    "Synth-FactCheck": 50,
-    "Synth-Execution": 50,
-    "HumanEval-Rust": 50,
-    "Synth-DatabaseSetup": 50,
-    "Synth-ConfidenceLevel": 50,
-    "Synth-Commits": 50,
-    "HumanEval-Go": 50,
-    "Tool-Calls-Multiturn": 50,
-    "OpenAssistant": 50,
-    "Tool-Calls-SingleTurn": 50
   },
   "modality_positions": {
-    "text": {
-      "Jupyter-Code": 50,
-      "HumanEval-JavaScript": 50,
-      "UltraChat": 50,
-      "HumanEval-Python": 50,
-      "Dolly-15k": 50,
-      "Midjourney-Prompts": 50,
-      "Swift-Code-RLVR": 50,
-      "HumanEval-CPP": 50,
-      "Swift-Code-Edit": 10,
-      "SD-Prompts-2M": 50,
-      "Python-Code-18k": 50,
-      "WildChat": 50,
-      "Golang-Coder": 50,
-      "HumanEval-Java": 50,
-      "AgentInstruct": 50,
-      "Function-Calling-ChatML": 50,
-      "Pythonic-Function-Calling": 50,
-      "OpenOrca": 50,
-      "Golang-QA-2k": 50,
-      "Synth-APIGen": 50,
-      "CodeParrot-Clean": 50,
-      "NoRobots": 50,
-      "Glaive-Code-Assistant": 50,
-      "ShareGPT-Clean": 50,
-      "Code-Feedback": 50,
-      "Conversation-Summarization": 50,
-      "SD-Prompts": 50,
-      "HumanEval-Rust": 50,
-      "HumanEval-Go": 50,
-      "Tool-Calls-Multiturn": 50,
-      "OpenAssistant": 50,
-      "Tool-Calls-SingleTurn": 50
     },
-    "image": {},
     "video": {},
     "audio": {}
   },
   "modality_counts": {
-    "text": 3260,
-    "image": 0,
     "video": 0,
     "audio": 0
   },

 {
   "epoch": 1,
+  "unique_samples": 306,
+  "total_yields": 612,
   "dataset_positions": {
+    "WebSight": 50,
+    "ScienceQA": 50,
+    "InstructPix2Pix": 50,
+    "Flickr8k": 50,
+    "NewYorker": 50,
+    "Football": 6,
+    "MagicBrush": 50
   },
   "modality_positions": {
+    "text": {},
+    "image": {
+      "WebSight": 50,
+      "ScienceQA": 50,
+      "InstructPix2Pix": 50,
+      "Flickr8k": 50,
+      "NewYorker": 50,
+      "Football": 6,
+      "MagicBrush": 50
     },
     "video": {},
     "audio": {}
   },
   "modality_counts": {
+    "text": 0,
+    "image": 306,
     "video": 0,
     "audio": 0
   },

trainer_state.json CHANGED Viewed

@@ -1,14 +1,14 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 6.629150597175206,
   "epoch": 1,
   "epochs_completed": 1,
-  "global_step": 407,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps": 407,
   "num_train_epochs": 1,
   "total_flos": 0,
   "train_batch_size": 1,
@@ -16,16 +16,16 @@
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
     "llm",
     "cross_attention",
     "modality_markers"
   ],
   "frozen_components": [
-    "vision",
     "video",
     "audio",
     "speech",
-    "image_generation",
     "video_generation"
   ],
   "trial_name": null,

 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
+  "best_metric": 5.456806772260689,
   "epoch": 1,
   "epochs_completed": 1,
+  "global_step": 38,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
+  "max_steps": 38,
   "num_train_epochs": 1,
   "total_flos": 0,
   "train_batch_size": 1,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
+    "vision",
     "llm",
     "cross_attention",
+    "image_generation",
     "modality_markers"
   ],
   "frozen_components": [
     "video",
     "audio",
     "speech",
     "video_generation"
   ],
   "trial_name": null,

training_state.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d32523d7fc44d7f0f1c884a1463149d48212161a3bfbd0f82b045cf4a1d583a2
-size 781490561

 version https://git-lfs.github.com/spec/v1
+oid sha256:701076954d95569aec679ca5649e39cbc864ff2c78b7faccafa6f2501d93a6fb
+size 1419713437