Backup-bdg committed on
Commit
a942d2b
·
verified ·
1 Parent(s): 2068eeb

Update model weights after training (epoch 1, loss 5.4568)

Browse files
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3226fee536a749a40aab83f5afa949808d778485026f933161c2d0a6b66c03f9
3
  size 1458415836
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5cf2dc3522c1e490afa6006d181588c79ab8bcbc3f7cebba953c82ce31fb9ce
3
  size 1458415836
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1dd70b1b4136042c3241058967ff7fb8423547263fe302498c3cc9f2ab00703
3
  size 174191400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a855a5462cdb45b14eb557d70c448f69e8d7b4f48219beed964c65c20f4a78c6
3
  size 174191400
generator.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c01fd06b809de1bc14f78c1fd5f2f2cac625db3b22f0b696d532e7442aee71a
3
  size 629440508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ed4a2b16c382e396eb0b9f421ec2a11f4a292179f4716f590e83cb011de934
3
  size 629440508
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c84ad6f98c7c9d20394a4a356dd6f56d27ee8ada70d3a891c1e8e557df3280dd
3
  size 1506831304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fb4e556e9f5e85b31c0b5c926823e4dbbb13ba1a273b166f1c8fb0bec85b258
3
  size 1506831304
modeling_xoron.py CHANGED
@@ -5770,8 +5770,11 @@ class AudioDecoder(nn.Module):
5770
  energy_pred = F.softplus(self.energy_predictor(x))
5771
 
5772
  # Determine output length
 
 
 
5773
  if target_length is not None:
5774
- mel_length = target_length
5775
  else:
5776
  mel_length = int(durations.sum(dim=1).max().item())
5777
  mel_length = max(16, min(mel_length, self.max_audio_length))
@@ -9634,7 +9637,8 @@ class XoronMultimodalModel(nn.Module):
9634
  def generate_speech(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
9635
  """Generate speech (mel-spectrogram) from text (TTS)."""
9636
  text_embeds = self.get_text_embeddings(input_ids, attention_mask)
9637
- mel, durations = self.audio_decoder(text_embeds)
 
9638
  return mel, durations
9639
 
9640
  @torch.no_grad()
@@ -9667,7 +9671,8 @@ class XoronMultimodalModel(nn.Module):
9667
 
9668
  # Generate intermediate features through audio decoder
9669
  # This gives us the linguistic/prosodic representation
9670
- mel, durations, _ = self.audio_decoder(
 
9671
  text_embeds,
9672
  speaker_embedding=speaker_embedding,
9673
  )
@@ -9748,7 +9753,8 @@ class XoronMultimodalModel(nn.Module):
9748
 
9749
  # 4. Speak - convert text response to audio
9750
  if response_embeds is not None:
9751
- mel, durations, _ = self.audio_decoder(
 
9752
  response_embeds,
9753
  speaker_embedding=speaker_embedding,
9754
  )
 
5770
  energy_pred = F.softplus(self.energy_predictor(x))
5771
 
5772
  # Determine output length
5773
+ # IMPORTANT: in training mode BatchNorm1d needs more than one value per channel (batch * length > 1)
5774
+ # Enforce minimum length of 2 to avoid "Expected more than 1 value per channel" error
5775
+ MIN_MEL_LENGTH = 2
5776
  if target_length is not None:
5777
+ mel_length = max(MIN_MEL_LENGTH, target_length)
5778
  else:
5779
  mel_length = int(durations.sum(dim=1).max().item())
5780
  mel_length = max(16, min(mel_length, self.max_audio_length))
 
9637
  def generate_speech(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
9638
  """Generate speech (mel-spectrogram) from text (TTS)."""
9639
  text_embeds = self.get_text_embeddings(input_ids, attention_mask)
9640
+ # AudioDecoder.forward() returns 4 values: (mel, durations, alignment, extras)
9641
+ mel, durations, _, _ = self.audio_decoder(text_embeds)
9642
  return mel, durations
9643
 
9644
  @torch.no_grad()
 
9671
 
9672
  # Generate intermediate features through audio decoder
9673
  # This gives us the linguistic/prosodic representation
9674
+ # AudioDecoder.forward() returns 4 values: (mel, durations, alignment, extras)
9675
+ mel, durations, _, _ = self.audio_decoder(
9676
  text_embeds,
9677
  speaker_embedding=speaker_embedding,
9678
  )
 
9753
 
9754
  # 4. Speak - convert text response to audio
9755
  if response_embeds is not None:
9756
+ # AudioDecoder.forward() returns 4 values: (mel, durations, alignment, extras)
9757
+ mel, durations, _, _ = self.audio_decoder(
9758
  response_embeds,
9759
  speaker_embedding=speaker_embedding,
9760
  )
streaming_state.json CHANGED
@@ -1,117 +1,33 @@
1
  {
2
  "epoch": 1,
3
- "unique_samples": 3260,
4
- "total_yields": 6520,
5
  "dataset_positions": {
6
- "Synth-SelfCorrection": 50,
7
- "Synth-Documents": 50,
8
- "Synth-ShellTimeout": 50,
9
- "Jupyter-Code": 50,
10
- "HumanEval-JavaScript": 50,
11
- "Synth-DesktopSetup": 50,
12
- "UltraChat": 50,
13
- "HumanEval-Python": 50,
14
- "Dolly-15k": 50,
15
- "Synth-ShellExecution": 50,
16
- "Midjourney-Prompts": 50,
17
- "Synth-PythonScripts": 50,
18
- "Synth-Issues": 50,
19
- "Synth-Monitoring": 50,
20
- "Synth-KnowledgeCutoff": 50,
21
- "Synth-Uncertainty": 50,
22
- "Swift-Code-RLVR": 50,
23
- "HumanEval-CPP": 50,
24
- "Synth-CoT": 50,
25
- "Synth-Debugging": 50,
26
- "Swift-Code-Edit": 10,
27
- "SD-Prompts-2M": 50,
28
- "Synth-WebserverSetup": 50,
29
- "Synth-SSHSetup": 50,
30
- "File-Operations-Medium": 50,
31
- "Python-Code-18k": 50,
32
- "Synth-RepoContext": 50,
33
- "Synth-IDK": 50,
34
- "WildChat": 50,
35
- "Synth-FIM": 50,
36
- "Synth-GroundedResponse": 50,
37
- "Synth-AptInstall": 50,
38
- "Golang-Coder": 50,
39
- "HumanEval-Java": 50,
40
- "AgentInstruct": 50,
41
- "Function-Calling-ChatML": 50,
42
- "Synth-Downloads": 50,
43
- "Synth-MultiStepExecution": 50,
44
- "Synth-RetrievalGrounded": 50,
45
- "Pythonic-Function-Calling": 50,
46
- "OpenOrca": 50,
47
- "Synth-Citation": 50,
48
- "Golang-QA-2k": 50,
49
- "Synth-APIGen": 50,
50
- "CodeParrot-Clean": 50,
51
- "Synth-Jupyter": 50,
52
- "Synth-ShellErrors": 50,
53
- "NoRobots": 50,
54
- "Synth-Docker": 50,
55
- "Glaive-Code-Assistant": 50,
56
- "Synth-Diffs": 50,
57
- "ShareGPT-Clean": 50,
58
- "Code-Feedback": 50,
59
- "Conversation-Summarization": 50,
60
- "SD-Prompts": 50,
61
- "Synth-LanguageSetup": 50,
62
- "Synth-FactCheck": 50,
63
- "Synth-Execution": 50,
64
- "HumanEval-Rust": 50,
65
- "Synth-DatabaseSetup": 50,
66
- "Synth-ConfidenceLevel": 50,
67
- "Synth-Commits": 50,
68
- "HumanEval-Go": 50,
69
- "Tool-Calls-Multiturn": 50,
70
- "OpenAssistant": 50,
71
- "Tool-Calls-SingleTurn": 50
72
  },
73
  "modality_positions": {
74
- "text": {
75
- "Jupyter-Code": 50,
76
- "HumanEval-JavaScript": 50,
77
- "UltraChat": 50,
78
- "HumanEval-Python": 50,
79
- "Dolly-15k": 50,
80
- "Midjourney-Prompts": 50,
81
- "Swift-Code-RLVR": 50,
82
- "HumanEval-CPP": 50,
83
- "Swift-Code-Edit": 10,
84
- "SD-Prompts-2M": 50,
85
- "Python-Code-18k": 50,
86
- "WildChat": 50,
87
- "Golang-Coder": 50,
88
- "HumanEval-Java": 50,
89
- "AgentInstruct": 50,
90
- "Function-Calling-ChatML": 50,
91
- "Pythonic-Function-Calling": 50,
92
- "OpenOrca": 50,
93
- "Golang-QA-2k": 50,
94
- "Synth-APIGen": 50,
95
- "CodeParrot-Clean": 50,
96
- "NoRobots": 50,
97
- "Glaive-Code-Assistant": 50,
98
- "ShareGPT-Clean": 50,
99
- "Code-Feedback": 50,
100
- "Conversation-Summarization": 50,
101
- "SD-Prompts": 50,
102
- "HumanEval-Rust": 50,
103
- "HumanEval-Go": 50,
104
- "Tool-Calls-Multiturn": 50,
105
- "OpenAssistant": 50,
106
- "Tool-Calls-SingleTurn": 50
107
  },
108
- "image": {},
109
  "video": {},
110
  "audio": {}
111
  },
112
  "modality_counts": {
113
- "text": 3260,
114
- "image": 0,
115
  "video": 0,
116
  "audio": 0
117
  },
 
1
  {
2
  "epoch": 1,
3
+ "unique_samples": 306,
4
+ "total_yields": 612,
5
  "dataset_positions": {
6
+ "WebSight": 50,
7
+ "ScienceQA": 50,
8
+ "InstructPix2Pix": 50,
9
+ "Flickr8k": 50,
10
+ "NewYorker": 50,
11
+ "Football": 6,
12
+ "MagicBrush": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  },
14
  "modality_positions": {
15
+ "text": {},
16
+ "image": {
17
+ "WebSight": 50,
18
+ "ScienceQA": 50,
19
+ "InstructPix2Pix": 50,
20
+ "Flickr8k": 50,
21
+ "NewYorker": 50,
22
+ "Football": 6,
23
+ "MagicBrush": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  },
 
25
  "video": {},
26
  "audio": {}
27
  },
28
  "modality_counts": {
29
+ "text": 0,
30
+ "image": 306,
31
  "video": 0,
32
  "audio": 0
33
  },
trainer_state.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
- "best_metric": 6.629150597175206,
4
  "epoch": 1,
5
  "epochs_completed": 1,
6
- "global_step": 407,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
- "max_steps": 407,
12
  "num_train_epochs": 1,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
@@ -16,16 +16,16 @@
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
 
19
  "llm",
20
  "cross_attention",
 
21
  "modality_markers"
22
  ],
23
  "frozen_components": [
24
- "vision",
25
  "video",
26
  "audio",
27
  "speech",
28
- "image_generation",
29
  "video_generation"
30
  ],
31
  "trial_name": null,
 
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
+ "best_metric": 5.456806772260689,
4
  "epoch": 1,
5
  "epochs_completed": 1,
6
+ "global_step": 38,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
+ "max_steps": 38,
12
  "num_train_epochs": 1,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
 
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
19
+ "vision",
20
  "llm",
21
  "cross_attention",
22
+ "image_generation",
23
  "modality_markers"
24
  ],
25
  "frozen_components": [
 
26
  "video",
27
  "audio",
28
  "speech",
 
29
  "video_generation"
30
  ],
31
  "trial_name": null,
training_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d32523d7fc44d7f0f1c884a1463149d48212161a3bfbd0f82b045cf4a1d583a2
3
- size 781490561
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:701076954d95569aec679ca5649e39cbc864ff2c78b7faccafa6f2501d93a6fb
3
+ size 1419713437