primepake committed
Commit ca7dd21 · 1 Parent: 11db1dc

debug training code

speech/cosyvoice/dataset/processor.py CHANGED
@@ -313,9 +313,23 @@ def compute_fbank(data,
         feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
         if token_mel_ratio != 0:
             # trim to align speech_token and speech_feat
-            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            # token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            # feat = feat[:token_mel_ratio * token_len]
+            # sample["speech_token"] = sample["speech_token"][:token_len]
+
+            # Convert speech_token to tensor if it's a list
+            if isinstance(sample["speech_token"], list):
+                speech_token_tensor = torch.tensor(sample["speech_token"])
+            else:
+                speech_token_tensor = sample["speech_token"]
+
+            # trim to align speech_token and speech_feat
+            token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token_tensor.shape[0]))
             feat = feat[:token_mel_ratio * token_len]
-            sample["speech_token"] = sample["speech_token"][:token_len]
+
+            # Update speech_token - keep as tensor for consistency
+            sample["speech_token"] = speech_token_tensor[:token_len]
+
         sample['speech_feat'] = feat
         yield sample
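
For context, a minimal standalone sketch of the new trim logic (shapes and token_mel_ratio are hypothetical): after the trim, the mel length equals token_mel_ratio times the token length, whether speech_token arrived as a list or a tensor.

    import torch

    token_mel_ratio = 2
    feat = torch.randn(101, 80)        # (num_frames, num_mel_bins), made-up shape
    speech_token = list(range(49))     # tokens may arrive as a plain Python list

    # mirror the new branch: normalize to a tensor, then trim both sides
    if isinstance(speech_token, list):
        speech_token = torch.tensor(speech_token)
    token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token.shape[0]))
    feat = feat[:token_mel_ratio * token_len]
    speech_token = speech_token[:token_len]

    assert feat.shape[0] == token_mel_ratio * speech_token.shape[0]  # 98 == 2 * 49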
 
speech/cosyvoice/flow/decoder.py CHANGED
@@ -421,6 +421,8 @@ class CausalConditionalDecoder(ConditionalDecoder):
         """
         t = self.time_embeddings(t).to(t.dtype)
         t = self.time_mlp(t)
+        # print('x shape', x.shape)
+        # print('mu shape:', mu.shape)

         x = pack([x, mu], "b * t")[0]

 
speech/cosyvoice/flow/flow.py CHANGED
@@ -73,6 +73,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         embedding = self.spk_embed_affine_layer(embedding)

         # concat text and prompt_text
+        print('token_len values: ', token_len)
         mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
         token = self.input_embedding(torch.clamp(token, min=0)) * mask

@@ -197,13 +198,19 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         feat_len = batch['speech_feat_len'].to(device)
         embedding = batch['embedding'].to(device)

+        # print('token: ', token.shape)
+        # print('token_len: ', token_len.shape)
+        # print('feat: ', feat.shape)
+        # print('feat_len: ', feat_len.shape)
+        # print('embedding: ', embedding.shape)
+
         # NOTE unified training, static_chunk_size > 0 or = 0
-        streaming = True if random.random() < 0.5 else False
+        streaming = False  # if random.random() < 0.5 else False

         # xvec projection
         embedding = F.normalize(embedding, dim=1)
         embedding = self.spk_embed_affine_layer(embedding)
-
+        # print('token_len values: ', token_len)
         # concat text and prompt_text
         mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
         token = self.input_embedding(torch.clamp(token, min=0)) * mask
@@ -222,6 +229,14 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         conds = conds.transpose(1, 2)

         mask = (~make_pad_mask(h_lengths.sum(dim=-1).squeeze(dim=1))).to(h)
+
+        # print('feat shape: ', feat.shape)
+        # print('mask shape: ', mask.shape)
+        # print('h shape: ', h.shape)
+        # print('embedding shape: ', embedding.shape)
+        # print('conds shape: ', conds.shape)
+        # print('streaming: ', streaming)
+
         loss, _ = self.decoder.compute_loss(
             feat.transpose(1, 2).contiguous(),
             mask.unsqueeze(1),
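
Note on the streaming change above: unified training normally samples streaming mode per batch with probability 0.5; this commit pins it to False while debugging. If the sampling is restored, the idiomatic form drops the redundant conditional:

    streaming = random.random() < 0.5  # same as: True if random.random() < 0.5 else False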
speech/cosyvoice/flow/flow_matching.py CHANGED
@@ -187,7 +187,13 @@ class ConditionalCFM(BASECFM):
         mu = mu * cfg_mask.view(-1, 1, 1)
         spks = spks * cfg_mask.view(-1, 1)
         cond = cond * cfg_mask.view(-1, 1, 1)
-
+        # print('y shape: ', y.shape)
+        # print('mask shape: ', mask.shape)
+        # print('mu shape: ', mu.shape)
+        # print('t shape: ', t.shape)
+        # print('spks shape: ', spks.shape)
+        # print('cond shape: ', cond.shape)
+        # print('streaming: ', streaming)
         pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
         loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
         return loss, y

speech/cosyvoice/transformer/upsample_encoder.py CHANGED
@@ -272,6 +272,14 @@ class UpsampleConformerEncoder(torch.nn.Module):
         checkpointing API because `__call__` attaches all the hooks of the module.
         https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
         """
+
+        # print('xs shape: ', xs.shape)
+        # print('xs_lens shape: ', xs_lens.shape)
+        # print('context shape: ', context.shape)
+        # print('decoding_chunk_size: ', decoding_chunk_size)
+        # print('num_decoding_left_chunks: ', num_decoding_left_chunks)
+        # print('streaming: ', streaming)
+
         T = xs.size(1)
         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
         if self.global_cmvn is not None:
@@ -303,6 +311,8 @@ class UpsampleConformerEncoder(torch.nn.Module):
         # Here we assume the mask is not changed in encoder layers, so just
         # return the masks before encoder layers, and the masks will be used
         # for cross attention with decoder later
+        # print('output xs shape: ', xs.shape)
+        # print('output masks shape: ', masks.shape)
         return xs, masks

     def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
speech/cosyvoice/utils/executor.py CHANGED
@@ -48,11 +48,9 @@ class Executor:
         optimizer,
         scheduler,
         train_data_loader,
-        cv_data_loader,
         experiment,
         info_dict,
         scaler,
-        group_join,
         model_type
     ):
         """Train one epoch"""
@@ -68,58 +66,51 @@
         model.train()
         if self.ref_model is not None:
             self.ref_model.eval()
-        model_context = (
-            model.join if info_dict["train_engine"] == "torch_ddp" else nullcontext
-        )
-        with model_context():
-            for batch_idx, batch_dict in enumerate(train_data_loader):
-                info_dict["tag"] = "TRAIN"
-                info_dict["step"] = self.step
-                info_dict["epoch"] = self.epoch
-                info_dict["batch_idx"] = batch_idx
-                if cosyvoice_join(group_join, info_dict):
-                    break
-
-                if (
-                    info_dict["train_engine"] == "torch_ddp"
-                    and (batch_idx + 1) % info_dict["accum_grad"] != 0
-                ):
-                    context = model.no_sync
-                else:
-                    context = nullcontext
-
-                with context():
-                    info_dict = batch_forward(
-                        model,
-                        batch_dict,
-                        scaler,
-                        info_dict,
-                        ref_model=self.ref_model,
-                        dpo_loss=self.dpo_loss,
-                    )
-                    info_dict = batch_backward(model, scaler, info_dict)
-
-                info_dict = update_parameter_and_lr(
-                    model, optimizer, scheduler, scaler, info_dict, model_type=model_type
-                )
-                log_per_step(experiment, info_dict)
-
-                if (
-                    info_dict["save_per_step"] > 0
-                    and (self.step + 1) % info_dict["save_per_step"] == 0
-                    and (batch_idx + 1) % info_dict["accum_grad"] == 0
-                ):
-                    dist.barrier()
-                    self.cv(
-                        model, cv_data_loader, experiment, info_dict, on_batch_end=False
-                    )
-                    model.train()
-                if (batch_idx + 1) % info_dict["accum_grad"] == 0:
-                    self.step += 1
+
+        use_ddp = info_dict["train_engine"] == "torch_ddp"
+
+        for batch_idx, batch_dict in enumerate(train_data_loader):
+            info_dict["tag"] = "TRAIN"
+            info_dict["step"] = self.step
+            info_dict["epoch"] = self.epoch
+            info_dict["batch_idx"] = batch_idx
+
+            if use_ddp and (batch_idx + 1) % info_dict["accum_grad"] != 0:
+                context = model.no_sync
+            else:
+                context = nullcontext
+            with context():
+                info_dict = batch_forward(
+                    model,
+                    batch_dict,
+                    scaler,
+                    info_dict,
+                    ref_model=self.ref_model,
+                    dpo_loss=self.dpo_loss,
+                )
+                info_dict = batch_backward(model, scaler, info_dict)
+
+            info_dict = update_parameter_and_lr(
+                model, optimizer, scheduler, scaler, info_dict, model_type=model_type
+            )
+            log_per_step(experiment, info_dict)
+
+            if (
+                info_dict.get("save_per_step", -1) > 0
+                and (self.step + 1) % info_dict["save_per_step"] == 0
+                and (batch_idx + 1) % info_dict["accum_grad"] == 0
+            ):
+                if dist.is_initialized():
+                    dist.barrier()
+                model_name = f"epoch_{self.epoch}_step_{self.step + 1}"
+                save_model(model, model_name, info_dict)
+                model.train()
+
+            if (batch_idx + 1) % info_dict["accum_grad"] == 0:
+                self.step += 1
         dist.barrier()
-        #self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)

     @torch.inference_mode()
     def cv(self, model, cv_data_loader, experiment, info_dict, on_batch_end=True):
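
The rewritten loop drops model.join/cosyvoice_join and keeps only the no_sync gradient-accumulation pattern. A self-contained single-process sketch of that pattern (dummy model and data; under DDP, use_ddp would be True and the no_sync branch would skip the gradient all-reduce on non-update steps):

    import contextlib
    import torch

    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loader = [torch.randn(8, 4) for _ in range(6)]
    accum_grad, use_ddp = 2, False  # use_ddp=True only for a DistributedDataParallel wrapper

    for batch_idx, batch in enumerate(loader):
        update_now = (batch_idx + 1) % accum_grad == 0
        # skip gradient synchronization on batches that do not step the optimizer
        context = model.no_sync if use_ddp and not update_now else contextlib.nullcontext
        with context():
            loss = model(batch).pow(2).mean() / accum_grad
            loss.backward()
        if update_now:
            optimizer.step()
            optimizer.zero_grad()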
speech/cosyvoice/utils/train_utils.py CHANGED
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import logging
 import os
 import torch
 import json
@@ -33,7 +32,49 @@ from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live

 from cosyvoice.dataset.dataset import Dataset

-from torch.optim.lr_scheduler import LinearLR, ConstantLR, SequentialLR
+from torch.optim.lr_scheduler import LinearLR, ConstantLR, SequentialLR, _LRScheduler
+
+from loguru import logger
+
+class ResumableSequentialLR(_LRScheduler):
+    """A resumable version of SequentialLR that supports set_step"""
+    def __init__(self, optimizer, schedulers, milestones, last_epoch=-1):
+        self.schedulers = schedulers
+        self.milestones = milestones
+        self._last_lr = [group['lr'] for group in optimizer.param_groups]
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self):
+        # Find which scheduler to use based on last_epoch
+        idx = 0
+        for i, milestone in enumerate(self.milestones):
+            if self.last_epoch >= milestone:
+                idx = i + 1
+
+        if idx >= len(self.schedulers):
+            idx = len(self.schedulers) - 1
+
+        # Get lr from the appropriate scheduler
+        scheduler = self.schedulers[idx]
+        if hasattr(scheduler, '_get_closed_form_lr'):
+            return scheduler._get_closed_form_lr()
+        else:
+            return scheduler.get_lr()
+
+    def step(self, epoch=None):
+        if epoch is None:
+            self.last_epoch += 1
+        else:
+            self.last_epoch = epoch
+
+        # Update learning rates
+        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
+            param_group['lr'] = lr
+        self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
+
+    def set_step(self, step):
+        """Set the current step for resuming training"""
+        self.last_epoch = step - 1  # -1 because step() will increment it

 def init_distributed(args):
     world_size = int(os.environ.get('WORLD_SIZE', 1))
@@ -100,7 +141,7 @@ def wrap_cuda_model(args, model):
         model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
     else:
         if int(os.environ.get('RANK', 0)) == 0:
-            logging.info("Estimating model states memory needs (zero2)...")
+            logger.info("Estimating model states memory needs (zero2)...")
         estimate_zero2_model_states_mem_needs_all_live(
             model,
             num_gpus_per_node=local_world_size,
@@ -132,12 +173,13 @@ def init_optimizer_and_scheduler(configs, model):
     )

     # Combine schedulers: warmup for 5k steps, then constant
-    scheduler = SequentialLR(
+    scheduler = ResumableSequentialLR(
         optimizer,
         schedulers=[warmup_scheduler, constant_scheduler],
-        milestones=[5000]  # Switch after 5k steps
+        milestones=[5000]
     )

+
     return model, optimizer, scheduler
@@ -146,7 +188,9 @@ def save_model(model, model_name, info_dict):
     """Save model"""
     rank = int(os.environ.get('RANK', 0))
     model_dir = info_dict["model_dir"]
+    # os.makedirs(model_dir, exist_ok=True)
     save_model_path = os.path.join(model_dir, '{}.pt'.format(model_name))
+

     if info_dict["train_engine"] == "torch_ddp":
         if rank == 0:
@@ -162,7 +206,7 @@ def save_model(model, model_name, info_dict):
     with open(info_path, 'w') as fout:
         data = yaml.dump(info_dict)
         fout.write(data)
-    logging.info('[Rank {}] Checkpoint: save to checkpoint {}'.format(rank, save_model_path))
+    logger.info('[Rank {}] Checkpoint: save to checkpoint {}'.format(rank, save_model_path))
@@ -178,7 +222,7 @@ def cosyvoice_join(group_join, info_dict):
                                 timeout=group_join.options._timeout)
         return False
     except RuntimeError as e:
-        logging.info("Detected uneven workload distribution: {}\n".format(e) +
+        logger.info("Detected uneven workload distribution: {}\n".format(e) +
                      "Break current worker to manually join all workers, " +
                      "world_size {}, current rank {}, current local_rank {}\n".
                      format(world_size, rank, local_rank))
@@ -326,14 +370,14 @@ def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict, model_type
         if torch.isfinite(grad_norm):
             scaler.step(optimizer)
         else:
-            logging.warning('get infinite grad_norm, check your code/data if it appears frequently')
+            logger.warning('get infinite grad_norm, check your code/data if it appears frequently')
         scaler.update()
     else:
         grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
         if torch.isfinite(grad_norm):
             optimizer.step()
         else:
-            logging.warning('get infinite grad_norm, check your code/data if it appears frequently')
+            logger.warning('get infinite grad_norm, check your code/data if it appears frequently')
     optimizer.zero_grad()
     scheduler.step()
     info_dict["lr"] = optimizer.param_groups[0]['lr']
@@ -376,7 +420,7 @@ def log_per_step(experiment, info_dict):
     if tag == "TRAIN":
         log_str += f'lr {info_dict["lr"]:.8f} grad_norm {info_dict["grad_norm"]:.6f}'
     log_str += f' rank {rank}'
-    logging.debug(log_str)
+    logger.info(log_str)

 def log_per_save(experiment, info_dict):
     """Log per save using Comet ML"""
@@ -387,7 +431,7 @@ def log_per_save(experiment, info_dict):
     lr = info_dict['lr']
     rank = int(os.environ.get('RANK', 0))

-    # Create loss string for logging
+    # Create loss string for logger
     loss_str = ' '.join([f"{k} {v.item() if isinstance(v, torch.Tensor) else v}" for k, v in loss_dict.items()])
     logger.info(f'Epoch {epoch} Step {step + 1} CV info lr {lr} {rank} {loss_str}')
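
A minimal sketch of how the new scheduler is wired up and resumed, mirroring init_optimizer_and_scheduler above and the scheduler.set_step(start_step) call in speech/train.py (optimizer choice and step counts are illustrative):

    import torch
    from torch.optim.lr_scheduler import LinearLR, ConstantLR
    from cosyvoice.utils.train_utils import ResumableSequentialLR

    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    warmup = LinearLR(optimizer, start_factor=0.01, total_iters=5000)  # warm up for 5k steps
    constant = ConstantLR(optimizer, factor=1.0, total_iters=1)        # then hold the base lr
    scheduler = ResumableSequentialLR(
        optimizer, schedulers=[warmup, constant], milestones=[5000]
    )

    # unlike torch's stock SequentialLR, resuming from a checkpoint is one call:
    start_step = 7500                # e.g. parsed from the checkpoint name
    scheduler.set_step(start_step)   # the next scheduler.step() lands on step 7500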
 
speech/train.py CHANGED
@@ -22,11 +22,11 @@ from copy import deepcopy
 import deepspeed
 import torch
 import torch.distributed as dist
+from comet_ml import Experiment
 from hyperpyyaml import load_hyperpyyaml
 from loguru import logger
 from torch.distributed.elastic.multiprocessing.errors import record

-from comet_ml import Experiment
 from cosyvoice.utils.executor import Executor
 from cosyvoice.utils.losses import DPOLoss
 from cosyvoice.utils.train_utils import (check_modify_and_save_config,
@@ -35,6 +35,8 @@ from cosyvoice.utils.train_utils import (check_modify_and_save_config,
                                          save_model)

 os.environ["COMET_LOGGING_CONSOLE"] = "ERROR"  # Only show errors
+
+
 def get_args():
     parser = argparse.ArgumentParser(description="training your network")
     parser.add_argument(
@@ -107,14 +109,8 @@ def get_args():
         default=False,
         help="Disable comet ml experiment",
     )
-    parser.add_argument(
-        "--comet_project",
-        default="speech"
-    )
-    parser.add_argument(
-        "--comet_experiment_name",
-        default="test"
-    )
+    parser.add_argument("--comet_project", default="speech")
+    parser.add_argument("--comet_experiment_name", default="test")
     parser = deepspeed.add_config_arguments(parser)
     args = parser.parse_args()
     return args
@@ -122,8 +118,8 @@

 def init_comet_experiment(args, configs):
     """Initialize Comet ML experiment"""
-    rank = int(os.environ.get('RANK', 0))
+    rank = int(os.environ.get("RANK", 0))

     # Only create experiment on rank 0 to avoid duplicates
     if rank == 0 and not args.comet_disabled:
         # Set up Comet ML experiment
@@ -131,7 +127,7 @@ def init_comet_experiment(args, configs):
             project_name=args.comet_project,
             experiment_name=args.comet_experiment_name,
         )
-
+
         # Log hyperparameters
         experiment.log_parameters(configs["train_conf"])
         experiment.log_parameter("model_type", args.model)
@@ -141,24 +137,29 @@ def init_comet_experiment(args, configs):
         experiment.log_parameter("dpo", args.dpo)
         experiment.log_parameter("num_workers", args.num_workers)
         experiment.log_parameter("prefetch", args.prefetch)
-
+
         # Log model architecture if available
         if args.model in configs:
-            model_config = configs[args.model].__dict__ if hasattr(configs[args.model], '__dict__') else {}
+            model_config = (
+                configs[args.model].__dict__
+                if hasattr(configs[args.model], "__dict__")
+                else {}
+            )
             experiment.log_parameters(model_config, prefix=f"{args.model}/")
-
+
         # Add tags
         experiment.add_tag(args.model)
         if args.dpo:
             experiment.add_tag("dpo")
         if args.use_amp:
             experiment.add_tag("amp")
-
+
         logger.info(f"Comet ML experiment initialized: {experiment.get_name()}")
         return experiment
     else:
         return None

+
 @record
 def main():
     args = get_args()
@@ -182,12 +183,14 @@ def main():

     configs["train_conf"].update(vars(args))

-    world_size = int(os.environ.get('WORLD_SIZE', 1))
-    local_rank = int(os.environ.get('LOCAL_RANK', 0))
-    rank = int(os.environ.get('RANK', 0))
-    logger.info(f'training on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}')
+    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    rank = int(os.environ.get("RANK", 0))
+    logger.info(
+        f"training on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}"
+    )
     torch.cuda.set_device(local_rank)
-    dist.init_process_group(args.dist_backend)
+    dist.init_process_group("nccl")

     # Get dataset & dataloader
     train_dataset, _, train_data_loader, cv_data_loader = init_dataset_and_dataloader(
@@ -200,7 +203,6 @@ def main():
     # Tensorboard summary
     experiment = init_comet_experiment(args, configs)

-
     # load checkpoint
     if args.dpo is True:
         configs[args.model].forward = configs[args.model].forward_dpo
@@ -230,9 +232,7 @@ def main():
     )

     # Get optimizer & scheduler
-    model, optimizer, scheduler = (
-        init_optimizer_and_scheduler(configs, model)
-    )
+    model, optimizer, scheduler = init_optimizer_and_scheduler(configs, model)
     scheduler.set_step(start_step)

     # Save init checkpoints
@@ -246,7 +246,7 @@ def main():
         experiment.log_model(
             name=f"{args.model}_init",
             file_or_folder=os.path.join(args.model_dir, "init.pt"),
-            metadata=info_dict
+            metadata=info_dict,
         )

     # DPO related
@@ -279,26 +279,23 @@ def main():
     for epoch in range(start_epoch + 1, info_dict["max_epoch"]):
         executor.epoch = epoch
         train_dataset.set_epoch(epoch)
-        dist.barrier()
-        group_join = dist.new_group(
-            backend="nccl", timeout=datetime.timedelta(seconds=args.timeout)
-        )
-
         executor.train_one_epoc(
             model,
             optimizer,
             scheduler,
             train_data_loader,
-            cv_data_loader,
             experiment,
             info_dict,
             scaler,
-            group_join,
-            model_type=args.model
+            model_type=args.model,
         )
-        dist.destroy_process_group(group_join)
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
     if experiment:
         experiment.end()

+
 if __name__ == "__main__":
     main()