diff --git a/speech/cosyvoice/utils/executor.py b/speech/cosyvoice/utils/executor.py index c12a0c7b96cd0a4a0a5712578dc298e6b2ecceff..b4e9e01aba0118e9cdd1f5b0a71a450a10e30bc4 100644 --- a/speech/cosyvoice/utils/executor.py +++ b/speech/cosyvoice/utils/executor.py @@ -49,10 +49,11 @@ class Executor: scheduler, train_data_loader, cv_data_loader, - writer, + experiment, info_dict, scaler, group_join, + model_type ): """Train one epoch""" @@ -101,10 +102,10 @@ class Executor: info_dict = batch_backward(model, scaler, info_dict) info_dict = update_parameter_and_lr( - model, optimizer, scheduler, scaler, info_dict + model, optimizer, scheduler, scaler, info_dict, model_type=model_type ) - log_per_step(writer, info_dict) - # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save + log_per_step(experiment, info_dict) + if ( info_dict["save_per_step"] > 0 and (self.step + 1) % info_dict["save_per_step"] == 0 @@ -112,102 +113,16 @@ class Executor: ): dist.barrier() self.cv( - model, cv_data_loader, writer, info_dict, on_batch_end=False + model, cv_data_loader, experiment, info_dict, on_batch_end=False ) model.train() if (batch_idx + 1) % info_dict["accum_grad"] == 0: self.step += 1 dist.barrier() - self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) - - def train_one_epoc_gan( - self, - model, - optimizer, - scheduler, - optimizer_d, - scheduler_d, - train_data_loader, - cv_data_loader, - writer, - info_dict, - scaler, - group_join, - ): - """Train one epoch""" - - lr = optimizer.param_groups[0]["lr"] - logger.info( - f"Epoch {self.epoch} TRAIN info lr {lr} rank {self.rank}" - ) - logger.info( - f"using accumulate grad, new batch size is {info_dict['accum_grad']} times larger than before" - ) - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. 
- model.train() - model_context = ( - model.join if info_dict["train_engine"] == "torch_ddp" else nullcontext - ) - with model_context(): - for batch_idx, batch_dict in enumerate(train_data_loader): - info_dict["tag"] = "TRAIN" - info_dict["step"] = self.step - info_dict["epoch"] = self.epoch - info_dict["batch_idx"] = batch_idx - if cosyvoice_join(group_join, info_dict): - break - - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if ( - info_dict["train_engine"] == "torch_ddp" - and (batch_idx + 1) % info_dict["accum_grad"] != 0 - ): - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - - with context(): - batch_dict["turn"] = "discriminator" - info_dict = batch_forward(model, batch_dict, scaler, info_dict) - info_dict = batch_backward(model, scaler, info_dict) - info_dict = update_parameter_and_lr( - model, optimizer_d, scheduler_d, scaler, info_dict - ) - optimizer.zero_grad() - log_per_step(writer, info_dict) - with context(): - batch_dict["turn"] = "generator" - info_dict = batch_forward(model, batch_dict, scaler, info_dict) - info_dict = batch_backward(model, scaler, info_dict) - info_dict = update_parameter_and_lr( - model, optimizer, scheduler, scaler, info_dict - ) - optimizer_d.zero_grad() - log_per_step(writer, info_dict) - # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save - if ( - info_dict["save_per_step"] > 0 - and (self.step + 1) % info_dict["save_per_step"] == 0 - and (batch_idx + 1) % info_dict["accum_grad"] == 0 - ): - dist.barrier() - self.cv( - model, cv_data_loader, writer, info_dict, on_batch_end=False - ) - model.train() - if (batch_idx + 1) % info_dict["accum_grad"] == 0: - self.step += 1 - dist.barrier() - # self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) + #self.cv(model, 
cv_data_loader, writer, info_dict, on_batch_end=True) @torch.inference_mode() - def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True): + def cv(self, model, cv_data_loader, experiment, info_dict, on_batch_end=True): """Cross validation on""" logger.info(f"Epoch {self.epoch} Step {self.step + 1} on_batch_end {on_batch_end} CV rank {self.rank}") model.eval() @@ -233,7 +148,7 @@ class Executor: for k, v in total_loss_dict.items(): total_loss_dict[k] = sum(v) / total_num_utts info_dict["loss_dict"] = total_loss_dict - log_per_save(writer, info_dict) + log_per_save(experiment, info_dict) model_name = ( f"epoch_{self.epoch}_whole" if on_batch_end diff --git a/speech/cosyvoice/utils/train_utils.py b/speech/cosyvoice/utils/train_utils.py index 885d85109ba2d6f14848e06b91ca0c07b7abb605..4ba27e3a8ee25da173850fef7ffb978c8d6ba97f 100644 --- a/speech/cosyvoice/utils/train_utils.py +++ b/speech/cosyvoice/utils/train_utils.py @@ -26,15 +26,14 @@ import deepspeed import torch.optim as optim import torch.distributed as dist -from torch.utils.tensorboard import SummaryWriter from torch.utils.data import DataLoader from torch.nn.utils import clip_grad_norm_ from loguru import logger from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live from cosyvoice.dataset.dataset import Dataset -from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR +from torch.optim.lr_scheduler import LinearLR, ConstantLR, SequentialLR def init_distributed(args): world_size = int(os.environ.get('WORLD_SIZE', 1)) @@ -49,10 +48,10 @@ def init_distributed(args): return world_size, local_rank, rank -def init_dataset_and_dataloader(args, configs, gan, dpo): - data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline'] - train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=True, partition=True) - cv_dataset = Dataset(args.cv_data, 
data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=False, partition=False) +def init_dataset_and_dataloader(args, configs, dpo): + data_pipeline = configs['data_pipeline'] + train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=dpo, shuffle=True, partition=True) + cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=dpo, shuffle=False, partition=False) # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts train_data_loader = DataLoader(train_dataset, @@ -109,90 +108,38 @@ def wrap_cuda_model(args, model): return model -def init_optimizer_and_scheduler(args, configs, model, gan): +def init_optimizer_and_scheduler(configs, model): """Init optimizer and scheduler""" - if gan is False: - if configs['train_conf']['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf']) - elif configs['train_conf']['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['train_conf']) - - if configs['train_conf']['scheduler'] == 'warmuplr': - scheduler_type = WarmupLR - scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing': - scheduler_type = NoamHoldAnnealing - scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'constantlr': - scheduler_type = ConstantLR - scheduler = ConstantLR(optimizer) - else: - raise ValueError("unknown scheduler: " + configs['train_conf']) - - # use deepspeed optimizer for speedup - if args.train_engine == "deepspeed": - def scheduler(opt): - return scheduler_type(opt, **configs['train_conf']['scheduler_conf']) - model, optimizer, _, scheduler = deepspeed.initialize( - args=args, - 
model=model, - optimizer=None, - lr_scheduler=scheduler, - model_parameters=model.parameters()) - - optimizer_d, scheduler_d = None, None - + if configs['train_conf']['optim'] == 'adam': + optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf']) + elif configs['train_conf']['optim'] == 'adamw': + optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf']) else: - # currently we wrap generator and discriminator in one model, so we cannot use deepspeed - if configs['train_conf']['optim'] == 'adam': - optimizer = optim.Adam(model.module.generator.parameters(), **configs['train_conf']['optim_conf']) - elif configs['train_conf']['optim'] == 'adamw': - optimizer = optim.AdamW(model.module.generator.parameters(), **configs['train_conf']['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['train_conf']) - - if configs['train_conf']['scheduler'] == 'warmuplr': - scheduler_type = WarmupLR - scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing': - scheduler_type = NoamHoldAnnealing - scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'constantlr': - scheduler_type = ConstantLR - scheduler = ConstantLR(optimizer) - else: - raise ValueError("unknown scheduler: " + configs['train_conf']) + raise ValueError("unknown optimizer: " + configs['train_conf']) + + # Create schedulers + warmup_scheduler = LinearLR( + optimizer, + start_factor=1e-9, # Start at nearly 0 + end_factor=1.0, # End at base learning rate + total_iters=5000 # 5k warmup steps + ) + + constant_scheduler = ConstantLR( + optimizer, + factor=1.0, # Keep learning rate constant + total_iters=float('inf') # Run indefinitely + ) + + # Combine schedulers: warmup for 5k steps, then constant + scheduler = SequentialLR( + optimizer, + schedulers=[warmup_scheduler, constant_scheduler], + 
milestones=[5000] # Switch after 5k steps + ) + + return model, optimizer, scheduler - if configs['train_conf']['optim_d'] == 'adam': - optimizer_d = optim.Adam(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf']) - elif configs['train_conf']['optim_d'] == 'adamw': - optimizer_d = optim.AdamW(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['train_conf']) - - if configs['train_conf']['scheduler_d'] == 'warmuplr': - scheduler_type = WarmupLR - scheduler_d = WarmupLR(optimizer_d, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler_d'] == 'NoamHoldAnnealing': - scheduler_type = NoamHoldAnnealing - scheduler_d = NoamHoldAnnealing(optimizer_d, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'constantlr': - scheduler_type = ConstantLR - scheduler_d = ConstantLR(optimizer_d) - else: - raise ValueError("unknown scheduler: " + configs['train_conf']) - return model, optimizer, scheduler, optimizer_d, scheduler_d - - -def init_summarywriter(args): - """Init summary writer""" - writer = None - if int(os.environ.get('RANK', 0)) == 0: - os.makedirs(args.model_dir, exist_ok=True) - writer = SummaryWriter(args.tensorboard_dir) - return writer def save_model(model, model_name, info_dict): @@ -295,21 +242,87 @@ def batch_backward(model, scaler, info_dict): return info_dict -def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict): +def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict, model_type='llm'): """Update parameters and learning rate""" + + #Define key components based on model type + if model_type == 'llm': + key_components = { + # Text processing components + 'text_embedding': [], + 'text_encoder': [], + 'text_encoder_affine': [], + + # LLM core components + 'llm_embedding': [], + 'llm.model': [], # Qwen2 model layers + 'llm_decoder': [], + + # 
Speech components + 'speech_embedding': [], + 'spk_embed_affine': [], + + # Other components + 'other': [] + } + elif model_type == 'flow': + key_components = { + # Input processing + 'input_embedding': [], + 'spk_embed_affine': [], + + # Encoder components + 'encoder': [], + 'encoder_proj': [], + + # Flow/Diffusion components + 'decoder.cfm': [], # Conditional Flow Matching + 'decoder.unet': [], # UNet backbone + 'decoder.estimator': [], # Score/velocity estimator + 'decoder.time_embedding': [], # Time embeddings + 'decoder.conv': [], # Convolutional layers + 'decoder.attention': [], # Attention layers + + # Length regulation + 'length_regulator': [], + + # Other components + 'other': [] + } + grad_norm = 0.0 - if info_dict['train_engine'] == "deepspeed": - info_dict["is_gradient_accumulation_boundary"] = model.is_gradient_accumulation_boundary() - model.step() - grad_norm = model.get_global_grad_norm() - elif (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0: + layer_grad_norms = {} + + if (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0: + + for name, param in model.named_parameters(): + if param.grad is not None: + # Calculate gradient norm for this parameter + param_grad_norm = param.grad.data.norm(2).item() + layer_grad_norms[name] = param_grad_norm + + # Categorize into key components + categorized = False + for component_key in key_components: + if component_key != 'other': + # Special handling for decoder sub-components in flow models + if model_type == 'flow' and component_key.startswith('decoder.'): + component_pattern = component_key.replace('decoder.', '') + if 'decoder' in name and component_pattern in name: + key_components[component_key].append((name, param_grad_norm)) + categorized = True + break + elif component_key in name: + key_components[component_key].append((name, param_grad_norm)) + categorized = True + break + if not categorized: + key_components['other'].append((name, param_grad_norm)) + # Use mixed precision training 
if scaler is not None: scaler.unscale_(optimizer) grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip']) - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). if torch.isfinite(grad_norm): scaler.step(optimizer) else: @@ -325,11 +338,12 @@ def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict): scheduler.step() info_dict["lr"] = optimizer.param_groups[0]['lr'] info_dict["grad_norm"] = grad_norm + info_dict["layer_grad_norms"] = layer_grad_norms + info_dict["key_component_grads"] = key_components return info_dict - -def log_per_step(writer, info_dict): - """Log per step""" +def log_per_step(experiment, info_dict): + """Log per step using Comet ML""" tag = info_dict["tag"] epoch = info_dict.get('epoch', 0) step = info_dict["step"] @@ -337,39 +351,61 @@ def log_per_step(writer, info_dict): loss_dict = info_dict['loss_dict'] rank = int(os.environ.get('RANK', 0)) - # only rank 0 write to tensorboard to avoid multi-process write - if writer is not None: + # Only rank 0 writes to Comet ML to avoid multi-process write + if experiment is not None and rank == 0: if (info_dict['train_engine'] == 'deepspeed' and info_dict['is_gradient_accumulation_boundary'] is True) or \ (info_dict['train_engine'] == 'torch_ddp' and (info_dict['batch_idx'] + 1) % info_dict['accum_grad'] == 0): - for k in ['epoch', 'lr', 'grad_norm']: - writer.add_scalar(f'{tag}/{k}', info_dict[k], step + 1) + # Log metrics to Comet ML + experiment.log_metric(f'{tag}_epoch', info_dict['epoch'], step=step + 1) + experiment.log_metric(f'{tag}_lr', info_dict['lr'], step=step + 1) + experiment.log_metric(f'{tag}_grad_norm', info_dict['grad_norm'], step=step + 1) + + # Log all losses for k, v in loss_dict.items(): - writer.add_scalar(f'{tag}/{k}', v, step + 1) + if isinstance(v, torch.Tensor): + v = v.item() + experiment.log_metric(f'{tag}_{k}', v, step=step + 1) # TRAIN & CV, Shell log (stdout) if 
(info_dict['batch_idx'] + 1) % info_dict['log_interval'] == 0: log_str = f'{tag} Batch {epoch}/{batch_idx + 1} ' for name, value in loss_dict.items(): + if isinstance(value, torch.Tensor): + value = value.item() log_str += f'{name} {value:.6f} ' if tag == "TRAIN": log_str += f'lr {info_dict["lr"]:.8f} grad_norm {info_dict["grad_norm"]:.6f}' log_str += f' rank {rank}' logging.debug(log_str) - -def log_per_save(writer, info_dict): - """Log per save""" +def log_per_save(experiment, info_dict): + """Log per save using Comet ML""" tag = info_dict["tag"] epoch = info_dict["epoch"] step = info_dict["step"] loss_dict = info_dict["loss_dict"] lr = info_dict['lr'] rank = int(os.environ.get('RANK', 0)) - logger.info( - f'Epoch {epoch} Step {step + 1} CV info lr {lr} {rank} {''.join([f"{k} {v}" for k, v in loss_dict.items()])}') - - if writer is not None: - for k in ['epoch', 'lr']: - writer.add_scalar(f'{tag}/{k}', info_dict[k], step + 1) + + # Create loss string for logging + loss_str = ' '.join([f"{k} {v.item() if isinstance(v, torch.Tensor) else v}" for k, v in loss_dict.items()]) + logger.info(f'Epoch {epoch} Step {step + 1} CV info lr {lr} {rank} {loss_str}') + + if experiment is not None and rank == 0: + # Log metrics to Comet ML + experiment.log_metric(f'{tag}_epoch', info_dict['epoch'], step=step + 1) + experiment.log_metric(f'{tag}_lr', info_dict['lr'], step=step + 1) + + # Log all losses for k, v in loss_dict.items(): - writer.add_scalar(f'{tag}/{k}', v, step + 1) + if isinstance(v, torch.Tensor): + v = v.item() + experiment.log_metric(f'{tag}_{k}', v, step=step + 1) + + # Log additional validation info + if tag == "CV": + # Calculate average CV loss for the epoch + avg_loss = loss_dict.get('loss', 0) + if isinstance(avg_loss, torch.Tensor): + avg_loss = avg_loss.item() + experiment.log_metric('cv_avg_loss_per_epoch', avg_loss, epoch=epoch) diff --git a/speech/cosyvoice2.yaml b/speech/cosyvoice2.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..551e42edc0258d9de2ee20a680e58ffe101af168 --- /dev/null +++ b/speech/cosyvoice2.yaml @@ -0,0 +1,217 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 24000 +llm_input_size: 896 +llm_output_size: 896 +spk_embed_dim: 192 +qwen_pretrain_path: '' +token_frame_rate: 25 +token_mel_ratio: 2 + +# stream related params +chunk_size: 25 # streaming inference chunk size, in token +num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks + +# model params +# for all class/function included in this repo, we use !<new> or !<name> for initialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this.
+llm: !new:cosyvoice.llm.llm.Qwen2LM + llm_input_size: !ref <llm_input_size> + llm_output_size: !ref <llm_output_size> + speech_token_size: 6561 + length_normalized_loss: True + lsm_weight: 0 + mix_ratio: [5, 15] + llm: !new:cosyvoice.llm.llm.Qwen2Encoder + pretrain_path: !ref <qwen_pretrain_path> + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 + +flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref <spk_embed_dim> + output_type: 'mel' + vocab_size: 6561 + input_frame_rate: !ref <token_frame_rate> + only_mask_loss: True + token_mel_ratio: !ref <token_mel_ratio> + pre_lookahead_len: 3 + encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + input_size: 512 + use_cnn_module: False + macaron_style: False + static_chunk_size: !ref <chunk_size> + decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: !new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder + in_channels: 320 + out_channels: 80 + channels: [256] + dropout: 0.0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + static_chunk_size: !ref <chunk_size> * <token_mel_ratio> + num_decoding_left_chunks: !ref <num_decoding_left_chunks> + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref <sample_rate> + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 5, 3] + upsample_kernel_sizes: [16, 11, 7] + istft_params: + n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + 
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 + +# gan related module +mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1920 + num_mels: 80 + sampling_rate: !ref <sample_rate> + hop_size: 480 + win_size: 1920 + fmin: 0 + fmax: null + center: False +hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan + generator: !ref <hift> + discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator + mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator + mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator + mel_spec_transform: [ + !ref <mel_spec_transform1> + ] + +# processor functions +parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener +get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer + token_path: !ref <qwen_pretrain_path> + skip_special_tokens: True +allowed_special: 'all' +tokenize: !name:cosyvoice.dataset.processor.tokenize + get_tokenizer: !ref <get_tokenizer> + allowed_special: !ref <allowed_special> +filter: !name:cosyvoice.dataset.processor.filter + max_length: 40960 + min_length: 100 + token_max_length: 200 + token_min_length: 1 +resample: !name:cosyvoice.dataset.processor.resample + resample_rate: !ref <sample_rate> +truncate: !name:cosyvoice.dataset.processor.truncate + truncate_length: 24480 # must be a multiple of hop_size +feat_extractor: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1920 + num_mels: 80 + sampling_rate: !ref <sample_rate> + hop_size: 480 + win_size: 1920 + fmin: 0 + fmax: 8000 + center: False +compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank + feat_extractor: !ref <feat_extractor> +compute_f0: !name:cosyvoice.dataset.processor.compute_f0 + sample_rate: !ref <sample_rate> + hop_size: 480 +parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding + normalize: True +shuffle: 
!name:cosyvoice.dataset.processor.shuffle + shuffle_size: 1000 +sort: !name:cosyvoice.dataset.processor.sort + sort_size: 500 # sort_size should be less than shuffle_size +batch: !name:cosyvoice.dataset.processor.batch + batch_type: 'dynamic' + max_frames_in_batch: 2000 +padding: !name:cosyvoice.dataset.processor.padding + use_spk_embedding: False # change to True during sft + + +# dataset processor pipeline +data_pipeline: [ + !ref <parquet_opener>, + !ref <tokenize>, + !ref <filter>, + !ref <resample>, + !ref <compute_fbank>, + !ref <parse_embedding>, + !ref <shuffle>, + !ref <sort>, + !ref <batch>, + !ref <padding>, +] +data_pipeline_gan: [ + !ref <parquet_opener>, + !ref <tokenize>, + !ref <filter>, + !ref <resample>, + !ref <truncate>, + !ref <compute_fbank>, + !ref <compute_f0>, + !ref <parse_embedding>, + !ref <shuffle>, + !ref <sort>, + !ref <batch>, + !ref <padding>, +] + +# llm flow train conf +train_conf: + optim: adamw + optim_conf: + lr: 1e-5 # change to 1e-5 during sft + scheduler: constantlr # change to constantlr during sft + scheduler_conf: + warmup_steps: 2500 + max_epoch: 200 + grad_clip: 1 + accum_grad: 1 + log_interval: 100 + save_per_step: -1 \ No newline at end of file diff --git a/speech/third_party/Matcha-TTS/matcha/__init__.py b/speech/matcha/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/__init__.py rename to speech/matcha/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/app.py b/speech/matcha/app.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/app.py rename to speech/matcha/app.py diff --git a/speech/third_party/Matcha-TTS/matcha/cli.py b/speech/matcha/cli.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/cli.py rename to speech/matcha/cli.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/LICENSE b/speech/matcha/hifigan/LICENSE similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/LICENSE rename to speech/matcha/hifigan/LICENSE diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/README.md b/speech/matcha/hifigan/README.md similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/README.md rename to
speech/matcha/hifigan/README.md diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/__init__.py b/speech/matcha/hifigan/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/__init__.py rename to speech/matcha/hifigan/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/config.py b/speech/matcha/hifigan/config.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/config.py rename to speech/matcha/hifigan/config.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/denoiser.py b/speech/matcha/hifigan/denoiser.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/denoiser.py rename to speech/matcha/hifigan/denoiser.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/env.py b/speech/matcha/hifigan/env.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/env.py rename to speech/matcha/hifigan/env.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/meldataset.py b/speech/matcha/hifigan/meldataset.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/meldataset.py rename to speech/matcha/hifigan/meldataset.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/models.py b/speech/matcha/hifigan/models.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/models.py rename to speech/matcha/hifigan/models.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/xutils.py b/speech/matcha/hifigan/xutils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/xutils.py rename to speech/matcha/hifigan/xutils.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/__init__.py b/speech/matcha/models/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/__init__.py rename to speech/matcha/models/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/baselightningmodule.py 
b/speech/matcha/models/baselightningmodule.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/baselightningmodule.py rename to speech/matcha/models/baselightningmodule.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/__init__.py b/speech/matcha/models/components/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/__init__.py rename to speech/matcha/models/components/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/decoder.py b/speech/matcha/models/components/decoder.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/decoder.py rename to speech/matcha/models/components/decoder.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/flow_matching.py b/speech/matcha/models/components/flow_matching.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/flow_matching.py rename to speech/matcha/models/components/flow_matching.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/text_encoder.py b/speech/matcha/models/components/text_encoder.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/text_encoder.py rename to speech/matcha/models/components/text_encoder.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/transformer.py b/speech/matcha/models/components/transformer.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/transformer.py rename to speech/matcha/models/components/transformer.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/matcha_tts.py b/speech/matcha/models/matcha_tts.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/matcha_tts.py rename to speech/matcha/models/matcha_tts.py diff --git a/speech/third_party/Matcha-TTS/matcha/onnx/__init__.py b/speech/matcha/onnx/__init__.py 
similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/onnx/__init__.py rename to speech/matcha/onnx/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/onnx/export.py b/speech/matcha/onnx/export.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/onnx/export.py rename to speech/matcha/onnx/export.py diff --git a/speech/third_party/Matcha-TTS/matcha/onnx/infer.py b/speech/matcha/onnx/infer.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/onnx/infer.py rename to speech/matcha/onnx/infer.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/__init__.py b/speech/matcha/text/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/__init__.py rename to speech/matcha/text/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/cleaners.py b/speech/matcha/text/cleaners.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/cleaners.py rename to speech/matcha/text/cleaners.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/numbers.py b/speech/matcha/text/numbers.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/numbers.py rename to speech/matcha/text/numbers.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/symbols.py b/speech/matcha/text/symbols.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/symbols.py rename to speech/matcha/text/symbols.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/__init__.py b/speech/matcha/utils/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/__init__.py rename to speech/matcha/utils/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/audio.py b/speech/matcha/utils/audio.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/audio.py rename to speech/matcha/utils/audio.py diff --git 
a/speech/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py b/speech/matcha/utils/generate_data_statistics.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py rename to speech/matcha/utils/generate_data_statistics.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/instantiators.py b/speech/matcha/utils/instantiators.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/instantiators.py rename to speech/matcha/utils/instantiators.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/logging_utils.py b/speech/matcha/utils/logging_utils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/logging_utils.py rename to speech/matcha/utils/logging_utils.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/model.py b/speech/matcha/utils/model.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/model.py rename to speech/matcha/utils/model.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py b/speech/matcha/utils/monotonic_align/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py rename to speech/matcha/utils/monotonic_align/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx b/speech/matcha/utils/monotonic_align/core.pyx similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx rename to speech/matcha/utils/monotonic_align/core.pyx diff --git a/speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py b/speech/matcha/utils/monotonic_align/setup.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py rename to speech/matcha/utils/monotonic_align/setup.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/pylogger.py b/speech/matcha/utils/pylogger.py similarity index 
100% rename from speech/third_party/Matcha-TTS/matcha/utils/pylogger.py rename to speech/matcha/utils/pylogger.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/rich_utils.py b/speech/matcha/utils/rich_utils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/rich_utils.py rename to speech/matcha/utils/rich_utils.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/utils.py b/speech/matcha/utils/utils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/utils.py rename to speech/matcha/utils/utils.py diff --git a/speech/third_party/Matcha-TTS/.env.example b/speech/third_party/Matcha-TTS/.env.example deleted file mode 100644 index a790e320464ebc778ca07f5bcd826a9c8412ed0e..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.env.example +++ /dev/null @@ -1,6 +0,0 @@ -# example of file for storing private and user specific environment variables, like keys or system paths -# rename it to ".env" (excluded from version control by default) -# .env is loaded by train.py automatically -# hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR} - -MY_VAR="/home/user/my/system/path" diff --git a/speech/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md b/speech/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 410bcd87a45297ab8f0d369574a032858b6b1811..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,22 +0,0 @@ -## What does this PR do? - - - -Fixes #\ - -## Before submitting - -- [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**? -- [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together? -- [ ] Did you list all the **breaking changes** introduced by this pull request? 
-- [ ] Did you **test your PR locally** with `pytest` command? -- [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? - -## Did you have fun? - -Make sure you had fun coding ๐Ÿ™ƒ diff --git a/speech/third_party/Matcha-TTS/.github/codecov.yml b/speech/third_party/Matcha-TTS/.github/codecov.yml deleted file mode 100644 index c66853c4bd9991f730da5dda7dc9881986779558..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/codecov.yml +++ /dev/null @@ -1,15 +0,0 @@ -coverage: - status: - # measures overall project coverage - project: - default: - threshold: 100% # how much decrease in coverage is needed to not consider success - - # measures PR or single commit coverage - patch: - default: - threshold: 100% # how much decrease in coverage is needed to not consider success - - - # project: off - # patch: off diff --git a/speech/third_party/Matcha-TTS/.github/dependabot.yml b/speech/third_party/Matcha-TTS/.github/dependabot.yml deleted file mode 100644 index b19ccab12a3c573025ce6ba6d9068b062b29cc1b..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/dependabot.yml +++ /dev/null @@ -1,17 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. 
-# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests - target-branch: "dev" - schedule: - interval: "daily" - ignore: - - dependency-name: "pytorch-lightning" - update-types: ["version-update:semver-patch"] - - dependency-name: "torchmetrics" - update-types: ["version-update:semver-patch"] diff --git a/speech/third_party/Matcha-TTS/.github/release-drafter.yml b/speech/third_party/Matcha-TTS/.github/release-drafter.yml deleted file mode 100644 index 59af159f671abe75311eb626c8ec92ca6ea09d3c..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/release-drafter.yml +++ /dev/null @@ -1,44 +0,0 @@ -name-template: "v$RESOLVED_VERSION" -tag-template: "v$RESOLVED_VERSION" - -categories: - - title: "๐Ÿš€ Features" - labels: - - "feature" - - "enhancement" - - title: "๐Ÿ› Bug Fixes" - labels: - - "fix" - - "bugfix" - - "bug" - - title: "๐Ÿงน Maintenance" - labels: - - "maintenance" - - "dependencies" - - "refactoring" - - "cosmetic" - - "chore" - - title: "๐Ÿ“๏ธ Documentation" - labels: - - "documentation" - - "docs" - -change-template: "- $TITLE @$AUTHOR (#$NUMBER)" -change-title-escapes: '\<*_&' # You can add # and @ to disable mentions - -version-resolver: - major: - labels: - - "major" - minor: - labels: - - "minor" - patch: - labels: - - "patch" - default: patch - -template: | - ## Changes - - $CHANGES diff --git a/speech/third_party/Matcha-TTS/.gitignore b/speech/third_party/Matcha-TTS/.gitignore deleted file mode 100644 index cbec8b43a0414bbbf4cc9feae49b9dc091a60c92..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.gitignore +++ /dev/null @@ -1,163 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions 
-*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -### VisualStudioCode -.vscode/* -!.vscode/settings.json -!.vscode/tasks.json -!.vscode/launch.json -!.vscode/extensions.json -*.code-workspace -**/.vscode - -# JetBrains -.idea/ - -# Data & Models -*.h5 -*.tar -*.tar.gz - -# Lightning-Hydra-Template -configs/local/default.yaml -/data/ -/logs/ -.env - -# Aim logging -.aim - -# Cython complied files -matcha/utils/monotonic_align/core.c - -# Ignoring hifigan checkpoint -generator_v1 -g_02500000 -gradio_cached_examples/ -synth_output/ diff --git a/speech/third_party/Matcha-TTS/.pre-commit-config.yaml b/speech/third_party/Matcha-TTS/.pre-commit-config.yaml deleted file mode 100644 index e695f115eba12d84fe6f465c5d834dfa35c3d2ec..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.pre-commit-config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -default_language_version: - python: python3.10 - -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - # list of supported hooks: https://pre-commit.com/hooks.html - - id: trailing-whitespace - - id: end-of-file-fixer - # - id: check-docstring-first - - id: check-yaml - - id: debug-statements - - id: detect-private-key - - id: check-toml - - id: check-case-conflict - - id: check-added-large-files - - # python code formatting - - repo: https://github.com/psf/black - rev: 23.12.1 - hooks: - - id: black - args: [--line-length, "120"] - - # python import sorting - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - args: ["--profile", "black", "--filter-files"] - - # python upgrading syntax to 
newer version - - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 - hooks: - - id: pyupgrade - args: [--py38-plus] - - # python check (PEP8), programming errors and code complexity - - repo: https://github.com/PyCQA/flake8 - rev: 7.0.0 - hooks: - - id: flake8 - args: - [ - "--max-line-length", "120", - "--extend-ignore", - "E203,E402,E501,F401,F841,RST2,RST301", - "--exclude", - "logs/*,data/*,matcha/hifigan/*", - ] - additional_dependencies: [flake8-rst-docstrings==0.3.0] - - # pylint - - repo: https://github.com/pycqa/pylint - rev: v3.0.3 - hooks: - - id: pylint diff --git a/speech/third_party/Matcha-TTS/.project-root b/speech/third_party/Matcha-TTS/.project-root deleted file mode 100644 index 63eab774b9e36aa1a46cbd31b59cbd373bc5477f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.project-root +++ /dev/null @@ -1,2 +0,0 @@ -# this file is required for inferring the project root directory -# do not delete diff --git a/speech/third_party/Matcha-TTS/.pylintrc b/speech/third_party/Matcha-TTS/.pylintrc deleted file mode 100644 index 962864189eab99a66b315b80f5a9976e7a423d4a..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.pylintrc +++ /dev/null @@ -1,525 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. 
-jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". 
-disable=missing-docstring, - too-many-public-methods, - too-many-lines, - bare-except, - ## for avoiding weird p3.6 CI linter error - ## TODO: see later if we can remove this - assigning-non-slot, - unsupported-assignment-operation, - ## end - line-too-long, - fixme, - wrong-import-order, - ungrouped-imports, - wrong-import-position, - import-error, - invalid-name, - too-many-instance-attributes, - arguments-differ, - arguments-renamed, - no-name-in-module, - no-member, - unsubscriptable-object, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - useless-object-inheritance, - too-few-public-methods, - too-many-branches, - too-many-arguments, - too-many-locals, - too-many-statements, - duplicate-code, - not-callable, - import-outside-toplevel, - logging-fstring-interpolation, - logging-not-lazy, - unused-argument, - no-else-return, - chained-comparison, - redefined-outer-name - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -#msg-template= - -# Set the output format. 
Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit - - -[LOGGING] - -# Format style used to check logging format string. `old` means using % -# formatting, while `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package.. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. 
-contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=numpy.*,torch.* - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. 
-missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=120 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. 
-single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=4 - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. -argument-rgx=[a-z_][a-z0-9_]{0,30}$ - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names= - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. -#class-attribute-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. 
-good-names=i, - j, - k, - x, - ex, - Run, - _ - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. -#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. -variable-rgx=[a-z_][a-z0-9_]{0,30}$ - - -[STRING] - -# This flag controls whether the implicit-str-concat-in-sequence should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=no - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. 
This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement. -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). 
-max-parents=15 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". -overgeneral-exceptions=builtins.BaseException, - builtins.Exception diff --git a/speech/third_party/Matcha-TTS/LICENSE b/speech/third_party/Matcha-TTS/LICENSE deleted file mode 100644 index 858018e750da7be7b271bb7307e68d159ed67ef6..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Shivam Mehta - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/speech/third_party/Matcha-TTS/MANIFEST.in b/speech/third_party/Matcha-TTS/MANIFEST.in deleted file mode 100644 index c013140cdfb9de19c4d4e73c73a44e33f33fa871..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/MANIFEST.in +++ /dev/null @@ -1,14 +0,0 @@ -include README.md -include LICENSE.txt -include requirements.*.txt -include *.cff -include requirements.txt -include matcha/VERSION -recursive-include matcha *.json -recursive-include matcha *.html -recursive-include matcha *.png -recursive-include matcha *.md -recursive-include matcha *.py -recursive-include matcha *.pyx -recursive-exclude tests * -prune tests* diff --git a/speech/third_party/Matcha-TTS/Makefile b/speech/third_party/Matcha-TTS/Makefile deleted file mode 100644 index 4b523dd17b13a19617c9cc9d9dad7f7d8d4c24a0..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/Makefile +++ /dev/null @@ -1,42 +0,0 @@ - -help: ## Show help - @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' - -clean: ## Clean autogenerated files - rm -rf dist - find . -type f -name "*.DS_Store" -ls -delete - find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf - find . | grep -E ".pytest_cache" | xargs rm -rf - find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf - rm -f .coverage - -clean-logs: ## Clean logs - rm -rf logs/** - -create-package: ## Create wheel and tar gz - rm -rf dist/ - python setup.py bdist_wheel --plat-name=manylinux1_x86_64 - python setup.py sdist - python -m twine upload dist/* --verbose --skip-existing - -format: ## Run pre-commit hooks - pre-commit run -a - -sync: ## Merge changes from main branch to your current branch - git pull - git pull origin main - -test: ## Run not slow tests - pytest -k "not slow" - -test-full: ## Run all tests - pytest - -train-ljspeech: ## Train the model - python matcha/train.py experiment=ljspeech - -train-ljspeech-min: ## Train the model with minimum memory - python matcha/train.py experiment=ljspeech_min_memory - -start_app: ## Start the app - python matcha/app.py diff --git a/speech/third_party/Matcha-TTS/README.md b/speech/third_party/Matcha-TTS/README.md deleted file mode 100644 index ebc6b7c0a76d30c33bf95583d629825c02183e31..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/README.md +++ /dev/null @@ -1,278 +0,0 @@ -
- -# ๐Ÿต Matcha-TTS: A fast TTS architecture with conditional flow matching - -### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [ร‰va Szรฉkely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/) - -[![python](https://img.shields.io/badge/-Python_3.10-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3100/) -[![pytorch](https://img.shields.io/badge/PyTorch_2.0+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally/) -[![lightning](https://img.shields.io/badge/-Lightning_2.0+-792ee5?logo=pytorchlightning&logoColor=white)](https://pytorchlightning.ai/) -[![hydra](https://img.shields.io/badge/Config-Hydra_1.3-89b8cd)](https://hydra.cc/) -[![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray)](https://black.readthedocs.io/en/stable/) -[![isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) - -

- -

- -
- -> This is the official code implementation of ๐Ÿต Matcha-TTS [ICASSP 2024]. - -We propose ๐Ÿต Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses [conditional flow matching](https://arxiv.org/abs/2210.02747) (similar to [rectified flows](https://arxiv.org/abs/2209.03003)) to speed up ODE-based speech synthesis. Our method: - -- Is probabilistic -- Has compact memory footprint -- Sounds highly natural -- Is very fast to synthesise from - -Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS) and read [our ICASSP 2024 paper](https://arxiv.org/abs/2309.03199) for more details. - -[Pre-trained models](https://drive.google.com/drive/folders/17C_gYgEHOxI5ZypcfE_k1piKCtyR0isJ?usp=sharing) will be automatically downloaded with the CLI or gradio interface. - -You can also [try ๐Ÿต Matcha-TTS in your browser on HuggingFace ๐Ÿค— spaces](https://huggingface.co/spaces/shivammehta25/Matcha-TTS). - -## Teaser video - -[![Watch the video](https://img.youtube.com/vi/xmvJkz3bqw0/hqdefault.jpg)](https://youtu.be/xmvJkz3bqw0) - -## Installation - -1. Create an environment (suggested but optional) - -``` -conda create -n matcha-tts python=3.10 -y -conda activate matcha-tts -``` - -2. Install Matcha TTS using pip or from source - -```bash -pip install matcha-tts -``` - -from source - -```bash -pip install git+https://github.com/shivammehta25/Matcha-TTS.git -cd Matcha-TTS -pip install -e . -``` - -3. 
Run CLI / gradio app / jupyter notebook - -```bash -# This will download the required models -matcha-tts --text "" -``` - -or - -```bash -matcha-tts-app -``` - -or open `synthesis.ipynb` on jupyter notebook - -### CLI Arguments - -- To synthesise from given text, run: - -```bash -matcha-tts --text "" -``` - -- To synthesise from a file, run: - -```bash -matcha-tts --file -``` - -- To batch synthesise from a file, run: - -```bash -matcha-tts --file --batched -``` - -Additional arguments - -- Speaking rate - -```bash -matcha-tts --text "" --speaking_rate 1.0 -``` - -- Sampling temperature - -```bash -matcha-tts --text "" --temperature 0.667 -``` - -- Euler ODE solver steps - -```bash -matcha-tts --text "" --steps 10 -``` - -## Train with your own dataset - -Let's assume we are training with LJ Speech - -1. Download the dataset from [here](https://keithito.com/LJ-Speech-Dataset/), extract it to `data/LJSpeech-1.1`, and prepare the file lists to point to the extracted data like for [item 5 in the setup of the NVIDIA Tacotron 2 repo](https://github.com/NVIDIA/tacotron2#setup). - -2. Clone and enter the Matcha-TTS repository - -```bash -git clone https://github.com/shivammehta25/Matcha-TTS.git -cd Matcha-TTS -``` - -3. Install the package from source - -```bash -pip install -e . -``` - -4. Go to `configs/data/ljspeech.yaml` and change - -```yaml -train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt -valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt -``` - -5. Generate normalisation statistics with the yaml file of dataset configuration - -```bash -matcha-data-stats -i ljspeech.yaml -# Output: -#{'mel_mean': -5.53662231756592, 'mel_std': 2.1161014277038574} -``` - -Update these values in `configs/data/ljspeech.yaml` under `data_statistics` key. - -```bash -data_statistics: # Computed for ljspeech dataset - mel_mean: -5.536622 - mel_std: 2.116101 -``` - -to the paths of your train and validation filelists. - -6. 
Run the training script - -```bash -make train-ljspeech -``` - -or - -```bash -python matcha/train.py experiment=ljspeech -``` - -- for a minimum memory run - -```bash -python matcha/train.py experiment=ljspeech_min_memory -``` - -- for multi-gpu training, run - -```bash -python matcha/train.py experiment=ljspeech trainer.devices=[0,1] -``` - -7. Synthesise from the custom trained model - -```bash -matcha-tts --text "" --checkpoint_path -``` - -## ONNX support - -> Special thanks to [@mush42](https://github.com/mush42) for implementing ONNX export and inference support. - -It is possible to export Matcha checkpoints to [ONNX](https://onnx.ai/), and run inference on the exported ONNX graph. - -### ONNX export - -To export a checkpoint to ONNX, first install ONNX with - -```bash -pip install onnx -``` - -then run the following: - -```bash -python3 -m matcha.onnx.export matcha.ckpt model.onnx --n-timesteps 5 -``` - -Optionally, the ONNX exporter accepts **vocoder-name** and **vocoder-checkpoint** arguments. This enables you to embed the vocoder in the exported graph and generate waveforms in a single run (similar to end-to-end TTS systems). - -**Note** that `n_timesteps` is treated as a hyper-parameter rather than a model input. This means you should specify it during export (not during inference). If not specified, `n_timesteps` is set to **5**. - -**Important**: for now, torch>=2.1.0 is needed for export since the `scaled_product_attention` operator is not exportable in older versions. Until the final version is released, those who want to export their models must install torch>=2.1.0 manually as a pre-release. 
- -### ONNX Inference - -To run inference on the exported model, first install `onnxruntime` using - -```bash -pip install onnxruntime -pip install onnxruntime-gpu # for GPU inference -``` - -then use the following: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs -``` - -You can also control synthesis parameters: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --temperature 0.4 --speaking_rate 0.9 --spk 0 -``` - -To run inference on **GPU**, make sure to install **onnxruntime-gpu** package, and then pass `--gpu` to the inference command: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --gpu -``` - -If you exported only Matcha to ONNX, this will write mel-spectrogram as graphs and `numpy` arrays to the output directory. -If you embedded the vocoder in the exported graph, this will write `.wav` audio files to the output directory. - -If you exported only Matcha to ONNX, and you want to run a full TTS pipeline, you can pass a path to a vocoder model in `ONNX` format: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --vocoder hifigan.small.onnx -``` - -This will write `.wav` audio files to the output directory. - -## Citation information - -If you use our code or otherwise find this work useful, please cite our paper: - -```text -@inproceedings{mehta2024matcha, - title={Matcha-{TTS}: A fast {TTS} architecture with conditional flow matching}, - author={Mehta, Shivam and Tu, Ruibo and Beskow, Jonas and Sz{\'e}kely, {\'E}va and Henter, Gustav Eje}, - booktitle={Proc. ICASSP}, - year={2024} -} -``` - -## Acknowledgements - -Since this code uses [Lightning-Hydra-Template](https://github.com/ashleve/lightning-hydra-template), you have all the powers that come with it. 
- -Other source code we would like to acknowledge: - -- [Coqui-TTS](https://github.com/coqui-ai/TTS/tree/dev): For helping me figure out how to make cython binaries pip installable and encouragement -- [Hugging Face Diffusers](https://huggingface.co/): For their awesome diffusers library and its components -- [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS): For the monotonic alignment search source code -- [torchdyn](https://github.com/DiffEqML/torchdyn): Useful for trying other ODE solvers during research and development -- [labml.ai](https://nn.labml.ai/transformers/rope/index.html): For the RoPE implementation diff --git a/speech/third_party/Matcha-TTS/configs/__init__.py b/speech/third_party/Matcha-TTS/configs/__init__.py deleted file mode 100644 index 56bf7f4aa4906bc0f997132708cc0826c198e4aa..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# this file is needed here to include configs when building project as a package diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/default.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/default.yaml deleted file mode 100644 index ebaa3ed31a7f626bc62f90184dc4b25b631e52a9..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/default.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - model_checkpoint.yaml - - model_summary.yaml - - rich_progress_bar.yaml - - _self_ diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml deleted file mode 100644 index 3d085c711a8521b6b98ad6401b686bb601ceacd6..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html - -model_checkpoint: - _target_: 
lightning.pytorch.callbacks.ModelCheckpoint - dirpath: ${paths.output_dir}/checkpoints # directory to save the model file - filename: checkpoint_{epoch:03d} # checkpoint filename - monitor: epoch # name of the logged metric which determines when model is improving - verbose: False # verbosity mode - save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt - save_top_k: 10 # save k best models (determined by above metric) - mode: "max" # "max" means higher metric value is better, can be also "min" - auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name - save_weights_only: False # if True, then only the modelโ€™s weights will be saved - every_n_train_steps: null # number of training steps between checkpoints - train_time_interval: null # checkpoints are monitored at the specified time interval - every_n_epochs: 100 # number of epochs between checkpoints - save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml deleted file mode 100644 index 6e5368d0e94298cce6d5421365b4583bd763ba92..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html - -model_summary: - _target_: lightning.pytorch.callbacks.RichModelSummary - max_depth: 3 # the maximum depth of layer nesting that the summary will include diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/none.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/none.yaml deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/speech/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml deleted file mode 100644 index de6f1ccb11205a4db93645fb6f297e50205de172..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html - -rich_progress_bar: - _target_: lightning.pytorch.callbacks.RichProgressBar diff --git a/speech/third_party/Matcha-TTS/configs/debug/default.yaml b/speech/third_party/Matcha-TTS/configs/debug/default.yaml deleted file mode 100644 index e3932c82585fbe44047c1569a5cfe9ee9895c71a..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/default.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# @package _global_ - -# default debugging setup, runs 1 full epoch -# other debugging configs can inherit from this one - -# overwrite task name so debugging logs are stored in separate folder -task_name: "debug" - -# disable callbacks and loggers during debugging -# callbacks: null -# logger: null - -extras: - ignore_warnings: False - enforce_tags: False - -# sets level of all command line loggers to 'DEBUG' -# https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ -hydra: - job_logging: - root: - level: DEBUG - - # use this to also set hydra loggers to 'DEBUG' - # verbose: True - -trainer: - max_epochs: 1 - accelerator: cpu # debuggers don't like gpus - devices: 1 # debuggers don't like multiprocessing - detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor - -data: - num_workers: 0 # debuggers don't like multiprocessing - pin_memory: False # disable gpu memory pin diff --git a/speech/third_party/Matcha-TTS/configs/debug/fdr.yaml b/speech/third_party/Matcha-TTS/configs/debug/fdr.yaml deleted file mode 100644 index 
7f2d34fa37c31017e749d5a4fc5ae6763e688b46..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/fdr.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# @package _global_ - -# runs 1 train, 1 validation and 1 test step - -defaults: - - default - -trainer: - fast_dev_run: true diff --git a/speech/third_party/Matcha-TTS/configs/debug/limit.yaml b/speech/third_party/Matcha-TTS/configs/debug/limit.yaml deleted file mode 100644 index 514d77fbd1475b03fff0372e3da3c2fa7ea7d190..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/limit.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# @package _global_ - -# uses only 1% of the training data and 5% of validation/test data - -defaults: - - default - -trainer: - max_epochs: 3 - limit_train_batches: 0.01 - limit_val_batches: 0.05 - limit_test_batches: 0.05 diff --git a/speech/third_party/Matcha-TTS/configs/debug/overfit.yaml b/speech/third_party/Matcha-TTS/configs/debug/overfit.yaml deleted file mode 100644 index 9906586a67a12aa81ff69138f589a366dbe2222f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/overfit.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# @package _global_ - -# overfits to 3 batches - -defaults: - - default - -trainer: - max_epochs: 20 - overfit_batches: 3 - -# model ckpt and early stopping need to be disabled during overfitting -callbacks: null diff --git a/speech/third_party/Matcha-TTS/configs/debug/profiler.yaml b/speech/third_party/Matcha-TTS/configs/debug/profiler.yaml deleted file mode 100644 index 266295f15e0166e1d1b58b88caa7673f4b6493b5..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/profiler.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# @package _global_ - -# runs with execution time profiling - -defaults: - - default - -trainer: - max_epochs: 1 - # profiler: "simple" - profiler: "advanced" - # profiler: "pytorch" - accelerator: gpu - - limit_train_batches: 0.02 diff --git 
a/speech/third_party/Matcha-TTS/configs/eval.yaml b/speech/third_party/Matcha-TTS/configs/eval.yaml deleted file mode 100644 index be312992b2a486b04d83a54dbd8f670d94979709..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/eval.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# @package _global_ - -defaults: - - _self_ - - data: mnist # choose datamodule with `test_dataloader()` for evaluation - - model: mnist - - logger: null - - trainer: default - - paths: default - - extras: default - - hydra: default - -task_name: "eval" - -tags: ["dev"] - -# passing checkpoint path is necessary for evaluation -ckpt_path: ??? diff --git a/speech/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml b/speech/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml deleted file mode 100644 index 7e6c57a0d0a399f7463f4ff2d96e1928c435779b..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: hi-fi_en-US_female.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] - -run_name: hi-fi_en-US_female_piper_phonemizer diff --git a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml b/speech/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml deleted file mode 100644 index d5723f42cf3552226c42bd91202cc18818b685f0..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: 
ljspeech.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["ljspeech"] - -run_name: ljspeech diff --git a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml b/speech/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml deleted file mode 100644 index ef554dc633c392b1592d90d9d7734f2329264fdd..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: ljspeech.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["ljspeech"] - -run_name: ljspeech_min - - -model: - out_size: 172 diff --git a/speech/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml b/speech/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml deleted file mode 100644 index 553842f4e2168db0fee4e44db11b5d086295b044..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: vctk.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["multispeaker"] - -run_name: multispeaker diff --git a/speech/third_party/Matcha-TTS/configs/extras/default.yaml b/speech/third_party/Matcha-TTS/configs/extras/default.yaml deleted file mode 100644 index b9c6b622283a647fbc513166fc14f016cc3ed8a0..0000000000000000000000000000000000000000 --- 
a/speech/third_party/Matcha-TTS/configs/extras/default.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# disable python warnings if they annoy you -ignore_warnings: False - -# ask user for tags if none are provided in the config -enforce_tags: True - -# pretty print config tree at the start of the run using Rich library -print_config: True diff --git a/speech/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml b/speech/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml deleted file mode 100644 index 1391183ebcdec3d8f5eb61374e0719d13c7545da..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# @package _global_ - -# example hyperparameter optimization of some experiment with Optuna: -# python train.py -m hparams_search=mnist_optuna experiment=example - -defaults: - - override /hydra/sweeper: optuna - -# choose metric which will be optimized by Optuna -# make sure this is the correct name of some metric logged in lightning module! 
-optimized_metric: "val/acc_best" - -# here we define Optuna hyperparameter search -# it optimizes for value returned from function with @hydra.main decorator -# docs: https://hydra.cc/docs/next/plugins/optuna_sweeper -hydra: - mode: "MULTIRUN" # set hydra to multirun by default if this config is attached - - sweeper: - _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper - - # storage URL to persist optimization results - # for example, you can use SQLite if you set 'sqlite:///example.db' - storage: null - - # name of the study to persist optimization results - study_name: null - - # number of parallel workers - n_jobs: 1 - - # 'minimize' or 'maximize' the objective - direction: maximize - - # total number of runs that will be executed - n_trials: 20 - - # choose Optuna hyperparameter sampler - # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others - # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html - sampler: - _target_: optuna.samplers.TPESampler - seed: 1234 - n_startup_trials: 10 # number of random sampling runs before optimization starts - - # define hyperparameter search space - params: - model.optimizer.lr: interval(0.0001, 0.1) - data.batch_size: choice(32, 64, 128, 256) - model.net.lin1_size: choice(64, 128, 256) - model.net.lin2_size: choice(64, 128, 256) - model.net.lin3_size: choice(32, 64, 128, 256) diff --git a/speech/third_party/Matcha-TTS/configs/hydra/default.yaml b/speech/third_party/Matcha-TTS/configs/hydra/default.yaml deleted file mode 100644 index 1533136b22802a4f81e5387b74e407289edce94d..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/hydra/default.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# https://hydra.cc/docs/configure_hydra/intro/ - -# enable color logging -defaults: - - override hydra_logging: colorlog - - override job_logging: colorlog - -# output directory, generated dynamically on each run -run: - dir: 
${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} -sweep: - dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} - subdir: ${hydra.job.num} - -job_logging: - handlers: - file: - # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log diff --git a/speech/third_party/Matcha-TTS/configs/local/.gitkeep b/speech/third_party/Matcha-TTS/configs/local/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/speech/third_party/Matcha-TTS/configs/logger/aim.yaml b/speech/third_party/Matcha-TTS/configs/logger/aim.yaml deleted file mode 100644 index 8f9f6adad7feb2780c2efd5ddb0ed053621e05f8..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/aim.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# https://aimstack.io/ - -# example usage in lightning module: -# https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py - -# open the Aim UI with the following command (run in the folder containing the `.aim` folder): -# `aim up` - -aim: - _target_: aim.pytorch_lightning.AimLogger - repo: ${paths.root_dir} # .aim folder will be created here - # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# - - # aim allows to group runs under experiment name - experiment: null # any string, set to "default" if not specified - - train_metric_prefix: "train/" - val_metric_prefix: "val/" - test_metric_prefix: "test/" - - # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 
- system_tracking_interval: 10 # set to null to disable system metrics tracking - - # enable/disable logging of system params such as installed packages, git info, env vars, etc. - log_system_params: true - - # enable/disable tracking console logs (default value is true) - capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 diff --git a/speech/third_party/Matcha-TTS/configs/logger/comet.yaml b/speech/third_party/Matcha-TTS/configs/logger/comet.yaml deleted file mode 100644 index e0789274e2137ee6c97ca37a5d56c2b8abaf0aaa..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/comet.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# https://www.comet.ml - -comet: - _target_: lightning.pytorch.loggers.comet.CometLogger - api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable - save_dir: "${paths.output_dir}" - project_name: "lightning-hydra-template" - rest_api_key: null - # experiment_name: "" - experiment_key: null # set to resume experiment - offline: False - prefix: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/csv.yaml b/speech/third_party/Matcha-TTS/configs/logger/csv.yaml deleted file mode 100644 index fa028e9c146430c319101ffdfce466514338591c..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/csv.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# csv logger built in lightning - -csv: - _target_: lightning.pytorch.loggers.csv_logs.CSVLogger - save_dir: "${paths.output_dir}" - name: "csv/" - prefix: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/many_loggers.yaml b/speech/third_party/Matcha-TTS/configs/logger/many_loggers.yaml deleted file mode 100644 index dd586800bdccb4e8f4b0236a181b7ddd756ba9ab..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/many_loggers.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# train with many loggers at once - 
-defaults: - # - comet - - csv - # - mlflow - # - neptune - - tensorboard - - wandb diff --git a/speech/third_party/Matcha-TTS/configs/logger/mlflow.yaml b/speech/third_party/Matcha-TTS/configs/logger/mlflow.yaml deleted file mode 100644 index f8fb7e685fa27fc8141387a421b90a0b9b492d9e..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/mlflow.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# https://mlflow.org - -mlflow: - _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger - # experiment_name: "" - # run_name: "" - tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI - tags: null - # save_dir: "./mlruns" - prefix: "" - artifact_location: null - # run_id: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/neptune.yaml b/speech/third_party/Matcha-TTS/configs/logger/neptune.yaml deleted file mode 100644 index 8233c140018ecce6ab62971beed269991d31c89b..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/neptune.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# https://neptune.ai - -neptune: - _target_: lightning.pytorch.loggers.neptune.NeptuneLogger - api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable - project: username/lightning-hydra-template - # name: "" - log_model_checkpoints: True - prefix: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/tensorboard.yaml b/speech/third_party/Matcha-TTS/configs/logger/tensorboard.yaml deleted file mode 100644 index 2bd31f6d8ba68d1f5c36a804885d5b9f9c1a9302..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/tensorboard.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# https://www.tensorflow.org/tensorboard/ - -tensorboard: - _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger - save_dir: "${paths.output_dir}/tensorboard/" - name: null - log_graph: False - default_hp_metric: True - prefix: "" - # version: "" diff 
--git a/speech/third_party/Matcha-TTS/configs/logger/wandb.yaml b/speech/third_party/Matcha-TTS/configs/logger/wandb.yaml deleted file mode 100644 index ece165889b3d0d9dc750a8f3c7454188cfdf12b7..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/wandb.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# https://wandb.ai - -wandb: - _target_: lightning.pytorch.loggers.wandb.WandbLogger - # name: "" # name of the run (normally generated by wandb) - save_dir: "${paths.output_dir}" - offline: False - id: null # pass correct id to resume experiment! - anonymous: null # enable anonymous logging - project: "lightning-hydra-template" - log_model: False # upload lightning ckpts - prefix: "" # a string to put at the beginning of metric keys - # entity: "" # set to name of your wandb team - group: "" - tags: [] - job_type: "" diff --git a/speech/third_party/Matcha-TTS/configs/model/cfm/default.yaml b/speech/third_party/Matcha-TTS/configs/model/cfm/default.yaml deleted file mode 100644 index 0d1d9609e2d05c7b0a12a26115520340ac18e584..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/cfm/default.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: CFM -solver: euler -sigma_min: 1e-4 diff --git a/speech/third_party/Matcha-TTS/configs/model/decoder/default.yaml b/speech/third_party/Matcha-TTS/configs/model/decoder/default.yaml deleted file mode 100644 index aaa00e63402ade5c76247a2f1d6b294ec3c61e63..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/decoder/default.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: [256, 256] -dropout: 0.05 -attention_head_dim: 64 -n_blocks: 1 -num_mid_blocks: 2 -num_heads: 2 -act_fn: snakebeta diff --git a/speech/third_party/Matcha-TTS/configs/model/encoder/default.yaml b/speech/third_party/Matcha-TTS/configs/model/encoder/default.yaml deleted file mode 100644 index d4d5e5adee8f707bd384b682a3ad9a116c40c6ed..0000000000000000000000000000000000000000 --- 
a/speech/third_party/Matcha-TTS/configs/model/encoder/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -encoder_type: RoPE Encoder -encoder_params: - n_feats: ${model.n_feats} - n_channels: 192 - filter_channels: 768 - filter_channels_dp: 256 - n_heads: 2 - n_layers: 6 - kernel_size: 3 - p_dropout: 0.1 - spk_emb_dim: 64 - n_spks: 1 - prenet: true - -duration_predictor_params: - filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} - kernel_size: 3 - p_dropout: ${model.encoder.encoder_params.p_dropout} diff --git a/speech/third_party/Matcha-TTS/configs/model/matcha.yaml b/speech/third_party/Matcha-TTS/configs/model/matcha.yaml deleted file mode 100644 index 36f6eafbdcaa324f7494a4b97a7590da7824f357..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/matcha.yaml +++ /dev/null @@ -1,15 +0,0 @@ -defaults: - - _self_ - - encoder: default.yaml - - decoder: default.yaml - - cfm: default.yaml - - optimizer: adam.yaml - -_target_: matcha.models.matcha_tts.MatchaTTS -n_vocab: 178 -n_spks: ${data.n_spks} -spk_emb_dim: 64 -n_feats: 80 -data_statistics: ${data.data_statistics} -out_size: null # Must be divisible by 4 -prior_loss: true diff --git a/speech/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml b/speech/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml deleted file mode 100644 index 42795577474eaee5b0b96845a95e1a11c9152385..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml +++ /dev/null @@ -1,4 +0,0 @@ -_target_: torch.optim.Adam -_partial_: true -lr: 1e-4 -weight_decay: 0.0 diff --git a/speech/third_party/Matcha-TTS/configs/paths/default.yaml b/speech/third_party/Matcha-TTS/configs/paths/default.yaml deleted file mode 100644 index ec81db2d34712909a79be3e42e65efe08c35ecee..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/paths/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# path to root directory -# this 
requires PROJECT_ROOT environment variable to exist -# you can replace it with "." if you want the root to be the current working directory -root_dir: ${oc.env:PROJECT_ROOT} - -# path to data directory -data_dir: ${paths.root_dir}/data/ - -# path to logging directory -log_dir: ${paths.root_dir}/logs/ - -# path to output directory, created dynamically by hydra -# path generation pattern is specified in `configs/hydra/default.yaml` -# use it to store all files generated during the run, like ckpts and metrics -output_dir: ${hydra:runtime.output_dir} - -# path to working directory -work_dir: ${hydra:runtime.cwd} diff --git a/speech/third_party/Matcha-TTS/configs/train.yaml b/speech/third_party/Matcha-TTS/configs/train.yaml deleted file mode 100644 index e6f5c2e7b9781758c8d25f941f004ca383c3f494..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/train.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# @package _global_ - -# specify here default configuration -# order of defaults determines the order in which configs override each other -defaults: - - _self_ - - data: ljspeech - - model: matcha - - callbacks: default - - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) - - trainer: default - - paths: default - - extras: default - - hydra: default - - # experiment configs allow for version control of specific hyperparameters - # e.g. best hyperparameters for given model and datamodule - - experiment: null - - # config for hyperparameter optimization - - hparams_search: null - - # optional local config for machine/user specific settings - # it's optional since it doesn't need to exist and is excluded from version control - - optional local: default - - # debugging config (enable through command line, e.g. `python train.py debug=default) - - debug: null - -# task name, determines output directory path -task_name: "train" - -run_name: ??? 
- -# tags to help you identify your experiments -# you can overwrite this in experiment configs -# overwrite from command line with `python train.py tags="[first_tag, second_tag]"` -tags: ["dev"] - -# set False to skip model training -train: True - -# evaluate on test set, using best model weights achieved during training -# lightning chooses best weights based on the metric specified in checkpoint callback -test: True - -# simply provide checkpoint path to resume training -ckpt_path: null - -# seed for random number generators in pytorch, numpy and python.random -seed: 1234 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/cpu.yaml b/speech/third_party/Matcha-TTS/configs/trainer/cpu.yaml deleted file mode 100644 index b7d6767e60c956567555980654f15e7bb673a41f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/cpu.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - default - -accelerator: cpu -devices: 1 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/ddp.yaml b/speech/third_party/Matcha-TTS/configs/trainer/ddp.yaml deleted file mode 100644 index 94b43e20ca7bf1f2ea92627fd46906e4f0a273a1..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/ddp.yaml +++ /dev/null @@ -1,9 +0,0 @@ -defaults: - - default - -strategy: ddp - -accelerator: gpu -devices: [0,1] -num_nodes: 1 -sync_batchnorm: True diff --git a/speech/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml b/speech/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml deleted file mode 100644 index 8404419e5c295654967d0dfb73a7366e75be2f1f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml +++ /dev/null @@ -1,7 +0,0 @@ -defaults: - - default - -# simulate DDP on CPU, useful for debugging -accelerator: cpu -devices: 2 -strategy: ddp_spawn diff --git a/speech/third_party/Matcha-TTS/configs/trainer/default.yaml 
b/speech/third_party/Matcha-TTS/configs/trainer/default.yaml deleted file mode 100644 index ee3d370d8ca6b08d7ee7a86d34184c2104f0e1ef..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/default.yaml +++ /dev/null @@ -1,20 +0,0 @@ -_target_: lightning.pytorch.trainer.Trainer - -default_root_dir: ${paths.output_dir} - -max_epochs: -1 - -accelerator: gpu -devices: [0] - -# mixed precision for extra speed-up -precision: 16-mixed - -# perform a validation loop every N training epochs -check_val_every_n_epoch: 1 - -# set True to to ensure deterministic results -# makes training slower but gives more reproducibility than just setting seeds -deterministic: False - -gradient_clip_val: 5.0 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/gpu.yaml b/speech/third_party/Matcha-TTS/configs/trainer/gpu.yaml deleted file mode 100644 index b2389510a90f5f0161cff6ccfcb4a96097ddf9a1..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/gpu.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - default - -accelerator: gpu -devices: 1 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/mps.yaml b/speech/third_party/Matcha-TTS/configs/trainer/mps.yaml deleted file mode 100644 index 1ecf6d5cc3a34ca127c5510f4a18e989561e38e4..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/mps.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - default - -accelerator: mps -devices: 1 diff --git a/speech/third_party/Matcha-TTS/matcha/VERSION b/speech/third_party/Matcha-TTS/matcha/VERSION deleted file mode 100644 index 442b1138f7851df1c22deb15fd5d6ff5b742e550..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/matcha/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.0.5.1 diff --git a/speech/third_party/Matcha-TTS/notebooks/.gitkeep b/speech/third_party/Matcha-TTS/notebooks/.gitkeep deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/speech/third_party/Matcha-TTS/pyproject.toml b/speech/third_party/Matcha-TTS/pyproject.toml deleted file mode 100644 index 74aa39300a61b8b3607dc634d68aa47013141ec5..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/pyproject.toml +++ /dev/null @@ -1,51 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] - -[tool.black] -line-length = 120 -target-version = ['py310'] -exclude = ''' - -( - /( - \.eggs # exclude a few common directories in the - | \.git # root of the project - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | _build - | buck-out - | build - | dist - )/ - | foo.py # also separately exclude a file named foo.py in - # the root of the project -) -''' - -[tool.pytest.ini_options] -addopts = [ - "--color=yes", - "--durations=0", - "--strict-markers", - "--doctest-modules", -] -filterwarnings = [ - "ignore::DeprecationWarning", - "ignore::UserWarning", -] -log_cli = "True" -markers = [ - "slow: slow tests", -] -minversion = "6.0" -testpaths = "tests/" - -[tool.coverage.report] -exclude_lines = [ - "pragma: nocover", - "raise NotImplementedError", - "raise NotImplementedError()", - "if __name__ == .__main__.:", -] diff --git a/speech/third_party/Matcha-TTS/requirements.txt b/speech/third_party/Matcha-TTS/requirements.txt deleted file mode 100644 index 3e14a532cb14f99190404472915213940bfad4b9..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/requirements.txt +++ /dev/null @@ -1,45 +0,0 @@ -# --------- pytorch --------- # -torch>=2.0.0 -torchvision>=0.15.0 -lightning>=2.0.0 -torchmetrics>=0.11.4 - -# --------- hydra --------- # -hydra-core==1.3.2 -hydra-colorlog==1.2.0 -hydra-optuna-sweeper==1.2.0 - -# --------- loggers --------- # -# wandb -# neptune-client -# mlflow -# comet-ml -# aim>=3.16.2 # no lower than 3.16.2, see 
https://github.com/aimhubio/aim/issues/2550 - -# --------- others --------- # -rootutils # standardizing the project root setup -pre-commit # hooks for applying linters on commit -rich # beautiful text formatting in terminal -pytest # tests -# sh # for running bash commands in some tests (linux/macos only) -phonemizer # phonemization of text -tensorboard -librosa -Cython -numpy -einops -inflect -Unidecode -scipy -torchaudio -matplotlib -pandas -conformer==0.3.2 -diffusers==0.25.0 -notebook -ipywidgets -gradio==3.43.2 -gdown -wget -seaborn -piper_phonemize diff --git a/speech/third_party/Matcha-TTS/setup.py b/speech/third_party/Matcha-TTS/setup.py deleted file mode 100644 index 80d4aac04c6cd36859c5d753468ef2e105770098..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/setup.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -import os - -import numpy -from Cython.Build import cythonize -from setuptools import Extension, find_packages, setup - -exts = [ - Extension( - name="matcha.utils.monotonic_align.core", - sources=["matcha/utils/monotonic_align/core.pyx"], - ) -] - -with open("README.md", encoding="utf-8") as readme_file: - README = readme_file.read() - -cwd = os.path.dirname(os.path.abspath(__file__)) -with open(os.path.join(cwd, "matcha", "VERSION")) as fin: - version = fin.read().strip() - -setup( - name="matcha-tts", - version=version, - description="๐Ÿต Matcha-TTS: A fast TTS architecture with conditional flow matching", - long_description=README, - long_description_content_type="text/markdown", - author="Shivam Mehta", - author_email="shivam.mehta25@gmail.com", - url="https://shivammehta25.github.io/Matcha-TTS", - install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], - include_dirs=[numpy.get_include()], - include_package_data=True, - packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), - # use this to customize global commands available in the 
terminal after installing the package - entry_points={ - "console_scripts": [ - "matcha-data-stats=matcha.utils.generate_data_statistics:main", - "matcha-tts=matcha.cli:cli", - "matcha-tts-app=matcha.app:main", - ] - }, - ext_modules=cythonize(exts, language_level=3), - python_requires=">=3.9.0", -) diff --git a/speech/train.py b/speech/train.py index 9e54f077893f83df9f4f5c7e594440578a7c224b..de22f0eed91bf840fdd03345ce065f53483f5466 100644 --- a/speech/train.py +++ b/speech/train.py @@ -16,9 +16,6 @@ from __future__ import print_function import argparse import datetime -import logging - -logging.getLogger("matplotlib").setLevel(logging.WARNING) import os from copy import deepcopy @@ -29,6 +26,7 @@ from hyperpyyaml import load_hyperpyyaml from loguru import logger from torch.distributed.elastic.multiprocessing.errors import record +from comet_ml import Experiment from cosyvoice.utils.executor import Executor from cosyvoice.utils.losses import DPOLoss from cosyvoice.utils.train_utils import (check_modify_and_save_config, @@ -109,20 +107,61 @@ def get_args(): return args +def init_comet_experiment(args, configs): + """Initialize Comet ML experiment""" + rank = int(os.environ.get('RANK', 0)) + + # Only create experiment on rank 0 to avoid duplicates + if rank == 0 and not args.comet_disabled: + # Set up Comet ML experiment + experiment = Experiment( + api_key=args.comet_api_key, + project_name=args.comet_project, + workspace=args.comet_workspace, + experiment_name=args.comet_experiment_name, + disabled=args.comet_disabled, + offline=args.comet_offline, + auto_metric_logging=True, + auto_param_logging=True, + auto_histogram_weight_logging=True, + auto_histogram_gradient_logging=True, + auto_histogram_activation_logging=False, + ) + + # Log hyperparameters + experiment.log_parameters(configs["train_conf"]) + experiment.log_parameter("model_type", args.model) + experiment.log_parameter("train_data", args.train_data) + experiment.log_parameter("cv_data", args.cv_data) + 
experiment.log_parameter("use_amp", args.use_amp) + experiment.log_parameter("dpo", args.dpo) + experiment.log_parameter("num_workers", args.num_workers) + experiment.log_parameter("prefetch", args.prefetch) + + # Log model architecture if available + if args.model in configs: + model_config = configs[args.model].__dict__ if hasattr(configs[args.model], '__dict__') else {} + experiment.log_parameters(model_config, prefix=f"{args.model}/") + + # Add tags + experiment.add_tag(args.model) + if args.dpo: + experiment.add_tag("dpo") + if args.use_amp: + experiment.add_tag("amp") + + logger.info(f"Comet ML experiment initialized: {experiment.get_name()}") + return experiment + else: + return None + @record def main(): args = get_args() - logging.basicConfig( - level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s" - ) - # gan train has some special initialization logic - gan = True if args.model == "hifigan" else False override_dict = { k: None for k in ["llm", "flow", "hift", "hifigan"] if k != args.model } - if gan is True: - override_dict.pop("hift") try: with open(args.config, "r", encoding="utf-8") as f: configs = load_hyperpyyaml( @@ -136,23 +175,27 @@ def main(): logger.error(f"Error loading config: {e}") with open(args.config, "r", encoding="utf-8") as f: configs = load_hyperpyyaml(f, overrides=override_dict) - if gan is True: - configs["train_conf"] = configs["train_conf_gan"] + configs["train_conf"].update(vars(args)) - # Init env for ddp - init_distributed(args) + world_size = int(os.environ.get('WORLD_SIZE', 1)) + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + rank = int(os.environ.get('RANK', 0)) + logger.info(f'training on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}') + torch.cuda.set_device(local_rank) + dist.init_process_group(args.dist_backend) # Get dataset & dataloader train_dataset, _, train_data_loader, cv_data_loader = init_dataset_and_dataloader( - args, configs, gan, args.dpo + args, configs, 
args.dpo ) # Do some sanity checks and save config to arsg.model_dir configs = check_modify_and_save_config(args, configs) # Tensorboard summary - writer = init_summarywriter(args) + experiment = init_comet_experiment(args, configs) + # load checkpoint if args.dpo is True: @@ -168,6 +211,11 @@ def main(): start_step = state_dict["step"] if "epoch" in state_dict: start_epoch = state_dict["epoch"] + # Log checkpoint info to Comet + if experiment: + experiment.log_parameter("checkpoint", args.checkpoint) + experiment.log_parameter("start_step", start_step) + experiment.log_parameter("start_epoch", start_epoch) else: logger.warning(f"checkpoint {args.checkpoint} do not exsist!") @@ -178,12 +226,10 @@ def main(): ) # Get optimizer & scheduler - model, optimizer, scheduler, optimizer_d, scheduler_d = ( - init_optimizer_and_scheduler(args, configs, model, gan) + model, optimizer, scheduler = ( + init_optimizer_and_scheduler(configs, model) ) scheduler.set_step(start_step) - if scheduler_d is not None: - scheduler_d.set_step(start_step) # Save init checkpoints info_dict = deepcopy(configs["train_conf"]) @@ -191,6 +237,14 @@ def main(): info_dict["epoch"] = start_epoch save_model(model, "init", info_dict) + # Log model save to Comet + if experiment: + experiment.log_model( + name=f"{args.model}_init", + file_or_folder=os.path.join(args.model_dir, "init.pt"), + metadata=info_dict + ) + # DPO related if args.dpo is True: ref_model = deepcopy(configs[args.model]) @@ -201,11 +255,16 @@ def main(): ref_model = torch.nn.parallel.DistributedDataParallel( ref_model, find_unused_parameters=True ) + if experiment: + experiment.log_parameter("ref_model", args.ref_model) + experiment.log_parameter("dpo_beta", 0.01) + experiment.log_parameter("dpo_label_smoothing", 0.0) + experiment.log_parameter("dpo_ipo", False) else: ref_model, dpo_loss = None, None # Get executor - executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss) + executor = Executor(gan=False, 
ref_model=ref_model, dpo_loss=dpo_loss) executor.step = start_step # Init scaler, used for pytorch amp mixed precision training @@ -220,34 +279,22 @@ def main(): group_join = dist.new_group( backend="nccl", timeout=datetime.timedelta(seconds=args.timeout) ) - if gan is True: - executor.train_one_epoc_gan( - model, - optimizer, - scheduler, - optimizer_d, - scheduler_d, - train_data_loader, - cv_data_loader, - writer, - info_dict, - scaler, - group_join, - ) - else: - executor.train_one_epoc( - model, - optimizer, - scheduler, - train_data_loader, - cv_data_loader, - writer, - info_dict, - scaler, - group_join, - ) + + executor.train_one_epoc( + model, + optimizer, + scheduler, + train_data_loader, + cv_data_loader, + experiment, + info_dict, + scaler, + group_join, + model_type=args.model + ) dist.destroy_process_group(group_join) - + if experiment: + experiment.end() if __name__ == "__main__": main()