diff --git a/speech/cosyvoice/utils/executor.py b/speech/cosyvoice/utils/executor.py index c12a0c7b96cd0a4a0a5712578dc298e6b2ecceff..b4e9e01aba0118e9cdd1f5b0a71a450a10e30bc4 100644 --- a/speech/cosyvoice/utils/executor.py +++ b/speech/cosyvoice/utils/executor.py @@ -49,10 +49,11 @@ class Executor: scheduler, train_data_loader, cv_data_loader, - writer, + experiment, info_dict, scaler, group_join, + model_type ): """Train one epoch""" @@ -101,10 +102,10 @@ class Executor: info_dict = batch_backward(model, scaler, info_dict) info_dict = update_parameter_and_lr( - model, optimizer, scheduler, scaler, info_dict + model, optimizer, scheduler, scaler, info_dict, model_type=model_type ) - log_per_step(writer, info_dict) - # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save + log_per_step(experiment, info_dict) + if ( info_dict["save_per_step"] > 0 and (self.step + 1) % info_dict["save_per_step"] == 0 @@ -112,102 +113,16 @@ class Executor: ): dist.barrier() self.cv( - model, cv_data_loader, writer, info_dict, on_batch_end=False + model, cv_data_loader, experiment, info_dict, on_batch_end=False ) model.train() if (batch_idx + 1) % info_dict["accum_grad"] == 0: self.step += 1 dist.barrier() - self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) - - def train_one_epoc_gan( - self, - model, - optimizer, - scheduler, - optimizer_d, - scheduler_d, - train_data_loader, - cv_data_loader, - writer, - info_dict, - scaler, - group_join, - ): - """Train one epoch""" - - lr = optimizer.param_groups[0]["lr"] - logger.info( - f"Epoch {self.epoch} TRAIN info lr {lr} rank {self.rank}" - ) - logger.info( - f"using accumulate grad, new batch size is {info_dict['accum_grad']} times larger than before" - ) - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. 
- model.train() - model_context = ( - model.join if info_dict["train_engine"] == "torch_ddp" else nullcontext - ) - with model_context(): - for batch_idx, batch_dict in enumerate(train_data_loader): - info_dict["tag"] = "TRAIN" - info_dict["step"] = self.step - info_dict["epoch"] = self.epoch - info_dict["batch_idx"] = batch_idx - if cosyvoice_join(group_join, info_dict): - break - - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if ( - info_dict["train_engine"] == "torch_ddp" - and (batch_idx + 1) % info_dict["accum_grad"] != 0 - ): - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - - with context(): - batch_dict["turn"] = "discriminator" - info_dict = batch_forward(model, batch_dict, scaler, info_dict) - info_dict = batch_backward(model, scaler, info_dict) - info_dict = update_parameter_and_lr( - model, optimizer_d, scheduler_d, scaler, info_dict - ) - optimizer.zero_grad() - log_per_step(writer, info_dict) - with context(): - batch_dict["turn"] = "generator" - info_dict = batch_forward(model, batch_dict, scaler, info_dict) - info_dict = batch_backward(model, scaler, info_dict) - info_dict = update_parameter_and_lr( - model, optimizer, scheduler, scaler, info_dict - ) - optimizer_d.zero_grad() - log_per_step(writer, info_dict) - # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save - if ( - info_dict["save_per_step"] > 0 - and (self.step + 1) % info_dict["save_per_step"] == 0 - and (batch_idx + 1) % info_dict["accum_grad"] == 0 - ): - dist.barrier() - self.cv( - model, cv_data_loader, writer, info_dict, on_batch_end=False - ) - model.train() - if (batch_idx + 1) % info_dict["accum_grad"] == 0: - self.step += 1 - dist.barrier() - # self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) + #self.cv(model, 
cv_data_loader, writer, info_dict, on_batch_end=True) @torch.inference_mode() - def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True): + def cv(self, model, cv_data_loader, experiment, info_dict, on_batch_end=True): """Cross validation on""" logger.info(f"Epoch {self.epoch} Step {self.step + 1} on_batch_end {on_batch_end} CV rank {self.rank}") model.eval() @@ -233,7 +148,7 @@ class Executor: for k, v in total_loss_dict.items(): total_loss_dict[k] = sum(v) / total_num_utts info_dict["loss_dict"] = total_loss_dict - log_per_save(writer, info_dict) + log_per_save(experiment, info_dict) model_name = ( f"epoch_{self.epoch}_whole" if on_batch_end diff --git a/speech/cosyvoice/utils/train_utils.py b/speech/cosyvoice/utils/train_utils.py index 885d85109ba2d6f14848e06b91ca0c07b7abb605..4ba27e3a8ee25da173850fef7ffb978c8d6ba97f 100644 --- a/speech/cosyvoice/utils/train_utils.py +++ b/speech/cosyvoice/utils/train_utils.py @@ -26,15 +26,14 @@ import deepspeed import torch.optim as optim import torch.distributed as dist -from torch.utils.tensorboard import SummaryWriter from torch.utils.data import DataLoader from torch.nn.utils import clip_grad_norm_ from loguru import logger from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live from cosyvoice.dataset.dataset import Dataset -from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR +from torch.optim.lr_scheduler import LinearLR, ConstantLR, SequentialLR def init_distributed(args): world_size = int(os.environ.get('WORLD_SIZE', 1)) @@ -49,10 +48,10 @@ def init_distributed(args): return world_size, local_rank, rank -def init_dataset_and_dataloader(args, configs, gan, dpo): - data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline'] - train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=True, partition=True) - cv_dataset = Dataset(args.cv_data, 
data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=False, partition=False) +def init_dataset_and_dataloader(args, configs, dpo): + data_pipeline = configs['data_pipeline'] + train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=dpo, shuffle=True, partition=True) + cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=False, dpo=dpo, shuffle=False, partition=False) # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts train_data_loader = DataLoader(train_dataset, @@ -109,90 +108,38 @@ def wrap_cuda_model(args, model): return model -def init_optimizer_and_scheduler(args, configs, model, gan): +def init_optimizer_and_scheduler(configs, model): """Init optimizer and scheduler""" - if gan is False: - if configs['train_conf']['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf']) - elif configs['train_conf']['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['train_conf']) - - if configs['train_conf']['scheduler'] == 'warmuplr': - scheduler_type = WarmupLR - scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing': - scheduler_type = NoamHoldAnnealing - scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'constantlr': - scheduler_type = ConstantLR - scheduler = ConstantLR(optimizer) - else: - raise ValueError("unknown scheduler: " + configs['train_conf']) - - # use deepspeed optimizer for speedup - if args.train_engine == "deepspeed": - def scheduler(opt): - return scheduler_type(opt, **configs['train_conf']['scheduler_conf']) - model, optimizer, _, scheduler = deepspeed.initialize( - args=args, - 
model=model, - optimizer=None, - lr_scheduler=scheduler, - model_parameters=model.parameters()) - - optimizer_d, scheduler_d = None, None - + if configs['train_conf']['optim'] == 'adam': + optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf']) + elif configs['train_conf']['optim'] == 'adamw': + optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf']) else: - # currently we wrap generator and discriminator in one model, so we cannot use deepspeed - if configs['train_conf']['optim'] == 'adam': - optimizer = optim.Adam(model.module.generator.parameters(), **configs['train_conf']['optim_conf']) - elif configs['train_conf']['optim'] == 'adamw': - optimizer = optim.AdamW(model.module.generator.parameters(), **configs['train_conf']['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['train_conf']) - - if configs['train_conf']['scheduler'] == 'warmuplr': - scheduler_type = WarmupLR - scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing': - scheduler_type = NoamHoldAnnealing - scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'constantlr': - scheduler_type = ConstantLR - scheduler = ConstantLR(optimizer) - else: - raise ValueError("unknown scheduler: " + configs['train_conf']) + raise ValueError("unknown optimizer: " + configs['train_conf']) + + # Create schedulers + warmup_scheduler = LinearLR( + optimizer, + start_factor=1e-9, # Start at nearly 0 + end_factor=1.0, # End at base learning rate + total_iters=5000 # 5k warmup steps + ) + + constant_scheduler = ConstantLR( + optimizer, + factor=1.0, # Keep learning rate constant + total_iters=float('inf') # Run indefinitely + ) + + # Combine schedulers: warmup for 5k steps, then constant + scheduler = SequentialLR( + optimizer, + schedulers=[warmup_scheduler, constant_scheduler], + 
milestones=[5000] # Switch after 5k steps + ) + + return model, optimizer, scheduler - if configs['train_conf']['optim_d'] == 'adam': - optimizer_d = optim.Adam(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf']) - elif configs['train_conf']['optim_d'] == 'adamw': - optimizer_d = optim.AdamW(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['train_conf']) - - if configs['train_conf']['scheduler_d'] == 'warmuplr': - scheduler_type = WarmupLR - scheduler_d = WarmupLR(optimizer_d, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler_d'] == 'NoamHoldAnnealing': - scheduler_type = NoamHoldAnnealing - scheduler_d = NoamHoldAnnealing(optimizer_d, **configs['train_conf']['scheduler_conf']) - elif configs['train_conf']['scheduler'] == 'constantlr': - scheduler_type = ConstantLR - scheduler_d = ConstantLR(optimizer_d) - else: - raise ValueError("unknown scheduler: " + configs['train_conf']) - return model, optimizer, scheduler, optimizer_d, scheduler_d - - -def init_summarywriter(args): - """Init summary writer""" - writer = None - if int(os.environ.get('RANK', 0)) == 0: - os.makedirs(args.model_dir, exist_ok=True) - writer = SummaryWriter(args.tensorboard_dir) - return writer def save_model(model, model_name, info_dict): @@ -295,21 +242,87 @@ def batch_backward(model, scaler, info_dict): return info_dict -def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict): +def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict, model_type='llm'): """Update parameters and learning rate""" + + #Define key components based on model type + if model_type == 'llm': + key_components = { + # Text processing components + 'text_embedding': [], + 'text_encoder': [], + 'text_encoder_affine': [], + + # LLM core components + 'llm_embedding': [], + 'llm.model': [], # Qwen2 model layers + 'llm_decoder': [], + + # 
Speech components + 'speech_embedding': [], + 'spk_embed_affine': [], + + # Other components + 'other': [] + } + elif model_type == 'flow': + key_components = { + # Input processing + 'input_embedding': [], + 'spk_embed_affine': [], + + # Encoder components + 'encoder': [], + 'encoder_proj': [], + + # Flow/Diffusion components + 'decoder.cfm': [], # Conditional Flow Matching + 'decoder.unet': [], # UNet backbone + 'decoder.estimator': [], # Score/velocity estimator + 'decoder.time_embedding': [], # Time embeddings + 'decoder.conv': [], # Convolutional layers + 'decoder.attention': [], # Attention layers + + # Length regulation + 'length_regulator': [], + + # Other components + 'other': [] + } + grad_norm = 0.0 - if info_dict['train_engine'] == "deepspeed": - info_dict["is_gradient_accumulation_boundary"] = model.is_gradient_accumulation_boundary() - model.step() - grad_norm = model.get_global_grad_norm() - elif (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0: + layer_grad_norms = {} + + if (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0: + + for name, param in model.named_parameters(): + if param.grad is not None: + # Calculate gradient norm for this parameter + param_grad_norm = param.grad.data.norm(2).item() + layer_grad_norms[name] = param_grad_norm + + # Categorize into key components + categorized = False + for component_key in key_components: + if component_key != 'other': + # Special handling for decoder sub-components in flow models + if model_type == 'flow' and component_key.startswith('decoder.'): + component_pattern = component_key.replace('decoder.', '') + if 'decoder' in name and component_pattern in name: + key_components[component_key].append((name, param_grad_norm)) + categorized = True + break + elif component_key in name: + key_components[component_key].append((name, param_grad_norm)) + categorized = True + break + if not categorized: + key_components['other'].append((name, param_grad_norm)) + # Use mixed precision training 
if scaler is not None: scaler.unscale_(optimizer) grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip']) - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). if torch.isfinite(grad_norm): scaler.step(optimizer) else: @@ -325,11 +338,12 @@ def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict): scheduler.step() info_dict["lr"] = optimizer.param_groups[0]['lr'] info_dict["grad_norm"] = grad_norm + info_dict["layer_grad_norms"] = layer_grad_norms + info_dict["key_component_grads"] = key_components return info_dict - -def log_per_step(writer, info_dict): - """Log per step""" +def log_per_step(experiment, info_dict): + """Log per step using Comet ML""" tag = info_dict["tag"] epoch = info_dict.get('epoch', 0) step = info_dict["step"] @@ -337,39 +351,61 @@ def log_per_step(writer, info_dict): loss_dict = info_dict['loss_dict'] rank = int(os.environ.get('RANK', 0)) - # only rank 0 write to tensorboard to avoid multi-process write - if writer is not None: + # Only rank 0 writes to Comet ML to avoid multi-process write + if experiment is not None and rank == 0: if (info_dict['train_engine'] == 'deepspeed' and info_dict['is_gradient_accumulation_boundary'] is True) or \ (info_dict['train_engine'] == 'torch_ddp' and (info_dict['batch_idx'] + 1) % info_dict['accum_grad'] == 0): - for k in ['epoch', 'lr', 'grad_norm']: - writer.add_scalar(f'{tag}/{k}', info_dict[k], step + 1) + # Log metrics to Comet ML + experiment.log_metric(f'{tag}_epoch', info_dict['epoch'], step=step + 1) + experiment.log_metric(f'{tag}_lr', info_dict['lr'], step=step + 1) + experiment.log_metric(f'{tag}_grad_norm', info_dict['grad_norm'], step=step + 1) + + # Log all losses for k, v in loss_dict.items(): - writer.add_scalar(f'{tag}/{k}', v, step + 1) + if isinstance(v, torch.Tensor): + v = v.item() + experiment.log_metric(f'{tag}_{k}', v, step=step + 1) # TRAIN & CV, Shell log (stdout) if 
(info_dict['batch_idx'] + 1) % info_dict['log_interval'] == 0: log_str = f'{tag} Batch {epoch}/{batch_idx + 1} ' for name, value in loss_dict.items(): + if isinstance(value, torch.Tensor): + value = value.item() log_str += f'{name} {value:.6f} ' if tag == "TRAIN": log_str += f'lr {info_dict["lr"]:.8f} grad_norm {info_dict["grad_norm"]:.6f}' log_str += f' rank {rank}' logging.debug(log_str) - -def log_per_save(writer, info_dict): - """Log per save""" +def log_per_save(experiment, info_dict): + """Log per save using Comet ML""" tag = info_dict["tag"] epoch = info_dict["epoch"] step = info_dict["step"] loss_dict = info_dict["loss_dict"] lr = info_dict['lr'] rank = int(os.environ.get('RANK', 0)) - logger.info( - f'Epoch {epoch} Step {step + 1} CV info lr {lr} {rank} {''.join([f"{k} {v}" for k, v in loss_dict.items()])}') - - if writer is not None: - for k in ['epoch', 'lr']: - writer.add_scalar(f'{tag}/{k}', info_dict[k], step + 1) + + # Create loss string for logging + loss_str = ' '.join([f"{k} {v.item() if isinstance(v, torch.Tensor) else v}" for k, v in loss_dict.items()]) + logger.info(f'Epoch {epoch} Step {step + 1} CV info lr {lr} {rank} {loss_str}') + + if experiment is not None and rank == 0: + # Log metrics to Comet ML + experiment.log_metric(f'{tag}_epoch', info_dict['epoch'], step=step + 1) + experiment.log_metric(f'{tag}_lr', info_dict['lr'], step=step + 1) + + # Log all losses for k, v in loss_dict.items(): - writer.add_scalar(f'{tag}/{k}', v, step + 1) + if isinstance(v, torch.Tensor): + v = v.item() + experiment.log_metric(f'{tag}_{k}', v, step=step + 1) + + # Log additional validation info + if tag == "CV": + # Calculate average CV loss for the epoch + avg_loss = loss_dict.get('loss', 0) + if isinstance(avg_loss, torch.Tensor): + avg_loss = avg_loss.item() + experiment.log_metric('cv_avg_loss_per_epoch', avg_loss, epoch=epoch) diff --git a/speech/cosyvoice2.yaml b/speech/cosyvoice2.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..551e42edc0258d9de2ee20a680e58ffe101af168 --- /dev/null +++ b/speech/cosyvoice2.yaml @@ -0,0 +1,217 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 24000 +llm_input_size: 896 +llm_output_size: 896 +spk_embed_dim: 192 +qwen_pretrain_path: '' +token_frame_rate: 25 +token_mel_ratio: 2 + +# stream related params +chunk_size: 25 # streaming inference chunk size, in token +num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks + +# model params +# for all class/function included in this repo, we use !<new> or !<name> for initialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this.
+llm: !new:cosyvoice.llm.llm.Qwen2LM + llm_input_size: !ref <llm_input_size> + llm_output_size: !ref <llm_output_size> + speech_token_size: 6561 + length_normalized_loss: True + lsm_weight: 0 + mix_ratio: [5, 15] + llm: !new:cosyvoice.llm.llm.Qwen2Encoder + pretrain_path: !ref <qwen_pretrain_path> + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 + +flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref <spk_embed_dim> + output_type: 'mel' + vocab_size: 6561 + input_frame_rate: !ref <token_frame_rate> + only_mask_loss: True + token_mel_ratio: !ref <token_mel_ratio> + pre_lookahead_len: 3 + encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + input_size: 512 + use_cnn_module: False + macaron_style: False + static_chunk_size: !ref <chunk_size> + decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: !new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder + in_channels: 320 + out_channels: 80 + channels: [256] + dropout: 0.0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + static_chunk_size: !ref <chunk_size> * <token_mel_ratio> + num_decoding_left_chunks: !ref <num_decoding_left_chunks> + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref <sample_rate> + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 5, 3] + upsample_kernel_sizes: [16, 11, 7] + istft_params: + n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + 
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 + +# gan related module +mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1920 + num_mels: 80 + sampling_rate: !ref <sample_rate> + hop_size: 480 + win_size: 1920 + fmin: 0 + fmax: null + center: False +hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan + generator: !ref <hift> + discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator + mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator + mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator + mel_spec_transform: [ + !ref <mel_spec_transform1> + ] + +# processor functions +parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener +get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer + token_path: !ref <qwen_pretrain_path> + skip_special_tokens: True +allowed_special: 'all' +tokenize: !name:cosyvoice.dataset.processor.tokenize + get_tokenizer: !ref <get_tokenizer> + allowed_special: !ref <allowed_special> +filter: !name:cosyvoice.dataset.processor.filter + max_length: 40960 + min_length: 100 + token_max_length: 200 + token_min_length: 1 +resample: !name:cosyvoice.dataset.processor.resample + resample_rate: !ref <sample_rate> +truncate: !name:cosyvoice.dataset.processor.truncate + truncate_length: 24480 # must be a multiple of hop_size +feat_extractor: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1920 + num_mels: 80 + sampling_rate: !ref <sample_rate> + hop_size: 480 + win_size: 1920 + fmin: 0 + fmax: 8000 + center: False +compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank + feat_extractor: !ref <feat_extractor> +compute_f0: !name:cosyvoice.dataset.processor.compute_f0 + sample_rate: !ref <sample_rate> + hop_size: 480 +parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding + normalize: True +shuffle: 
!name:cosyvoice.dataset.processor.shuffle + shuffle_size: 1000 +sort: !name:cosyvoice.dataset.processor.sort + sort_size: 500 # sort_size should be less than shuffle_size +batch: !name:cosyvoice.dataset.processor.batch + batch_type: 'dynamic' + max_frames_in_batch: 2000 +padding: !name:cosyvoice.dataset.processor.padding + use_spk_embedding: False # change to True during sft + + +# dataset processor pipeline +data_pipeline: [ + !ref <parquet_opener>, + !ref <tokenize>, + !ref <filter>, + !ref <resample>, + !ref <compute_fbank>, + !ref <parse_embedding>, + !ref <shuffle>, + !ref <sort>, + !ref <batch>, + !ref <padding>, +] +data_pipeline_gan: [ + !ref <parquet_opener>, + !ref <tokenize>, + !ref <filter>, + !ref <resample>, + !ref <truncate>, + !ref <compute_fbank>, + !ref <compute_f0>, + !ref <parse_embedding>, + !ref <shuffle>, + !ref <sort>, + !ref <batch>, + !ref <padding>, +] + +# llm flow train conf +train_conf: + optim: adamw + optim_conf: + lr: 1e-5 # change to 1e-5 during sft + scheduler: constantlr # change to constantlr during sft + scheduler_conf: + warmup_steps: 2500 + max_epoch: 200 + grad_clip: 1 + accum_grad: 1 + log_interval: 100 + save_per_step: -1 \ No newline at end of file diff --git a/speech/third_party/Matcha-TTS/matcha/__init__.py b/speech/matcha/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/__init__.py rename to speech/matcha/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/app.py b/speech/matcha/app.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/app.py rename to speech/matcha/app.py diff --git a/speech/third_party/Matcha-TTS/matcha/cli.py b/speech/matcha/cli.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/cli.py rename to speech/matcha/cli.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/LICENSE b/speech/matcha/hifigan/LICENSE similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/LICENSE rename to speech/matcha/hifigan/LICENSE diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/README.md b/speech/matcha/hifigan/README.md similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/README.md rename to
speech/matcha/hifigan/README.md diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/__init__.py b/speech/matcha/hifigan/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/__init__.py rename to speech/matcha/hifigan/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/config.py b/speech/matcha/hifigan/config.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/config.py rename to speech/matcha/hifigan/config.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/denoiser.py b/speech/matcha/hifigan/denoiser.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/denoiser.py rename to speech/matcha/hifigan/denoiser.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/env.py b/speech/matcha/hifigan/env.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/env.py rename to speech/matcha/hifigan/env.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/meldataset.py b/speech/matcha/hifigan/meldataset.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/meldataset.py rename to speech/matcha/hifigan/meldataset.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/models.py b/speech/matcha/hifigan/models.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/models.py rename to speech/matcha/hifigan/models.py diff --git a/speech/third_party/Matcha-TTS/matcha/hifigan/xutils.py b/speech/matcha/hifigan/xutils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/hifigan/xutils.py rename to speech/matcha/hifigan/xutils.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/__init__.py b/speech/matcha/models/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/__init__.py rename to speech/matcha/models/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/baselightningmodule.py 
b/speech/matcha/models/baselightningmodule.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/baselightningmodule.py rename to speech/matcha/models/baselightningmodule.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/__init__.py b/speech/matcha/models/components/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/__init__.py rename to speech/matcha/models/components/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/decoder.py b/speech/matcha/models/components/decoder.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/decoder.py rename to speech/matcha/models/components/decoder.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/flow_matching.py b/speech/matcha/models/components/flow_matching.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/flow_matching.py rename to speech/matcha/models/components/flow_matching.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/text_encoder.py b/speech/matcha/models/components/text_encoder.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/text_encoder.py rename to speech/matcha/models/components/text_encoder.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/components/transformer.py b/speech/matcha/models/components/transformer.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/components/transformer.py rename to speech/matcha/models/components/transformer.py diff --git a/speech/third_party/Matcha-TTS/matcha/models/matcha_tts.py b/speech/matcha/models/matcha_tts.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/models/matcha_tts.py rename to speech/matcha/models/matcha_tts.py diff --git a/speech/third_party/Matcha-TTS/matcha/onnx/__init__.py b/speech/matcha/onnx/__init__.py 
similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/onnx/__init__.py rename to speech/matcha/onnx/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/onnx/export.py b/speech/matcha/onnx/export.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/onnx/export.py rename to speech/matcha/onnx/export.py diff --git a/speech/third_party/Matcha-TTS/matcha/onnx/infer.py b/speech/matcha/onnx/infer.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/onnx/infer.py rename to speech/matcha/onnx/infer.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/__init__.py b/speech/matcha/text/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/__init__.py rename to speech/matcha/text/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/cleaners.py b/speech/matcha/text/cleaners.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/cleaners.py rename to speech/matcha/text/cleaners.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/numbers.py b/speech/matcha/text/numbers.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/numbers.py rename to speech/matcha/text/numbers.py diff --git a/speech/third_party/Matcha-TTS/matcha/text/symbols.py b/speech/matcha/text/symbols.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/text/symbols.py rename to speech/matcha/text/symbols.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/__init__.py b/speech/matcha/utils/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/__init__.py rename to speech/matcha/utils/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/audio.py b/speech/matcha/utils/audio.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/audio.py rename to speech/matcha/utils/audio.py diff --git 
a/speech/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py b/speech/matcha/utils/generate_data_statistics.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py rename to speech/matcha/utils/generate_data_statistics.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/instantiators.py b/speech/matcha/utils/instantiators.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/instantiators.py rename to speech/matcha/utils/instantiators.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/logging_utils.py b/speech/matcha/utils/logging_utils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/logging_utils.py rename to speech/matcha/utils/logging_utils.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/model.py b/speech/matcha/utils/model.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/model.py rename to speech/matcha/utils/model.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py b/speech/matcha/utils/monotonic_align/__init__.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py rename to speech/matcha/utils/monotonic_align/__init__.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx b/speech/matcha/utils/monotonic_align/core.pyx similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx rename to speech/matcha/utils/monotonic_align/core.pyx diff --git a/speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py b/speech/matcha/utils/monotonic_align/setup.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py rename to speech/matcha/utils/monotonic_align/setup.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/pylogger.py b/speech/matcha/utils/pylogger.py similarity index 
100% rename from speech/third_party/Matcha-TTS/matcha/utils/pylogger.py rename to speech/matcha/utils/pylogger.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/rich_utils.py b/speech/matcha/utils/rich_utils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/rich_utils.py rename to speech/matcha/utils/rich_utils.py diff --git a/speech/third_party/Matcha-TTS/matcha/utils/utils.py b/speech/matcha/utils/utils.py similarity index 100% rename from speech/third_party/Matcha-TTS/matcha/utils/utils.py rename to speech/matcha/utils/utils.py diff --git a/speech/third_party/Matcha-TTS/.env.example b/speech/third_party/Matcha-TTS/.env.example deleted file mode 100644 index a790e320464ebc778ca07f5bcd826a9c8412ed0e..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.env.example +++ /dev/null @@ -1,6 +0,0 @@ -# example of file for storing private and user specific environment variables, like keys or system paths -# rename it to ".env" (excluded from version control by default) -# .env is loaded by train.py automatically -# hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR} - -MY_VAR="/home/user/my/system/path" diff --git a/speech/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md b/speech/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 410bcd87a45297ab8f0d369574a032858b6b1811..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,22 +0,0 @@ -## What does this PR do? - - - -Fixes #\ - -## Before submitting - -- [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**? -- [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together? -- [ ] Did you list all the **breaking changes** introduced by this pull request? 
-- [ ] Did you **test your PR locally** with `pytest` command? -- [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? - -## Did you have fun? - -Make sure you had fun coding ๐Ÿ™ƒ diff --git a/speech/third_party/Matcha-TTS/.github/codecov.yml b/speech/third_party/Matcha-TTS/.github/codecov.yml deleted file mode 100644 index c66853c4bd9991f730da5dda7dc9881986779558..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/codecov.yml +++ /dev/null @@ -1,15 +0,0 @@ -coverage: - status: - # measures overall project coverage - project: - default: - threshold: 100% # how much decrease in coverage is needed to not consider success - - # measures PR or single commit coverage - patch: - default: - threshold: 100% # how much decrease in coverage is needed to not consider success - - - # project: off - # patch: off diff --git a/speech/third_party/Matcha-TTS/.github/dependabot.yml b/speech/third_party/Matcha-TTS/.github/dependabot.yml deleted file mode 100644 index b19ccab12a3c573025ce6ba6d9068b062b29cc1b..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/dependabot.yml +++ /dev/null @@ -1,17 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. 
-# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests - target-branch: "dev" - schedule: - interval: "daily" - ignore: - - dependency-name: "pytorch-lightning" - update-types: ["version-update:semver-patch"] - - dependency-name: "torchmetrics" - update-types: ["version-update:semver-patch"] diff --git a/speech/third_party/Matcha-TTS/.github/release-drafter.yml b/speech/third_party/Matcha-TTS/.github/release-drafter.yml deleted file mode 100644 index 59af159f671abe75311eb626c8ec92ca6ea09d3c..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.github/release-drafter.yml +++ /dev/null @@ -1,44 +0,0 @@ -name-template: "v$RESOLVED_VERSION" -tag-template: "v$RESOLVED_VERSION" - -categories: - - title: "๐Ÿš€ Features" - labels: - - "feature" - - "enhancement" - - title: "๐Ÿ› Bug Fixes" - labels: - - "fix" - - "bugfix" - - "bug" - - title: "๐Ÿงน Maintenance" - labels: - - "maintenance" - - "dependencies" - - "refactoring" - - "cosmetic" - - "chore" - - title: "๐Ÿ“๏ธ Documentation" - labels: - - "documentation" - - "docs" - -change-template: "- $TITLE @$AUTHOR (#$NUMBER)" -change-title-escapes: '\<*_&' # You can add # and @ to disable mentions - -version-resolver: - major: - labels: - - "major" - minor: - labels: - - "minor" - patch: - labels: - - "patch" - default: patch - -template: | - ## Changes - - $CHANGES diff --git a/speech/third_party/Matcha-TTS/.gitignore b/speech/third_party/Matcha-TTS/.gitignore deleted file mode 100644 index cbec8b43a0414bbbf4cc9feae49b9dc091a60c92..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.gitignore +++ /dev/null @@ -1,163 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions 
-*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -### VisualStudioCode -.vscode/* -!.vscode/settings.json -!.vscode/tasks.json -!.vscode/launch.json -!.vscode/extensions.json -*.code-workspace -**/.vscode - -# JetBrains -.idea/ - -# Data & Models -*.h5 -*.tar -*.tar.gz - -# Lightning-Hydra-Template -configs/local/default.yaml -/data/ -/logs/ -.env - -# Aim logging -.aim - -# Cython complied files -matcha/utils/monotonic_align/core.c - -# Ignoring hifigan checkpoint -generator_v1 -g_02500000 -gradio_cached_examples/ -synth_output/ diff --git a/speech/third_party/Matcha-TTS/.pre-commit-config.yaml b/speech/third_party/Matcha-TTS/.pre-commit-config.yaml deleted file mode 100644 index e695f115eba12d84fe6f465c5d834dfa35c3d2ec..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.pre-commit-config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -default_language_version: - python: python3.10 - -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - # list of supported hooks: https://pre-commit.com/hooks.html - - id: trailing-whitespace - - id: end-of-file-fixer - # - id: check-docstring-first - - id: check-yaml - - id: debug-statements - - id: detect-private-key - - id: check-toml - - id: check-case-conflict - - id: check-added-large-files - - # python code formatting - - repo: https://github.com/psf/black - rev: 23.12.1 - hooks: - - id: black - args: [--line-length, "120"] - - # python import sorting - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - args: ["--profile", "black", "--filter-files"] - - # python upgrading syntax to 
newer version - - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 - hooks: - - id: pyupgrade - args: [--py38-plus] - - # python check (PEP8), programming errors and code complexity - - repo: https://github.com/PyCQA/flake8 - rev: 7.0.0 - hooks: - - id: flake8 - args: - [ - "--max-line-length", "120", - "--extend-ignore", - "E203,E402,E501,F401,F841,RST2,RST301", - "--exclude", - "logs/*,data/*,matcha/hifigan/*", - ] - additional_dependencies: [flake8-rst-docstrings==0.3.0] - - # pylint - - repo: https://github.com/pycqa/pylint - rev: v3.0.3 - hooks: - - id: pylint diff --git a/speech/third_party/Matcha-TTS/.project-root b/speech/third_party/Matcha-TTS/.project-root deleted file mode 100644 index 63eab774b9e36aa1a46cbd31b59cbd373bc5477f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.project-root +++ /dev/null @@ -1,2 +0,0 @@ -# this file is required for inferring the project root directory -# do not delete diff --git a/speech/third_party/Matcha-TTS/.pylintrc b/speech/third_party/Matcha-TTS/.pylintrc deleted file mode 100644 index 962864189eab99a66b315b80f5a9976e7a423d4a..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/.pylintrc +++ /dev/null @@ -1,525 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. 
-jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". 
-disable=missing-docstring, - too-many-public-methods, - too-many-lines, - bare-except, - ## for avoiding weird p3.6 CI linter error - ## TODO: see later if we can remove this - assigning-non-slot, - unsupported-assignment-operation, - ## end - line-too-long, - fixme, - wrong-import-order, - ungrouped-imports, - wrong-import-position, - import-error, - invalid-name, - too-many-instance-attributes, - arguments-differ, - arguments-renamed, - no-name-in-module, - no-member, - unsubscriptable-object, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - useless-object-inheritance, - too-few-public-methods, - too-many-branches, - too-many-arguments, - too-many-locals, - too-many-statements, - duplicate-code, - not-callable, - import-outside-toplevel, - logging-fstring-interpolation, - logging-not-lazy, - unused-argument, - no-else-return, - chained-comparison, - redefined-outer-name - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -#msg-template= - -# Set the output format. 
Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit - - -[LOGGING] - -# Format style used to check logging format string. `old` means using % -# formatting, while `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package.. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. 
-contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=numpy.*,torch.* - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. 
-missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=120 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. 
-single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=4 - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. -argument-rgx=[a-z_][a-z0-9_]{0,30}$ - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names= - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. -#class-attribute-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. 
-good-names=i, - j, - k, - x, - ex, - Run, - _ - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. -#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. -variable-rgx=[a-z_][a-z0-9_]{0,30}$ - - -[STRING] - -# This flag controls whether the implicit-str-concat-in-sequence should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=no - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. 
This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement. -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). 
-max-parents=15 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". -overgeneral-exceptions=builtins.BaseException, - builtins.Exception diff --git a/speech/third_party/Matcha-TTS/LICENSE b/speech/third_party/Matcha-TTS/LICENSE deleted file mode 100644 index 858018e750da7be7b271bb7307e68d159ed67ef6..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Shivam Mehta - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/speech/third_party/Matcha-TTS/MANIFEST.in b/speech/third_party/Matcha-TTS/MANIFEST.in deleted file mode 100644 index c013140cdfb9de19c4d4e73c73a44e33f33fa871..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/MANIFEST.in +++ /dev/null @@ -1,14 +0,0 @@ -include README.md -include LICENSE.txt -include requirements.*.txt -include *.cff -include requirements.txt -include matcha/VERSION -recursive-include matcha *.json -recursive-include matcha *.html -recursive-include matcha *.png -recursive-include matcha *.md -recursive-include matcha *.py -recursive-include matcha *.pyx -recursive-exclude tests * -prune tests* diff --git a/speech/third_party/Matcha-TTS/Makefile b/speech/third_party/Matcha-TTS/Makefile deleted file mode 100644 index 4b523dd17b13a19617c9cc9d9dad7f7d8d4c24a0..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/Makefile +++ /dev/null @@ -1,42 +0,0 @@ - -help: ## Show help - @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' - -clean: ## Clean autogenerated files - rm -rf dist - find . -type f -name "*.DS_Store" -ls -delete - find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf - find . | grep -E ".pytest_cache" | xargs rm -rf - find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf - rm -f .coverage - -clean-logs: ## Clean logs - rm -rf logs/** - -create-package: ## Create wheel and tar gz - rm -rf dist/ - python setup.py bdist_wheel --plat-name=manylinux1_x86_64 - python setup.py sdist - python -m twine upload dist/* --verbose --skip-existing - -format: ## Run pre-commit hooks - pre-commit run -a - -sync: ## Merge changes from main branch to your current branch - git pull - git pull origin main - -test: ## Run not slow tests - pytest -k "not slow" - -test-full: ## Run all tests - pytest - -train-ljspeech: ## Train the model - python matcha/train.py experiment=ljspeech - -train-ljspeech-min: ## Train the model with minimum memory - python matcha/train.py experiment=ljspeech_min_memory - -start_app: ## Start the app - python matcha/app.py diff --git a/speech/third_party/Matcha-TTS/README.md b/speech/third_party/Matcha-TTS/README.md deleted file mode 100644 index ebc6b7c0a76d30c33bf95583d629825c02183e31..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/README.md +++ /dev/null @@ -1,278 +0,0 @@ -
- -# ๐Ÿต Matcha-TTS: A fast TTS architecture with conditional flow matching - -### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [ร‰va Szรฉkely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/) - -[![python](https://img.shields.io/badge/-Python_3.10-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3100/) -[![pytorch](https://img.shields.io/badge/PyTorch_2.0+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally/) -[![lightning](https://img.shields.io/badge/-Lightning_2.0+-792ee5?logo=pytorchlightning&logoColor=white)](https://pytorchlightning.ai/) -[![hydra](https://img.shields.io/badge/Config-Hydra_1.3-89b8cd)](https://hydra.cc/) -[![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray)](https://black.readthedocs.io/en/stable/) -[![isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) - -

- -

- -
- -> This is the official code implementation of ๐Ÿต Matcha-TTS [ICASSP 2024]. - -We propose ๐Ÿต Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses [conditional flow matching](https://arxiv.org/abs/2210.02747) (similar to [rectified flows](https://arxiv.org/abs/2209.03003)) to speed up ODE-based speech synthesis. Our method: - -- Is probabilistic -- Has compact memory footprint -- Sounds highly natural -- Is very fast to synthesise from - -Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS) and read [our ICASSP 2024 paper](https://arxiv.org/abs/2309.03199) for more details. - -[Pre-trained models](https://drive.google.com/drive/folders/17C_gYgEHOxI5ZypcfE_k1piKCtyR0isJ?usp=sharing) will be automatically downloaded with the CLI or gradio interface. - -You can also [try ๐Ÿต Matcha-TTS in your browser on HuggingFace ๐Ÿค— spaces](https://huggingface.co/spaces/shivammehta25/Matcha-TTS). - -## Teaser video - -[![Watch the video](https://img.youtube.com/vi/xmvJkz3bqw0/hqdefault.jpg)](https://youtu.be/xmvJkz3bqw0) - -## Installation - -1. Create an environment (suggested but optional) - -``` -conda create -n matcha-tts python=3.10 -y -conda activate matcha-tts -``` - -2. Install Matcha TTS using pip or from source - -```bash -pip install matcha-tts -``` - -from source - -```bash -pip install git+https://github.com/shivammehta25/Matcha-TTS.git -cd Matcha-TTS -pip install -e . -``` - -3. 
Run CLI / gradio app / jupyter notebook - -```bash -# This will download the required models -matcha-tts --text "" -``` - -or - -```bash -matcha-tts-app -``` - -or open `synthesis.ipynb` on jupyter notebook - -### CLI Arguments - -- To synthesise from given text, run: - -```bash -matcha-tts --text "" -``` - -- To synthesise from a file, run: - -```bash -matcha-tts --file -``` - -- To batch synthesise from a file, run: - -```bash -matcha-tts --file --batched -``` - -Additional arguments - -- Speaking rate - -```bash -matcha-tts --text "" --speaking_rate 1.0 -``` - -- Sampling temperature - -```bash -matcha-tts --text "" --temperature 0.667 -``` - -- Euler ODE solver steps - -```bash -matcha-tts --text "" --steps 10 -``` - -## Train with your own dataset - -Let's assume we are training with LJ Speech - -1. Download the dataset from [here](https://keithito.com/LJ-Speech-Dataset/), extract it to `data/LJSpeech-1.1`, and prepare the file lists to point to the extracted data like for [item 5 in the setup of the NVIDIA Tacotron 2 repo](https://github.com/NVIDIA/tacotron2#setup). - -2. Clone and enter the Matcha-TTS repository - -```bash -git clone https://github.com/shivammehta25/Matcha-TTS.git -cd Matcha-TTS -``` - -3. Install the package from source - -```bash -pip install -e . -``` - -4. Go to `configs/data/ljspeech.yaml` and change - -```yaml -train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt -valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt -``` - -5. Generate normalisation statistics with the yaml file of dataset configuration - -```bash -matcha-data-stats -i ljspeech.yaml -# Output: -#{'mel_mean': -5.53662231756592, 'mel_std': 2.1161014277038574} -``` - -Update these values in `configs/data/ljspeech.yaml` under `data_statistics` key. - -```bash -data_statistics: # Computed for ljspeech dataset - mel_mean: -5.536622 - mel_std: 2.116101 -``` - -to the paths of your train and validation filelists. - -6. 
Run the training script - -```bash -make train-ljspeech -``` - -or - -```bash -python matcha/train.py experiment=ljspeech -``` - -- for a minimum memory run - -```bash -python matcha/train.py experiment=ljspeech_min_memory -``` - -- for multi-gpu training, run - -```bash -python matcha/train.py experiment=ljspeech trainer.devices=[0,1] -``` - -7. Synthesise from the custom trained model - -```bash -matcha-tts --text "" --checkpoint_path -``` - -## ONNX support - -> Special thanks to [@mush42](https://github.com/mush42) for implementing ONNX export and inference support. - -It is possible to export Matcha checkpoints to [ONNX](https://onnx.ai/), and run inference on the exported ONNX graph. - -### ONNX export - -To export a checkpoint to ONNX, first install ONNX with - -```bash -pip install onnx -``` - -then run the following: - -```bash -python3 -m matcha.onnx.export matcha.ckpt model.onnx --n-timesteps 5 -``` - -Optionally, the ONNX exporter accepts **vocoder-name** and **vocoder-checkpoint** arguments. This enables you to embed the vocoder in the exported graph and generate waveforms in a single run (similar to end-to-end TTS systems). - -**Note** that `n_timesteps` is treated as a hyper-parameter rather than a model input. This means you should specify it during export (not during inference). If not specified, `n_timesteps` is set to **5**. - -**Important**: for now, torch>=2.1.0 is needed for export since the `scaled_product_attention` operator is not exportable in older versions. Until the final version is released, those who want to export their models must install torch>=2.1.0 manually as a pre-release. 
- -### ONNX Inference - -To run inference on the exported model, first install `onnxruntime` using - -```bash -pip install onnxruntime -pip install onnxruntime-gpu # for GPU inference -``` - -then use the following: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs -``` - -You can also control synthesis parameters: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --temperature 0.4 --speaking_rate 0.9 --spk 0 -``` - -To run inference on **GPU**, make sure to install **onnxruntime-gpu** package, and then pass `--gpu` to the inference command: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --gpu -``` - -If you exported only Matcha to ONNX, this will write mel-spectrogram as graphs and `numpy` arrays to the output directory. -If you embedded the vocoder in the exported graph, this will write `.wav` audio files to the output directory. - -If you exported only Matcha to ONNX, and you want to run a full TTS pipeline, you can pass a path to a vocoder model in `ONNX` format: - -```bash -python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --vocoder hifigan.small.onnx -``` - -This will write `.wav` audio files to the output directory. - -## Citation information - -If you use our code or otherwise find this work useful, please cite our paper: - -```text -@inproceedings{mehta2024matcha, - title={Matcha-{TTS}: A fast {TTS} architecture with conditional flow matching}, - author={Mehta, Shivam and Tu, Ruibo and Beskow, Jonas and Sz{\'e}kely, {\'E}va and Henter, Gustav Eje}, - booktitle={Proc. ICASSP}, - year={2024} -} -``` - -## Acknowledgements - -Since this code uses [Lightning-Hydra-Template](https://github.com/ashleve/lightning-hydra-template), you have all the powers that come with it. 
- -Other source code we would like to acknowledge: - -- [Coqui-TTS](https://github.com/coqui-ai/TTS/tree/dev): For helping me figure out how to make cython binaries pip installable and encouragement -- [Hugging Face Diffusers](https://huggingface.co/): For their awesome diffusers library and its components -- [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS): For the monotonic alignment search source code -- [torchdyn](https://github.com/DiffEqML/torchdyn): Useful for trying other ODE solvers during research and development -- [labml.ai](https://nn.labml.ai/transformers/rope/index.html): For the RoPE implementation diff --git a/speech/third_party/Matcha-TTS/configs/__init__.py b/speech/third_party/Matcha-TTS/configs/__init__.py deleted file mode 100644 index 56bf7f4aa4906bc0f997132708cc0826c198e4aa..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# this file is needed here to include configs when building project as a package diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/default.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/default.yaml deleted file mode 100644 index ebaa3ed31a7f626bc62f90184dc4b25b631e52a9..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/default.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - model_checkpoint.yaml - - model_summary.yaml - - rich_progress_bar.yaml - - _self_ diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml deleted file mode 100644 index 3d085c711a8521b6b98ad6401b686bb601ceacd6..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html - -model_checkpoint: - _target_: 
lightning.pytorch.callbacks.ModelCheckpoint - dirpath: ${paths.output_dir}/checkpoints # directory to save the model file - filename: checkpoint_{epoch:03d} # checkpoint filename - monitor: epoch # name of the logged metric which determines when model is improving - verbose: False # verbosity mode - save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt - save_top_k: 10 # save k best models (determined by above metric) - mode: "max" # "max" means higher metric value is better, can be also "min" - auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name - save_weights_only: False # if True, then only the modelโ€™s weights will be saved - every_n_train_steps: null # number of training steps between checkpoints - train_time_interval: null # checkpoints are monitored at the specified time interval - every_n_epochs: 100 # number of epochs between checkpoints - save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml deleted file mode 100644 index 6e5368d0e94298cce6d5421365b4583bd763ba92..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html - -model_summary: - _target_: lightning.pytorch.callbacks.RichModelSummary - max_depth: 3 # the maximum depth of layer nesting that the summary will include diff --git a/speech/third_party/Matcha-TTS/configs/callbacks/none.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/none.yaml deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/speech/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml b/speech/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml deleted file mode 100644 index de6f1ccb11205a4db93645fb6f297e50205de172..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html - -rich_progress_bar: - _target_: lightning.pytorch.callbacks.RichProgressBar diff --git a/speech/third_party/Matcha-TTS/configs/debug/default.yaml b/speech/third_party/Matcha-TTS/configs/debug/default.yaml deleted file mode 100644 index e3932c82585fbe44047c1569a5cfe9ee9895c71a..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/default.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# @package _global_ - -# default debugging setup, runs 1 full epoch -# other debugging configs can inherit from this one - -# overwrite task name so debugging logs are stored in separate folder -task_name: "debug" - -# disable callbacks and loggers during debugging -# callbacks: null -# logger: null - -extras: - ignore_warnings: False - enforce_tags: False - -# sets level of all command line loggers to 'DEBUG' -# https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ -hydra: - job_logging: - root: - level: DEBUG - - # use this to also set hydra loggers to 'DEBUG' - # verbose: True - -trainer: - max_epochs: 1 - accelerator: cpu # debuggers don't like gpus - devices: 1 # debuggers don't like multiprocessing - detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor - -data: - num_workers: 0 # debuggers don't like multiprocessing - pin_memory: False # disable gpu memory pin diff --git a/speech/third_party/Matcha-TTS/configs/debug/fdr.yaml b/speech/third_party/Matcha-TTS/configs/debug/fdr.yaml deleted file mode 100644 index 
7f2d34fa37c31017e749d5a4fc5ae6763e688b46..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/fdr.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# @package _global_ - -# runs 1 train, 1 validation and 1 test step - -defaults: - - default - -trainer: - fast_dev_run: true diff --git a/speech/third_party/Matcha-TTS/configs/debug/limit.yaml b/speech/third_party/Matcha-TTS/configs/debug/limit.yaml deleted file mode 100644 index 514d77fbd1475b03fff0372e3da3c2fa7ea7d190..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/limit.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# @package _global_ - -# uses only 1% of the training data and 5% of validation/test data - -defaults: - - default - -trainer: - max_epochs: 3 - limit_train_batches: 0.01 - limit_val_batches: 0.05 - limit_test_batches: 0.05 diff --git a/speech/third_party/Matcha-TTS/configs/debug/overfit.yaml b/speech/third_party/Matcha-TTS/configs/debug/overfit.yaml deleted file mode 100644 index 9906586a67a12aa81ff69138f589a366dbe2222f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/overfit.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# @package _global_ - -# overfits to 3 batches - -defaults: - - default - -trainer: - max_epochs: 20 - overfit_batches: 3 - -# model ckpt and early stopping need to be disabled during overfitting -callbacks: null diff --git a/speech/third_party/Matcha-TTS/configs/debug/profiler.yaml b/speech/third_party/Matcha-TTS/configs/debug/profiler.yaml deleted file mode 100644 index 266295f15e0166e1d1b58b88caa7673f4b6493b5..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/debug/profiler.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# @package _global_ - -# runs with execution time profiling - -defaults: - - default - -trainer: - max_epochs: 1 - # profiler: "simple" - profiler: "advanced" - # profiler: "pytorch" - accelerator: gpu - - limit_train_batches: 0.02 diff --git 
a/speech/third_party/Matcha-TTS/configs/eval.yaml b/speech/third_party/Matcha-TTS/configs/eval.yaml deleted file mode 100644 index be312992b2a486b04d83a54dbd8f670d94979709..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/eval.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# @package _global_ - -defaults: - - _self_ - - data: mnist # choose datamodule with `test_dataloader()` for evaluation - - model: mnist - - logger: null - - trainer: default - - paths: default - - extras: default - - hydra: default - -task_name: "eval" - -tags: ["dev"] - -# passing checkpoint path is necessary for evaluation -ckpt_path: ??? diff --git a/speech/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml b/speech/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml deleted file mode 100644 index 7e6c57a0d0a399f7463f4ff2d96e1928c435779b..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: hi-fi_en-US_female.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] - -run_name: hi-fi_en-US_female_piper_phonemizer diff --git a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml b/speech/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml deleted file mode 100644 index d5723f42cf3552226c42bd91202cc18818b685f0..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: 
ljspeech.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["ljspeech"] - -run_name: ljspeech diff --git a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml b/speech/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml deleted file mode 100644 index ef554dc633c392b1592d90d9d7734f2329264fdd..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: ljspeech.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["ljspeech"] - -run_name: ljspeech_min - - -model: - out_size: 172 diff --git a/speech/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml b/speech/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml deleted file mode 100644 index 553842f4e2168db0fee4e44db11b5d086295b044..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: vctk.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["multispeaker"] - -run_name: multispeaker diff --git a/speech/third_party/Matcha-TTS/configs/extras/default.yaml b/speech/third_party/Matcha-TTS/configs/extras/default.yaml deleted file mode 100644 index b9c6b622283a647fbc513166fc14f016cc3ed8a0..0000000000000000000000000000000000000000 --- 
a/speech/third_party/Matcha-TTS/configs/extras/default.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# disable python warnings if they annoy you -ignore_warnings: False - -# ask user for tags if none are provided in the config -enforce_tags: True - -# pretty print config tree at the start of the run using Rich library -print_config: True diff --git a/speech/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml b/speech/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml deleted file mode 100644 index 1391183ebcdec3d8f5eb61374e0719d13c7545da..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# @package _global_ - -# example hyperparameter optimization of some experiment with Optuna: -# python train.py -m hparams_search=mnist_optuna experiment=example - -defaults: - - override /hydra/sweeper: optuna - -# choose metric which will be optimized by Optuna -# make sure this is the correct name of some metric logged in lightning module! 
-optimized_metric: "val/acc_best" - -# here we define Optuna hyperparameter search -# it optimizes for value returned from function with @hydra.main decorator -# docs: https://hydra.cc/docs/next/plugins/optuna_sweeper -hydra: - mode: "MULTIRUN" # set hydra to multirun by default if this config is attached - - sweeper: - _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper - - # storage URL to persist optimization results - # for example, you can use SQLite if you set 'sqlite:///example.db' - storage: null - - # name of the study to persist optimization results - study_name: null - - # number of parallel workers - n_jobs: 1 - - # 'minimize' or 'maximize' the objective - direction: maximize - - # total number of runs that will be executed - n_trials: 20 - - # choose Optuna hyperparameter sampler - # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others - # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html - sampler: - _target_: optuna.samplers.TPESampler - seed: 1234 - n_startup_trials: 10 # number of random sampling runs before optimization starts - - # define hyperparameter search space - params: - model.optimizer.lr: interval(0.0001, 0.1) - data.batch_size: choice(32, 64, 128, 256) - model.net.lin1_size: choice(64, 128, 256) - model.net.lin2_size: choice(64, 128, 256) - model.net.lin3_size: choice(32, 64, 128, 256) diff --git a/speech/third_party/Matcha-TTS/configs/hydra/default.yaml b/speech/third_party/Matcha-TTS/configs/hydra/default.yaml deleted file mode 100644 index 1533136b22802a4f81e5387b74e407289edce94d..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/hydra/default.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# https://hydra.cc/docs/configure_hydra/intro/ - -# enable color logging -defaults: - - override hydra_logging: colorlog - - override job_logging: colorlog - -# output directory, generated dynamically on each run -run: - dir: 
${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} -sweep: - dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} - subdir: ${hydra.job.num} - -job_logging: - handlers: - file: - # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log diff --git a/speech/third_party/Matcha-TTS/configs/local/.gitkeep b/speech/third_party/Matcha-TTS/configs/local/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/speech/third_party/Matcha-TTS/configs/logger/aim.yaml b/speech/third_party/Matcha-TTS/configs/logger/aim.yaml deleted file mode 100644 index 8f9f6adad7feb2780c2efd5ddb0ed053621e05f8..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/aim.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# https://aimstack.io/ - -# example usage in lightning module: -# https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py - -# open the Aim UI with the following command (run in the folder containing the `.aim` folder): -# `aim up` - -aim: - _target_: aim.pytorch_lightning.AimLogger - repo: ${paths.root_dir} # .aim folder will be created here - # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# - - # aim allows to group runs under experiment name - experiment: null # any string, set to "default" if not specified - - train_metric_prefix: "train/" - val_metric_prefix: "val/" - test_metric_prefix: "test/" - - # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 
- system_tracking_interval: 10 # set to null to disable system metrics tracking - - # enable/disable logging of system params such as installed packages, git info, env vars, etc. - log_system_params: true - - # enable/disable tracking console logs (default value is true) - capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 diff --git a/speech/third_party/Matcha-TTS/configs/logger/comet.yaml b/speech/third_party/Matcha-TTS/configs/logger/comet.yaml deleted file mode 100644 index e0789274e2137ee6c97ca37a5d56c2b8abaf0aaa..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/comet.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# https://www.comet.ml - -comet: - _target_: lightning.pytorch.loggers.comet.CometLogger - api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable - save_dir: "${paths.output_dir}" - project_name: "lightning-hydra-template" - rest_api_key: null - # experiment_name: "" - experiment_key: null # set to resume experiment - offline: False - prefix: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/csv.yaml b/speech/third_party/Matcha-TTS/configs/logger/csv.yaml deleted file mode 100644 index fa028e9c146430c319101ffdfce466514338591c..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/csv.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# csv logger built in lightning - -csv: - _target_: lightning.pytorch.loggers.csv_logs.CSVLogger - save_dir: "${paths.output_dir}" - name: "csv/" - prefix: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/many_loggers.yaml b/speech/third_party/Matcha-TTS/configs/logger/many_loggers.yaml deleted file mode 100644 index dd586800bdccb4e8f4b0236a181b7ddd756ba9ab..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/many_loggers.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# train with many loggers at once - 
-defaults: - # - comet - - csv - # - mlflow - # - neptune - - tensorboard - - wandb diff --git a/speech/third_party/Matcha-TTS/configs/logger/mlflow.yaml b/speech/third_party/Matcha-TTS/configs/logger/mlflow.yaml deleted file mode 100644 index f8fb7e685fa27fc8141387a421b90a0b9b492d9e..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/mlflow.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# https://mlflow.org - -mlflow: - _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger - # experiment_name: "" - # run_name: "" - tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI - tags: null - # save_dir: "./mlruns" - prefix: "" - artifact_location: null - # run_id: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/neptune.yaml b/speech/third_party/Matcha-TTS/configs/logger/neptune.yaml deleted file mode 100644 index 8233c140018ecce6ab62971beed269991d31c89b..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/neptune.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# https://neptune.ai - -neptune: - _target_: lightning.pytorch.loggers.neptune.NeptuneLogger - api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable - project: username/lightning-hydra-template - # name: "" - log_model_checkpoints: True - prefix: "" diff --git a/speech/third_party/Matcha-TTS/configs/logger/tensorboard.yaml b/speech/third_party/Matcha-TTS/configs/logger/tensorboard.yaml deleted file mode 100644 index 2bd31f6d8ba68d1f5c36a804885d5b9f9c1a9302..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/tensorboard.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# https://www.tensorflow.org/tensorboard/ - -tensorboard: - _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger - save_dir: "${paths.output_dir}/tensorboard/" - name: null - log_graph: False - default_hp_metric: True - prefix: "" - # version: "" diff 
--git a/speech/third_party/Matcha-TTS/configs/logger/wandb.yaml b/speech/third_party/Matcha-TTS/configs/logger/wandb.yaml deleted file mode 100644 index ece165889b3d0d9dc750a8f3c7454188cfdf12b7..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/logger/wandb.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# https://wandb.ai - -wandb: - _target_: lightning.pytorch.loggers.wandb.WandbLogger - # name: "" # name of the run (normally generated by wandb) - save_dir: "${paths.output_dir}" - offline: False - id: null # pass correct id to resume experiment! - anonymous: null # enable anonymous logging - project: "lightning-hydra-template" - log_model: False # upload lightning ckpts - prefix: "" # a string to put at the beginning of metric keys - # entity: "" # set to name of your wandb team - group: "" - tags: [] - job_type: "" diff --git a/speech/third_party/Matcha-TTS/configs/model/cfm/default.yaml b/speech/third_party/Matcha-TTS/configs/model/cfm/default.yaml deleted file mode 100644 index 0d1d9609e2d05c7b0a12a26115520340ac18e584..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/cfm/default.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: CFM -solver: euler -sigma_min: 1e-4 diff --git a/speech/third_party/Matcha-TTS/configs/model/decoder/default.yaml b/speech/third_party/Matcha-TTS/configs/model/decoder/default.yaml deleted file mode 100644 index aaa00e63402ade5c76247a2f1d6b294ec3c61e63..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/decoder/default.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: [256, 256] -dropout: 0.05 -attention_head_dim: 64 -n_blocks: 1 -num_mid_blocks: 2 -num_heads: 2 -act_fn: snakebeta diff --git a/speech/third_party/Matcha-TTS/configs/model/encoder/default.yaml b/speech/third_party/Matcha-TTS/configs/model/encoder/default.yaml deleted file mode 100644 index d4d5e5adee8f707bd384b682a3ad9a116c40c6ed..0000000000000000000000000000000000000000 --- 
a/speech/third_party/Matcha-TTS/configs/model/encoder/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -encoder_type: RoPE Encoder -encoder_params: - n_feats: ${model.n_feats} - n_channels: 192 - filter_channels: 768 - filter_channels_dp: 256 - n_heads: 2 - n_layers: 6 - kernel_size: 3 - p_dropout: 0.1 - spk_emb_dim: 64 - n_spks: 1 - prenet: true - -duration_predictor_params: - filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} - kernel_size: 3 - p_dropout: ${model.encoder.encoder_params.p_dropout} diff --git a/speech/third_party/Matcha-TTS/configs/model/matcha.yaml b/speech/third_party/Matcha-TTS/configs/model/matcha.yaml deleted file mode 100644 index 36f6eafbdcaa324f7494a4b97a7590da7824f357..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/matcha.yaml +++ /dev/null @@ -1,15 +0,0 @@ -defaults: - - _self_ - - encoder: default.yaml - - decoder: default.yaml - - cfm: default.yaml - - optimizer: adam.yaml - -_target_: matcha.models.matcha_tts.MatchaTTS -n_vocab: 178 -n_spks: ${data.n_spks} -spk_emb_dim: 64 -n_feats: 80 -data_statistics: ${data.data_statistics} -out_size: null # Must be divisible by 4 -prior_loss: true diff --git a/speech/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml b/speech/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml deleted file mode 100644 index 42795577474eaee5b0b96845a95e1a11c9152385..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml +++ /dev/null @@ -1,4 +0,0 @@ -_target_: torch.optim.Adam -_partial_: true -lr: 1e-4 -weight_decay: 0.0 diff --git a/speech/third_party/Matcha-TTS/configs/paths/default.yaml b/speech/third_party/Matcha-TTS/configs/paths/default.yaml deleted file mode 100644 index ec81db2d34712909a79be3e42e65efe08c35ecee..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/paths/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# path to root directory -# this 
requires PROJECT_ROOT environment variable to exist -# you can replace it with "." if you want the root to be the current working directory -root_dir: ${oc.env:PROJECT_ROOT} - -# path to data directory -data_dir: ${paths.root_dir}/data/ - -# path to logging directory -log_dir: ${paths.root_dir}/logs/ - -# path to output directory, created dynamically by hydra -# path generation pattern is specified in `configs/hydra/default.yaml` -# use it to store all files generated during the run, like ckpts and metrics -output_dir: ${hydra:runtime.output_dir} - -# path to working directory -work_dir: ${hydra:runtime.cwd} diff --git a/speech/third_party/Matcha-TTS/configs/train.yaml b/speech/third_party/Matcha-TTS/configs/train.yaml deleted file mode 100644 index e6f5c2e7b9781758c8d25f941f004ca383c3f494..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/train.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# @package _global_ - -# specify here default configuration -# order of defaults determines the order in which configs override each other -defaults: - - _self_ - - data: ljspeech - - model: matcha - - callbacks: default - - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) - - trainer: default - - paths: default - - extras: default - - hydra: default - - # experiment configs allow for version control of specific hyperparameters - # e.g. best hyperparameters for given model and datamodule - - experiment: null - - # config for hyperparameter optimization - - hparams_search: null - - # optional local config for machine/user specific settings - # it's optional since it doesn't need to exist and is excluded from version control - - optional local: default - - # debugging config (enable through command line, e.g. `python train.py debug=default) - - debug: null - -# task name, determines output directory path -task_name: "train" - -run_name: ??? 
- -# tags to help you identify your experiments -# you can overwrite this in experiment configs -# overwrite from command line with `python train.py tags="[first_tag, second_tag]"` -tags: ["dev"] - -# set False to skip model training -train: True - -# evaluate on test set, using best model weights achieved during training -# lightning chooses best weights based on the metric specified in checkpoint callback -test: True - -# simply provide checkpoint path to resume training -ckpt_path: null - -# seed for random number generators in pytorch, numpy and python.random -seed: 1234 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/cpu.yaml b/speech/third_party/Matcha-TTS/configs/trainer/cpu.yaml deleted file mode 100644 index b7d6767e60c956567555980654f15e7bb673a41f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/cpu.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - default - -accelerator: cpu -devices: 1 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/ddp.yaml b/speech/third_party/Matcha-TTS/configs/trainer/ddp.yaml deleted file mode 100644 index 94b43e20ca7bf1f2ea92627fd46906e4f0a273a1..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/ddp.yaml +++ /dev/null @@ -1,9 +0,0 @@ -defaults: - - default - -strategy: ddp - -accelerator: gpu -devices: [0,1] -num_nodes: 1 -sync_batchnorm: True diff --git a/speech/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml b/speech/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml deleted file mode 100644 index 8404419e5c295654967d0dfb73a7366e75be2f1f..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml +++ /dev/null @@ -1,7 +0,0 @@ -defaults: - - default - -# simulate DDP on CPU, useful for debugging -accelerator: cpu -devices: 2 -strategy: ddp_spawn diff --git a/speech/third_party/Matcha-TTS/configs/trainer/default.yaml 
b/speech/third_party/Matcha-TTS/configs/trainer/default.yaml deleted file mode 100644 index ee3d370d8ca6b08d7ee7a86d34184c2104f0e1ef..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/default.yaml +++ /dev/null @@ -1,20 +0,0 @@ -_target_: lightning.pytorch.trainer.Trainer - -default_root_dir: ${paths.output_dir} - -max_epochs: -1 - -accelerator: gpu -devices: [0] - -# mixed precision for extra speed-up -precision: 16-mixed - -# perform a validation loop every N training epochs -check_val_every_n_epoch: 1 - -# set True to to ensure deterministic results -# makes training slower but gives more reproducibility than just setting seeds -deterministic: False - -gradient_clip_val: 5.0 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/gpu.yaml b/speech/third_party/Matcha-TTS/configs/trainer/gpu.yaml deleted file mode 100644 index b2389510a90f5f0161cff6ccfcb4a96097ddf9a1..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/gpu.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - default - -accelerator: gpu -devices: 1 diff --git a/speech/third_party/Matcha-TTS/configs/trainer/mps.yaml b/speech/third_party/Matcha-TTS/configs/trainer/mps.yaml deleted file mode 100644 index 1ecf6d5cc3a34ca127c5510f4a18e989561e38e4..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/configs/trainer/mps.yaml +++ /dev/null @@ -1,5 +0,0 @@ -defaults: - - default - -accelerator: mps -devices: 1 diff --git a/speech/third_party/Matcha-TTS/matcha/VERSION b/speech/third_party/Matcha-TTS/matcha/VERSION deleted file mode 100644 index 442b1138f7851df1c22deb15fd5d6ff5b742e550..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/matcha/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.0.5.1 diff --git a/speech/third_party/Matcha-TTS/notebooks/.gitkeep b/speech/third_party/Matcha-TTS/notebooks/.gitkeep deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/speech/third_party/Matcha-TTS/pyproject.toml b/speech/third_party/Matcha-TTS/pyproject.toml deleted file mode 100644 index 74aa39300a61b8b3607dc634d68aa47013141ec5..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/pyproject.toml +++ /dev/null @@ -1,51 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] - -[tool.black] -line-length = 120 -target-version = ['py310'] -exclude = ''' - -( - /( - \.eggs # exclude a few common directories in the - | \.git # root of the project - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | _build - | buck-out - | build - | dist - )/ - | foo.py # also separately exclude a file named foo.py in - # the root of the project -) -''' - -[tool.pytest.ini_options] -addopts = [ - "--color=yes", - "--durations=0", - "--strict-markers", - "--doctest-modules", -] -filterwarnings = [ - "ignore::DeprecationWarning", - "ignore::UserWarning", -] -log_cli = "True" -markers = [ - "slow: slow tests", -] -minversion = "6.0" -testpaths = "tests/" - -[tool.coverage.report] -exclude_lines = [ - "pragma: nocover", - "raise NotImplementedError", - "raise NotImplementedError()", - "if __name__ == .__main__.:", -] diff --git a/speech/third_party/Matcha-TTS/requirements.txt b/speech/third_party/Matcha-TTS/requirements.txt deleted file mode 100644 index 3e14a532cb14f99190404472915213940bfad4b9..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/requirements.txt +++ /dev/null @@ -1,45 +0,0 @@ -# --------- pytorch --------- # -torch>=2.0.0 -torchvision>=0.15.0 -lightning>=2.0.0 -torchmetrics>=0.11.4 - -# --------- hydra --------- # -hydra-core==1.3.2 -hydra-colorlog==1.2.0 -hydra-optuna-sweeper==1.2.0 - -# --------- loggers --------- # -# wandb -# neptune-client -# mlflow -# comet-ml -# aim>=3.16.2 # no lower than 3.16.2, see 
https://github.com/aimhubio/aim/issues/2550 - -# --------- others --------- # -rootutils # standardizing the project root setup -pre-commit # hooks for applying linters on commit -rich # beautiful text formatting in terminal -pytest # tests -# sh # for running bash commands in some tests (linux/macos only) -phonemizer # phonemization of text -tensorboard -librosa -Cython -numpy -einops -inflect -Unidecode -scipy -torchaudio -matplotlib -pandas -conformer==0.3.2 -diffusers==0.25.0 -notebook -ipywidgets -gradio==3.43.2 -gdown -wget -seaborn -piper_phonemize diff --git a/speech/third_party/Matcha-TTS/setup.py b/speech/third_party/Matcha-TTS/setup.py deleted file mode 100644 index 80d4aac04c6cd36859c5d753468ef2e105770098..0000000000000000000000000000000000000000 --- a/speech/third_party/Matcha-TTS/setup.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -import os - -import numpy -from Cython.Build import cythonize -from setuptools import Extension, find_packages, setup - -exts = [ - Extension( - name="matcha.utils.monotonic_align.core", - sources=["matcha/utils/monotonic_align/core.pyx"], - ) -] - -with open("README.md", encoding="utf-8") as readme_file: - README = readme_file.read() - -cwd = os.path.dirname(os.path.abspath(__file__)) -with open(os.path.join(cwd, "matcha", "VERSION")) as fin: - version = fin.read().strip() - -setup( - name="matcha-tts", - version=version, - description="๐Ÿต Matcha-TTS: A fast TTS architecture with conditional flow matching", - long_description=README, - long_description_content_type="text/markdown", - author="Shivam Mehta", - author_email="shivam.mehta25@gmail.com", - url="https://shivammehta25.github.io/Matcha-TTS", - install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], - include_dirs=[numpy.get_include()], - include_package_data=True, - packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), - # use this to customize global commands available in the 
terminal after installing the package - entry_points={ - "console_scripts": [ - "matcha-data-stats=matcha.utils.generate_data_statistics:main", - "matcha-tts=matcha.cli:cli", - "matcha-tts-app=matcha.app:main", - ] - }, - ext_modules=cythonize(exts, language_level=3), - python_requires=">=3.9.0", -) diff --git a/speech/train.py b/speech/train.py index 9e54f077893f83df9f4f5c7e594440578a7c224b..de22f0eed91bf840fdd03345ce065f53483f5466 100644 --- a/speech/train.py +++ b/speech/train.py @@ -16,9 +16,6 @@ from __future__ import print_function import argparse import datetime -import logging - -logging.getLogger("matplotlib").setLevel(logging.WARNING) import os from copy import deepcopy @@ -29,6 +26,7 @@ from hyperpyyaml import load_hyperpyyaml from loguru import logger from torch.distributed.elastic.multiprocessing.errors import record +from comet_ml import Experiment from cosyvoice.utils.executor import Executor from cosyvoice.utils.losses import DPOLoss from cosyvoice.utils.train_utils import (check_modify_and_save_config, @@ -109,20 +107,61 @@ def get_args(): return args +def init_comet_experiment(args, configs): + """Initialize Comet ML experiment""" + rank = int(os.environ.get('RANK', 0)) + + # Only create experiment on rank 0 to avoid duplicates + if rank == 0 and not args.comet_disabled: + # Set up Comet ML experiment + experiment = Experiment( + api_key=args.comet_api_key, + project_name=args.comet_project, + workspace=args.comet_workspace, + experiment_name=args.comet_experiment_name, + disabled=args.comet_disabled, + offline=args.comet_offline, + auto_metric_logging=True, + auto_param_logging=True, + auto_histogram_weight_logging=True, + auto_histogram_gradient_logging=True, + auto_histogram_activation_logging=False, + ) + + # Log hyperparameters + experiment.log_parameters(configs["train_conf"]) + experiment.log_parameter("model_type", args.model) + experiment.log_parameter("train_data", args.train_data) + experiment.log_parameter("cv_data", args.cv_data) + 
experiment.log_parameter("use_amp", args.use_amp) + experiment.log_parameter("dpo", args.dpo) + experiment.log_parameter("num_workers", args.num_workers) + experiment.log_parameter("prefetch", args.prefetch) + + # Log model architecture if available + if args.model in configs: + model_config = configs[args.model].__dict__ if hasattr(configs[args.model], '__dict__') else {} + experiment.log_parameters(model_config, prefix=f"{args.model}/") + + # Add tags + experiment.add_tag(args.model) + if args.dpo: + experiment.add_tag("dpo") + if args.use_amp: + experiment.add_tag("amp") + + logger.info(f"Comet ML experiment initialized: {experiment.get_name()}") + return experiment + else: + return None + @record def main(): args = get_args() - logging.basicConfig( - level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s" - ) - # gan train has some special initialization logic - gan = True if args.model == "hifigan" else False override_dict = { k: None for k in ["llm", "flow", "hift", "hifigan"] if k != args.model } - if gan is True: - override_dict.pop("hift") try: with open(args.config, "r", encoding="utf-8") as f: configs = load_hyperpyyaml( @@ -136,23 +175,27 @@ def main(): logger.error(f"Error loading config: {e}") with open(args.config, "r", encoding="utf-8") as f: configs = load_hyperpyyaml(f, overrides=override_dict) - if gan is True: - configs["train_conf"] = configs["train_conf_gan"] + configs["train_conf"].update(vars(args)) - # Init env for ddp - init_distributed(args) + world_size = int(os.environ.get('WORLD_SIZE', 1)) + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + rank = int(os.environ.get('RANK', 0)) + logger.info(f'training on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}') + torch.cuda.set_device(local_rank) + dist.init_process_group(args.dist_backend) # Get dataset & dataloader train_dataset, _, train_data_loader, cv_data_loader = init_dataset_and_dataloader( - args, configs, gan, args.dpo + args, configs, 
args.dpo ) # Do some sanity checks and save config to arsg.model_dir configs = check_modify_and_save_config(args, configs) # Tensorboard summary - writer = init_summarywriter(args) + experiment = init_comet_experiment(args, configs) + # load checkpoint if args.dpo is True: @@ -168,6 +211,11 @@ def main(): start_step = state_dict["step"] if "epoch" in state_dict: start_epoch = state_dict["epoch"] + # Log checkpoint info to Comet + if experiment: + experiment.log_parameter("checkpoint", args.checkpoint) + experiment.log_parameter("start_step", start_step) + experiment.log_parameter("start_epoch", start_epoch) else: logger.warning(f"checkpoint {args.checkpoint} do not exsist!") @@ -178,12 +226,10 @@ def main(): ) # Get optimizer & scheduler - model, optimizer, scheduler, optimizer_d, scheduler_d = ( - init_optimizer_and_scheduler(args, configs, model, gan) + model, optimizer, scheduler = ( + init_optimizer_and_scheduler(configs, model) ) scheduler.set_step(start_step) - if scheduler_d is not None: - scheduler_d.set_step(start_step) # Save init checkpoints info_dict = deepcopy(configs["train_conf"]) @@ -191,6 +237,14 @@ def main(): info_dict["epoch"] = start_epoch save_model(model, "init", info_dict) + # Log model save to Comet + if experiment: + experiment.log_model( + name=f"{args.model}_init", + file_or_folder=os.path.join(args.model_dir, "init.pt"), + metadata=info_dict + ) + # DPO related if args.dpo is True: ref_model = deepcopy(configs[args.model]) @@ -201,11 +255,16 @@ def main(): ref_model = torch.nn.parallel.DistributedDataParallel( ref_model, find_unused_parameters=True ) + if experiment: + experiment.log_parameter("ref_model", args.ref_model) + experiment.log_parameter("dpo_beta", 0.01) + experiment.log_parameter("dpo_label_smoothing", 0.0) + experiment.log_parameter("dpo_ipo", False) else: ref_model, dpo_loss = None, None # Get executor - executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss) + executor = Executor(gan=False, 
ref_model=ref_model, dpo_loss=dpo_loss) executor.step = start_step # Init scaler, used for pytorch amp mixed precision training @@ -220,34 +279,22 @@ def main(): group_join = dist.new_group( backend="nccl", timeout=datetime.timedelta(seconds=args.timeout) ) - if gan is True: - executor.train_one_epoc_gan( - model, - optimizer, - scheduler, - optimizer_d, - scheduler_d, - train_data_loader, - cv_data_loader, - writer, - info_dict, - scaler, - group_join, - ) - else: - executor.train_one_epoc( - model, - optimizer, - scheduler, - train_data_loader, - cv_data_loader, - writer, - info_dict, - scaler, - group_join, - ) + + executor.train_one_epoc( + model, + optimizer, + scheduler, + train_data_loader, + cv_data_loader, + experiment, + info_dict, + scaler, + group_join, + model_type=args.model + ) dist.destroy_process_group(group_join) - + if experiment: + experiment.end() if __name__ == "__main__": main()