| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import fiddle as fdl |
| import torch |
| from lhotse.dataset.collation import collate_matrices, collate_vectors |
| from omegaconf import OmegaConf |
|
|
| from nemo import lightning as nl |
| from nemo.collections import speechlm |
| from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config |
| from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer |
| from nemo.collections.speechlm.models import HFAutoModelForSpeechSeq2Seq |
|
|
# "medium" lets matmuls use TF32/bfloat16-backed kernels on supporting GPUs,
# trading a little float32 precision for substantially faster training.
torch.set_float32_matmul_precision("medium")
|
|
|
|
class LhotseHfNeMoDataset(torch.utils.data.Dataset):
    """Map-style dataset turning a batch of Lhotse cuts into seq2seq training tensors.

    ``__getitem__`` receives a batch of cuts (Lhotse's sampler passes the whole
    mini-batch, not a single index), runs the HF processor on each cut's audio and
    supervision text, and collates everything into padded tensors.

    Args:
        processor: HF audio processor; called per cut with the raw audio, its
            sampling rate, and the supervision text to produce ``input_features``.
        tokenizer: tokenizer exposing ``pad_id``, used to replace masked positions
            in the decoder input.
        decoder_mask_fill (int): sentinel value marking ignored label positions
            (default ``-100``, the conventional cross-entropy ignore index).

    Returns (from ``__getitem__``):
        dict with ``input_features`` (padded feature matrices), ``labels``
        (flattened, shifted-left token ids), and ``decoder_input_ids``
        (shifted-right ids with the mask sentinel replaced by the pad id).
    """

    def __init__(self, processor, tokenizer, decoder_mask_fill: int = -100):
        super().__init__()
        self.processor = processor
        self.tokenizer = tokenizer
        self.decoder_mask_fill = decoder_mask_fill

    def __getitem__(self, cuts):
        # Run the HF processor once per cut; each call yields a dict holding
        # "input_features" of shape (1, ...) which we squeeze before collation.
        features = []
        for cut in cuts:
            audio = cut.load_audio()
            features.append(
                self.processor(
                    audio,
                    sampling_rate=cut.sampling_rate,
                    return_tensors="pt",
                    text=cut.supervisions[0].text,
                )
            )

        input_features = collate_matrices(tensors=[f["input_features"].squeeze(0) for f in features])
        labels = collate_vectors(tensors=[c.supervisions[0].tokens for c in cuts])
        # Teacher forcing: decoder sees tokens shifted right; sentinel positions
        # are not valid vocabulary ids, so swap them for the pad token.
        decoder_input_ids = labels[:, :-1]
        decoder_input_ids = decoder_input_ids.masked_fill(
            decoder_input_ids == self.decoder_mask_fill, self.tokenizer.pad_id
        )
        # Targets are shifted left and flattened for the loss computation.
        labels = labels[:, 1:].reshape(-1)

        return {
            "input_features": input_features,
            "labels": labels,
            "decoder_input_ids": decoder_input_ids,
        }
|
|
|
|
if __name__ == '__main__':
    import argparse

    # ---- CLI -------------------------------------------------------------
    cli = argparse.ArgumentParser()
    cli.add_argument('--model', default='openai/whisper-large-v3')
    cli.add_argument('--data-path', type=str, required=True)
    cli.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp'])
    cli.add_argument('--devices', default=1)
    cli.add_argument('--accelerator', default='gpu', choices=['gpu'])
    cli.add_argument('--max-steps', type=int, default=100)
    cli.add_argument('--model-save-path', type=str, default=None)
    args = cli.parse_args()

    # ---- Model & tokenizer ------------------------------------------------
    # Weights kept in float32 on the host; the Trainer's bf16-mixed precision
    # handles the compute dtype during training.
    model = HFAutoModelForSpeechSeq2Seq(model_name=args.model).to(torch.float)
    tokenizer = AutoTokenizer(args.model, include_special_tokens=True)

    # ---- Data -------------------------------------------------------------
    dataloader_cfg = OmegaConf.create(
        {
            "manifest_filepath": args.data_path,
            "sample_rate": 16000,
            "shuffle": True,
            "num_workers": 2,
            "batch_size": 4,
            "shuffle_buffer_size": 100,
        }
    )
    train_dataloader = get_lhotse_dataloader_from_config(
        dataloader_cfg,
        global_rank=0,
        world_size=1,
        dataset=LhotseHfNeMoDataset(
            processor=model.processor,
            tokenizer=tokenizer,
        ),
        tokenizer=tokenizer,
    )

    # ---- Fine-tune --------------------------------------------------------
    # Lhotse's sampler already handles sharding, hence use_distributed_sampler=False;
    # validation is fully disabled (limit_val_batches=0, no sanity steps).
    trainer = nl.Trainer(
        devices=args.devices,
        max_steps=args.max_steps,
        accelerator=args.accelerator,
        strategy=args.strategy,
        precision="bf16-mixed",
        log_every_n_steps=1,
        limit_val_batches=0.0,
        num_sanity_val_steps=0,
        accumulate_grad_batches=10,
        gradient_clip_val=0.5,
        use_distributed_sampler=False,
        callbacks=[],
        logger=None,
    )
    speechlm.api.finetune(
        model=model,
        data=train_dataloader,
        trainer=trainer,
        optim=fdl.build(speechlm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
        log=None,
    )

    # ---- Export (optional) -------------------------------------------------
    if args.model_save_path is not None:
        model.save_pretrained(args.model_save_path)
|
|