NbAiLab
/

whisper

Model card Files Files and versions

xet

Community

pere committed on Nov 11, 2022

Commit

0be47bc

1 Parent(s): 1d7000c

div

Browse files

Files changed (2) hide show

run_speech_recognition_whisper.py +48 -112
run_speech_recognition_whisper_pere.py +523 -0

run_speech_recognition_whisper.py CHANGED Viewed

@@ -55,9 +55,9 @@ from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.24.0.dev0")
-require_version("datasets>=2.6.1", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 logger = logging.getLogger(__name__)
@@ -281,129 +281,53 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         return batch
-def create_vocabulary_from_data(
-        datasets: DatasetDict,
-        word_delimiter_token: Optional[str] = None,
-        unk_token: Optional[str] = None,
-        pad_token: Optional[str] = None,
-):
-    # Given training and test labels create vocabulary
-    alphabet = set()
-    def extract_all_chars(batch):
-        all_text = " ".join(batch["target_text"])
-        alphabet.update(all_text)
-    datasets.map(
-        extract_all_chars,
-        batched=True,
-        batch_size=-1,
-        keep_in_memory=True,
-        remove_columns=datasets["train"].column_names,
-    )
-    # # take union of all unique characters in each dataset
-    # vocab_set = functools.reduce(
-    #     lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
-    # )["vocab"][0]
-    vocab_dict = {v: k for k, v in enumerate(sorted(list(alphabet)))}
-    # replace white space with delimiter token
-    if word_delimiter_token is not None:
-        vocab_dict[word_delimiter_token] = vocab_dict[" "]
-        del vocab_dict[" "]
-    # add unk and pad token
-    if unk_token is not None:
-        vocab_dict[unk_token] = len(vocab_dict)
-    if pad_token is not None:
-        vocab_dict[pad_token] = len(vocab_dict)
-    return vocab_dict
 def make_dataset(training_args, data_args):
     seed = training_args.seed or 42
-    # Pre-processing dataset
-    # import re
-    # def map_nst(entry):
-    #     text = entry["text"].lower()
-    #     text = text.replace("(...Vær stille under dette opptaket...)", "")
-    #     text = re.sub('[áàâ]', 'a', text)
-    #     text = re.sub('[ä]', 'æ', text)
-    #     text = re.sub('[éèëê]', 'e', text)
-    #     text = re.sub('[íìïî]', 'i', text)
-    #     text = re.sub('[óòöô]', 'o', text)
-    #     text = re.sub('[ö]', 'ø', text)
-    #     text = re.sub('[ç]', 'c', text)
-    #     text = re.sub('[úùüû]', 'u', text)
-    #     # text = re.sub('\\(?=(Punktum|Komma|Utropstegn|Spørsmålstegn))', ' ', text)
-    #     text = re.sub('\s+', ' ', text)
-    #     return {"text": text}
-    # def filter_nst(entry):
-    #     if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
-    #         return False  # Too short
-    #     if re.match(entry["type"], "pIW|CA"):
-    #         return False  # Spelling out words
-    #     return True
-    # def filter_npsc(entry):
-    #     # False if there are digits in the text
-    #     if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
-    #         return False  # Too short
-    #     if re.search("\d", entry["text"]):
-    #         return False
-    #     return True
-    # def map_npsc(entry):
-    #     batch = {"text": entry["text"].lower()}
-    #     batch["text"] = re.sub('[áàâ]', 'a', batch["text"])
-    #     batch["text"] = re.sub('[ä]', 'æ', batch["text"])
-    #     batch["text"] = re.sub('[éèëê]', 'e', batch["text"])
-    #     batch["text"] = re.sub('[íìïî]', 'i', batch["text"])
-    #     batch["text"] = re.sub('[óòöô]', 'o', batch["text"])
-    #     batch["text"] = re.sub('[ö]', 'ø', batch["text"])
-    #     batch["text"] = re.sub('[ç]', 'c', batch["text"])
-    #     batch["text"] = re.sub('[úùüû]', 'u', batch["text"])
-    #     batch["text"] = re.sub('\s', ' ', batch["text"])
-    #     batch["text"] = re.sub('<ee>', 'eee', batch["text"])
-    #     batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
-    #     batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
-    #     batch["text"] = re.sub('<inaudible>', 'xxx', batch["text"])
-    #     # batch["text"] = re.sub('<inaudible>', '?', batch["text"])
-    #     if "<" in batch["text"]:
-    #         raise ValueError(batch["text"])
-    #     return batch
-    # nst = datasets.load_dataset("NbAiLab/NST", "no-close")
-    # npsc = datasets.load_dataset("NbAiLab/NPSC", "16K_mp3")
-    # # TODO NST_hesitate
-    # split = len(npsc["train"]) / (len(npsc["train"]) + len(npsc["validation"]))  # Use same train/val ratio as NPSC
-    # nst_train = nst["train"].train_test_split(train_size=split, seed=seed)
-    # nst["train"] = nst_train["train"]
-    # nst["validation"] = nst_train["test"]
-    # nst = nst.filter(filter_nst).map(map_nst).shuffle(seed=seed)
-    # npsc = npsc.filter(filter_npsc).map(map_npsc).shuffle(seed=seed)
-    # npsc_base = npsc.remove_columns([col for col in npsc["train"].column_names if col not in ["text", "audio"]])
-    # nst_base = nst.remove_columns([col for col in nst["train"].column_names if col not in ["text", "audio"]])
-    # combined = {}
-    # for split in "train", "validation", "test":
-    #     probs = np.array([len(nst_base[split]), len(npsc_base[split])])  # Weight by number of examples
-    #     probs = (probs / probs.sum()).tolist()
-    #     comb = datasets.interleave_datasets([nst_base[split], npsc_base[split]], probabilities=probs, seed=seed)
-    #     combined[split] = comb
-    # return datasets.DatasetDict(**combined)
     dataset = datasets.load_dataset(training_args.dataset_name, training_args.dataset_config_name, use_auth_token=data_args.use_auth_token)
     return dataset
@@ -414,6 +338,7 @@ def main():
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
@@ -423,6 +348,7 @@ def main():
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
     # Detecting last checkpoint.
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
@@ -490,6 +416,8 @@ def main():
     # chars_to_ignore_regex = (
     #    f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
     # )
     chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
     text_column_name = data_args.text_column_name
@@ -792,5 +720,13 @@ def main():
     return results
 if __name__ == "__main__":
     main()

 from transformers.utils.versions import require_version
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+# check_min_version("4.24.0.dev0")
+# require_version("datasets>=2.6.1", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 logger = logging.getLogger(__name__)
         return batch
+# PERE - COMMENTING OUT - IS THIS NEEDED? We can load vocab from Whisper instead...
+# def create_vocabulary_from_data(
+#         datasets: DatasetDict,
+#         word_delimiter_token: Optional[str] = None,
+#         unk_token: Optional[str] = None,
+#         pad_token: Optional[str] = None,
+# ):
+#     # Given training and test labels create vocabulary
+#     alphabet = set()
+#     def extract_all_chars(batch):
+#         all_text = " ".join(batch["target_text"])
+#         alphabet.update(all_text)
+#     datasets.map(
+#         extract_all_chars,
+#         batched=True,
+#         batch_size=-1,
+#         keep_in_memory=True,
+#         remove_columns=datasets["train"].column_names,
+#     )
+#     # # take union of all unique characters in each dataset
+#     # vocab_set = functools.reduce(
+#     #     lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
+#     # )["vocab"][0]
+#     vocab_dict = {v: k for k, v in enumerate(sorted(list(alphabet)))}
+#     # replace white space with delimiter token
+#     if word_delimiter_token is not None:
+#         vocab_dict[word_delimiter_token] = vocab_dict[" "]
+#         del vocab_dict[" "]
+#     # add unk and pad token
+#     if unk_token is not None:
+#         vocab_dict[unk_token] = len(vocab_dict)
+#     if pad_token is not None:
+#         vocab_dict[pad_token] = len(vocab_dict)
+#     return vocab_dict
 def make_dataset(training_args, data_args):
     seed = training_args.seed or 42
     dataset = datasets.load_dataset(training_args.dataset_name, training_args.dataset_config_name, use_auth_token=data_args.use_auth_token)
     return dataset
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
     # Detecting last checkpoint.
+    # PERE - Great but does it set other parameters, like the current learning rate?
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
     # chars_to_ignore_regex = (
     #    f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
     # )
+    ## PERE - JUST REMOVE THIS FOR WHISPER?
     chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
     text_column_name = data_args.text_column_name
     return results
+#XLA hook
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    print("The XLA is initiated")
+    main()
 if __name__ == "__main__":
     main()

run_speech_recognition_whisper_pere.py ADDED Viewed

	@@ -0,0 +1,523 @@

+#!/usr/bin/env python
+# coding=utf-8
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Fine-tuning a 🤗 Transformers Whisper model for automatic speech recognition"""
+import functools
+import json
+import logging
+import os
+import re
+import sys
+import warnings
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+import datasets
+import evaluate
+import numpy as np
+import torch
+import transformers
+from datasets import Audio, DatasetDict, load_dataset
+from transformers import (
+    AutoConfig,
+    AutoFeatureExtractor,
+    AutoModelForCTC,
+    AutoProcessor,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    Wav2Vec2Processor,
+    set_seed,
+    WhisperFeatureExtractor,
+    WhisperTokenizer,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    Seq2SeqTrainer,
+    Seq2SeqTrainingArguments,
+)
+from transformers.trainer_utils import get_last_checkpoint, is_main_process
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+# check_min_version("4.24.0.dev0")
+# require_version("datasets>=2.6.1", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
+logger = logging.getLogger(__name__)
+def list_field(default=None, metadata=None):
+    """Build a dataclass field whose default value is a list (or ``None``).
+
+    Dataclasses reject mutable defaults such as ``["wer"]``, so the default is
+    supplied through ``default_factory`` instead.
+
+    Args:
+        default: The default list, or ``None`` for "no value".
+        metadata: Optional field metadata (e.g. ``{"help": ...}``).
+
+    Returns:
+        A ``dataclasses.field`` whose factory yields a fresh copy of
+        ``default`` on every call.
+    """
+    def _fresh_default():
+        # Copy on every call so instances never share (and mutate) one list.
+        return None if default is None else list(default)
+    return field(default_factory=_fresh_default, metadata=metadata)
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+
+    ``model_name_or_path``, ``language`` and ``task`` have no default and must
+    be supplied. The dropout, masking, LayerDrop and CTC options are declared
+    here but are not read by ``main`` in this file.
+    """
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    language: str = field(
+        metadata={"help": "Whisper specific language"}
+    )
+    task: str = field(
+        metadata={"help": "Whisper specific task, i.e., 'transcribe' or 'translate'"}
+    )
+    tokenizer_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    freeze_feature_encoder: bool = field(
+        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
+    )
+    attention_dropout: float = field(
+        default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
+    )
+    activation_dropout: float = field(
+        default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
+    )
+    feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
+    hidden_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
+        },
+    )
+    final_dropout: float = field(
+        default=0.0,
+        metadata={"help": "The dropout probability for the final projection layer."},
+    )
+    # NOTE: adjacent string literals are concatenated verbatim, so each piece
+    # below ends with an explicit space.
+    mask_time_prob: float = field(
+        default=0.05,
+        metadata={
+            "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector "
+                    "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
+                    "vectors will be masked along the time axis."
+        },
+    )
+    mask_time_length: int = field(
+        default=10,
+        metadata={"help": "Length of vector span to mask along the time axis."},
+    )
+    mask_feature_prob: float = field(
+        default=0.0,
+        metadata={
+            "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
+                    "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
+        },
+    )
+    mask_feature_length: int = field(
+        default=10,
+        metadata={"help": "Length of vector span to mask along the feature axis."},
+    )
+    layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
+    ctc_loss_reduction: Optional[str] = field(
+        default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
+    )
+    ctc_zero_infinity: Optional[bool] = field(
+        default=False, metadata={"help": "If True, will try to avoid the CTC loss going to infinity."}
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+
+    ``dataset_name`` has no default and must be supplied; every other field is optional.
+    """
+    dataset_name: str = field(
+        metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    # Optional because the default is None (dataset without named configurations).
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
+        },
+    )
+    eval_split_name: str = field(
+        default="test",
+        metadata={
+            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+        },
+    )
+    audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+    )
+    text_column_name: str = field(
+        default="text",
+        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+                    "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+                    "value if set."
+        },
+    )
+    chars_to_ignore: Optional[List[str]] = list_field(
+        default=None,
+        metadata={"help": "A list of characters to remove from the transcripts."},
+    )
+    eval_metrics: List[str] = list_field(
+        default=["wer"],
+        metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
+    )
+    max_duration_in_seconds: float = field(
+        default=20.0,
+        metadata={
+            "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
+        },
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+    )
+    preprocessing_only: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to only do data preprocessing and skip training. "
+                    "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
+                    "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
+                    "so that the cached datasets can consequently be loaded in distributed training"
+        },
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "If :obj:`True`, will use the token generated when running"
+                    ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
+        },
+    )
+    unk_token: str = field(
+        default="[UNK]",
+        metadata={"help": "The unk token for the tokenizer"},
+    )
+    pad_token: str = field(
+        default="[PAD]",
+        metadata={"help": "The padding token for the tokenizer"},
+    )
+    word_delimiter_token: str = field(
+        default="|",
+        metadata={"help": "The word delimiter token for the tokenizer"},
+    )
+    phoneme_language: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The target language that should be"
+                    " passed to the tokenizer for tokenization. Note that"
+                    " this is only relevant if the model classifies the"
+                    " input audio to a sequence of phoneme sequences."
+        },
+    )
+@dataclass
+class DataCollatorSpeechSeq2SeqWithPadding:
+    """Collate preprocessed speech examples into one padded training batch.
+
+    Attributes:
+        processor: Object exposing ``feature_extractor`` and ``tokenizer``,
+            each with a ``pad`` method (presumably a ``WhisperProcessor``).
+    """
+    processor: Any
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """Pad audio features and label ids separately and merge them.
+
+        Args:
+            features: Examples holding ``"input_features"`` and ``"labels"``.
+
+        Returns:
+            The padded feature batch with an added ``"labels"`` tensor in
+            which padded positions are set to -100.
+        """
+        # Audio and text have different lengths and padding rules, so each
+        # side goes through its own ``pad`` call.
+        audio_inputs = [{"input_features": example["input_features"]} for example in features]
+        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
+        tokenizer = self.processor.tokenizer
+        token_inputs = [{"input_ids": example["labels"]} for example in features]
+        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")
+        # -100 marks padded positions so that the loss ignores them.
+        is_padding = padded_labels.attention_mask.ne(1)
+        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)
+        # A BOS token added during tokenization is dropped here because it
+        # is appended again later anyway.
+        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
+        if every_row_starts_with_bos:
+            labels = labels[:, 1:]
+        batch["labels"] = labels
+        return batch
+def main():
+    """Fine-tune and/or evaluate a Whisper model for speech recognition.
+
+    Command-line arguments are parsed into ``ModelArguments``,
+    ``DataTrainingArguments`` and ``Seq2SeqTrainingArguments``. The model,
+    dataset, splits and training hyper-parameters all come from those
+    arguments instead of being hard-coded, for example::
+
+        --model_name_or_path openai/whisper-small --language Norwegian --task transcribe
+        --dataset_name mozilla-foundation/common_voice_11_0 --dataset_config_name nn-NO
+        --use_auth_token --output_dir ../whisper-testrun1 --do_train --do_eval
+        --predict_with_generate --generation_max_length 225
+
+    ``compute_metrics`` decodes the predictions as token ids, so evaluation
+    should be run with ``--predict_with_generate``.
+
+    Returns:
+        The evaluation metrics, or an empty dict when ``--do_eval`` is not set.
+
+    Raises:
+        ValueError: If the output directory exists, is not empty, contains no
+            checkpoint and ``--overwrite_output_dir`` was not given.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # Metrics
+    def compute_metrics(pred):
+        """Return the word error rate (in percent) of the predictions."""
+        pred_ids = pred.predictions
+        label_ids = pred.label_ids
+        # replace -100 with the pad_token_id
+        label_ids[label_ids == -100] = tokenizer.pad_token_id
+        # we do not want to group tokens when computing the metrics
+        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
+        label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
+        wer = 100 * metric.compute(predictions=pred_str, references=label_str)
+        return {"wer": wer}
+    # Prepare dataset
+    def prepare_dataset(batch):
+        """Turn one example into log-Mel input features and label ids."""
+        # the "audio" column was cast to 16 kHz before mapping
+        audio = batch["audio"]
+        # compute log-Mel input features from input audio array
+        batch["input_features"] = feature_extractor(
+            audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
+        # encode target text to label ids
+        batch["labels"] = tokenizer(batch["sentence"]).input_ids
+        return batch
+    def make_dataset(data_args):
+        """Load the train and evaluation splits named in ``data_args``."""
+        split_names = (("train", data_args.train_split_name), ("eval", data_args.eval_split_name))
+        return DatasetDict(
+            {
+                key: load_dataset(
+                    data_args.dataset_name,
+                    data_args.dataset_config_name,
+                    split=split_name,
+                    use_auth_token=data_args.use_auth_token,
+                )
+                for key, split_name in split_names
+            }
+        )
+    # TODO(pere): switch to streaming later
+    # Load dataset
+    speech_data = make_dataset(data_args)
+    # Rename columns to the names prepare_dataset expects
+    if "audio" not in speech_data.column_names["train"]:
+        speech_data = speech_data.rename_column(data_args.audio_column_name, "audio")
+    if "sentence" not in speech_data.column_names["train"]:
+        speech_data = speech_data.rename_column(data_args.text_column_name, "sentence")
+    # Remove not needed columns - Not really sure if this is necessary
+    remove_list = [i for i in speech_data.column_names["train"]
+                   if i not in ["audio", "sentence"]]
+    speech_data = speech_data.remove_columns(remove_list)
+    # Truncate before preprocessing so that debugging runs stay cheap
+    if data_args.max_train_samples is not None:
+        num_train = min(data_args.max_train_samples, len(speech_data["train"]))
+        speech_data["train"] = speech_data["train"].select(range(num_train))
+    if data_args.max_eval_samples is not None:
+        num_eval = min(data_args.max_eval_samples, len(speech_data["eval"]))
+        speech_data["eval"] = speech_data["eval"].select(range(num_eval))
+    # Initialise
+    feature_extractor = WhisperFeatureExtractor.from_pretrained(
+        model_args.model_name_or_path)
+    tokenizer = WhisperTokenizer.from_pretrained(
+        model_args.model_name_or_path, language=model_args.language, task=model_args.task)
+    processor = WhisperProcessor.from_pretrained(
+        model_args.model_name_or_path, language=model_args.language, task=model_args.task)
+    data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
+    # Prepare data
+    speech_data = speech_data.cast_column("audio", Audio(sampling_rate=16000))
+    speech_data = speech_data.map(
+        prepare_dataset,
+        remove_columns=speech_data.column_names["train"],
+        num_proc=data_args.preprocessing_num_workers,
+    )
+    # Metrics
+    metric = evaluate.load("wer")
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Initialise a pretrained model. use_cache=False because the decoder
+    # cache is of no use during training.
+    model = WhisperForConditionalGeneration.from_pretrained(
+        model_args.model_name_or_path, use_cache=False)
+    # Overriding generation arguments - no tokens are forced as decoder outputs
+    # and no tokens are suppressed during generation.
+    model.config.forced_decoder_ids = None
+    model.config.suppress_tokens = []
+    # Initialize Trainer (must exist before the training/evaluation steps below)
+    trainer = Seq2SeqTrainer(
+        model=model,
+        data_collator=data_collator,
+        args=training_args,
+        compute_metrics=compute_metrics,
+        train_dataset=speech_data["train"] if training_args.do_train else None,
+        eval_dataset=speech_data["eval"] if training_args.do_eval else None,
+        tokenizer=processor.feature_extractor,
+    )
+    # Training
+    if training_args.do_train:
+        # use last checkpoint if exist
+        if last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        elif os.path.isdir(model_args.model_name_or_path):
+            checkpoint = model_args.model_name_or_path
+        else:
+            checkpoint = None
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples
+            if data_args.max_train_samples is not None
+            else len(speech_data["train"])
+        )
+        metrics["train_samples"] = min(max_train_samples, len(speech_data["train"]))
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        max_eval_samples = (
+            data_args.max_eval_samples if data_args.max_eval_samples is not None else len(speech_data["eval"])
+        )
+        metrics["eval_samples"] = min(max_eval_samples, len(speech_data["eval"]))
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+        results = metrics
+    # Write model card and (optionally) push to hub
+    config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "tasks": "automatic-speech-recognition",
+        "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
+        "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
+        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+        "language": model_args.language,
+    }
+    if "common_voice" in data_args.dataset_name:
+        kwargs["language"] = config_name
+    if training_args.push_to_hub:
+        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)
+    return results
+#XLA hook
+def _mp_fn(index):
+    """Entry point for xla_spawn (TPUs): announce start-up and run ``main``.
+
+    Args:
+        index: Passed in by the spawner; not used by this function.
+    """
+    # For xla_spawn (TPUs)
+    print("The XLA is initiated")
+    main()
+# Run the training script only when executed directly, not when imported.
+if __name__ == "__main__":
+    main()