update
Browse files- run_test.nst +41 -0
- run_whisper_finetuning.py +21 -18
run_test.nst
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Whisper fine-tuning smoke test for the NST dataset.
#
# Runs a very short job (50 steps, evaluating on 10 samples every 10 steps)
# to check that the training pipeline works end to end.
#
# The batch sizes and learning rate below are tuned for a 48GB GPU.
# Reduce the batch sizes and the learning rate when training on a smaller GPU.

python run_whisper_finetuning.py \
    --model_name_or_path="openai/whisper-small" \
    --output_dir="../whisper-test-delete" \
    --overwrite_output_dir=True \
    --language="Norwegian" \
    --task="transcribe" \
    --dataset_name="NbAiLab/NST" \
    --dataset_config="no-close" \
    --do_train=True \
    --do_eval=True \
    --audio_column_name="audio" \
    --text_column_name="text" \
    --per_device_train_batch_size=48 \
    --per_device_eval_batch_size=48 \
    --learning_rate=4e-5 \
    --warmup_steps=5 \
    --max_steps=50 \
    --gradient_checkpointing=True \
    --gradient_accumulation_steps=1 \
    --group_by_length=False \
    --evaluation_strategy="steps" \
    --save_steps=10 \
    --eval_steps=10 \
    --max_eval_samples=10 \
    --logging_steps=10 \
    --fp16=True \
    --load_best_model_at_end=True \
    --metric_for_best_model="wer" \
    --greater_is_better=False \
    --report_to="tensorboard" \
    --predict_with_generate=True \
    --generation_max_length=225 \
    --print_training_arguments=True \
    --push_to_hub=True
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
run_whisper_finetuning.py
CHANGED
|
@@ -408,6 +408,9 @@ def main():
|
|
| 408 |
model_args.model_name_or_path, language=model_args.language, task=model_args.task)
|
| 409 |
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
|
| 410 |
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
# Prepare data
|
| 413 |
# TODO Casting the audio column to 16 kHz is not working on the NPSC in 48K. It seems to be working for Common Voice.
|
|
@@ -416,6 +419,7 @@ def main():
|
|
| 416 |
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 417 |
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 418 |
|
|
|
|
| 419 |
|
| 420 |
# TODO I would really like to remove the non needed columns here. At least this cleans up the output.
|
| 421 |
# I am unable to figure out how to do this in streaming mode. I cannot find a way to list the columns.
|
|
@@ -425,7 +429,7 @@ def main():
|
|
| 425 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
| 426 |
|
| 427 |
# Metrics
|
| 428 |
-
metric = evaluate.load("wer"
|
| 429 |
|
| 430 |
# Detecting last checkpoint.
|
| 431 |
last_checkpoint = None
|
|
@@ -476,11 +480,8 @@ def main():
|
|
| 476 |
# Num Epochs = 9223372036854775807
|
| 477 |
# Instantaneous batch size per device = 48
|
| 478 |
|
| 479 |
-
|
| 480 |
-
processor.save_pretrained(training_args.output_dir)
|
| 481 |
-
|
| 482 |
|
| 483 |
-
# TODO - I can not get the max_eval_steps to run directly. I am therefore including it here. Not very elegant, but it works.
|
| 484 |
trainer = Seq2SeqTrainer(
|
| 485 |
args=training_args,
|
| 486 |
model=model,
|
|
@@ -501,24 +502,26 @@ def main():
|
|
| 501 |
# TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
|
| 502 |
trainer.save_state()
|
| 503 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
if training_args.push_to_hub:
|
| 505 |
trainer.push_to_hub(**kwargs)
|
| 506 |
else:
|
| 507 |
trainer.create_model_card(**kwargs)
|
| 508 |
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
kwargs = {
|
| 513 |
-
"finetuned_from": model_args.model_name_or_path,
|
| 514 |
-
"tasks": "automatic-speech-recognition",
|
| 515 |
-
"tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
|
| 516 |
-
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
|
| 517 |
-
"dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
|
| 518 |
-
"language": model_args.language,
|
| 519 |
-
}
|
| 520 |
-
|
| 521 |
-
return results
|
| 522 |
|
| 523 |
# XLA hook
|
| 524 |
def _mp_fn(index):
|
|
|
|
| 408 |
model_args.model_name_or_path, language=model_args.language, task=model_args.task)
|
| 409 |
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
|
| 410 |
|
| 411 |
+
# Saving the processor and the tokenizer
|
| 412 |
+
processor.save_pretrained(training_args.output_dir)
|
| 413 |
+
tokenizer.save_pretrained(training_args.output_dir)
|
| 414 |
|
| 415 |
# Prepare data
|
| 416 |
# TODO Casting the audio column to 16 kHz is not working on the NPSC in 48K. It seems to be working for Common Voice.
|
|
|
|
| 419 |
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 420 |
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 421 |
|
| 422 |
+
|
| 423 |
|
| 424 |
# TODO I would really like to remove the non needed columns here. At least this cleans up the output.
|
| 425 |
# I am unable to figure out how to do this in streaming mode. I cannot find a way to list the columns.
|
|
|
|
| 429 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
| 430 |
|
| 431 |
# Metrics
|
| 432 |
+
metric = evaluate.load("wer")
|
| 433 |
|
| 434 |
# Detecting last checkpoint.
|
| 435 |
last_checkpoint = None
|
|
|
|
| 480 |
# Num Epochs = 9223372036854775807
|
| 481 |
# Instantaneous batch size per device = 48
|
| 482 |
|
| 483 |
+
|
|
|
|
|
|
|
| 484 |
|
|
|
|
| 485 |
trainer = Seq2SeqTrainer(
|
| 486 |
args=training_args,
|
| 487 |
model=model,
|
|
|
|
| 502 |
# TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
|
| 503 |
trainer.save_state()
|
| 504 |
|
| 505 |
+
# TODO - Look closer into the model card writing.
|
| 506 |
+
# Write model card and (optionally) push to hub
|
| 507 |
+
config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
|
| 508 |
+
kwargs = {
|
| 509 |
+
"finetuned_from": model_args.model_name_or_path,
|
| 510 |
+
"tasks": "automatic-speech-recognition",
|
| 511 |
+
"tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
|
| 512 |
+
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
|
| 513 |
+
"dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
|
| 514 |
+
"language": model_args.language,
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
if training_args.push_to_hub:
|
| 518 |
trainer.push_to_hub(**kwargs)
|
| 519 |
else:
|
| 520 |
trainer.create_model_card(**kwargs)
|
| 521 |
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
return train_result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
# XLA hook
|
| 527 |
def _mp_fn(index):
|