# Launch recipes for run_speech_recognition_ctc.py.
#
# One invocation is active; the others are kept commented out as alternatives.
# Every argument after the script name is forwarded as-is to
# run_speech_recognition_ctc.py -- refer to that script for flag semantics.
#
# Quoting note: the values given to --chars_to_ignore are single-quoted one by
# one. An unquoted `?` is a pathname-expansion pattern, so the shell would
# replace it with any single-character file name found in the working
# directory; quoting guarantees each character reaches the script literally.
#
# NOTE(review): the disabled recipes pass --freeze_feature_extractor while the
# active recipe passes --freeze_feature_encoder -- confirm which spelling the
# script accepts before re-enabling a disabled recipe.

# --- Recipe (disabled): facebook/wav2vec2-base on timit_asr -----------------
# CUDA_VISIBLE_DEVICES="0" python run_speech_recognition_ctc.py \
#   --dataset_name="timit_asr" \
#   --model_name_or_path="facebook/wav2vec2-base" \
#   --overwrite_output_dir \
#   --output_dir="./wav2vec2-base-timit-fine-tuned" \
#   --train_split_name="train" \
#   --num_train_epochs="20" \
#   --per_device_train_batch_size="32" \
#   --per_device_eval_batch_size="1" \
#   --weight_decay="0.005" \
#   --learning_rate="1e-4" \
#   --warmup_steps="1000" \
#   --evaluation_strategy="steps" \
#   --text_column_name="text" \
#   --save_steps="400" \
#   --eval_steps="100" \
#   --logging_steps="10" \
#   --layerdrop="0.0" \
#   --save_total_limit="3" \
#   --freeze_feature_extractor \
#   --chars_to_ignore , '?' . '!' - ';' ':' '"' '“' % '‘' '”' '�' \
#   --fp16 \
#   --group_by_length \
#   --push_to_hub \
#   --do_train \
#   --do_eval

# --- Recipe (active): facebook/wav2vec2-large-xlsr-53 on -------------------
# --- mozilla-foundation/common_voice_13_0, config "tr" ---------------------
# CUDA_VISIBLE_DEVICES=0 exposes only CUDA device 0 to the python process.
CUDA_VISIBLE_DEVICES=0 python run_speech_recognition_ctc.py \
  --dataset_name="mozilla-foundation/common_voice_13_0" \
  --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
  --dataset_config_name="tr" \
  --output_dir="./wav2vec2-common_voice-tr-demo" \
  --preprocessing_num_workers="1" \
  --overwrite_output_dir \
  --num_train_epochs="15" \
  --per_device_train_batch_size="16" \
  --gradient_accumulation_steps="2" \
  --learning_rate="3e-4" \
  --warmup_steps="500" \
  --evaluation_strategy="steps" \
  --text_column_name="sentence" \
  --save_steps="400" \
  --eval_steps="100" \
  --logging_steps="1" \
  --layerdrop="0.0" \
  --save_total_limit="3" \
  --freeze_feature_encoder \
  --gradient_checkpointing \
  --chars_to_ignore , '?' . '!' - ';' ':' '"' '“' % '‘' '”' '�' \
  --fp16 \
  --group_by_length \
  --push_to_hub \
  --do_train \
  --do_eval \
  --trust-remote-code \
  --apply-trp --trp-depths 1 --trp-p 0.1 --trp-lambdas 0.4 0.2 0.1

# --- Recipe (disabled): facebook/wav2vec2-large-xlsr-53 on ------------------
# --- multilingual_librispeech, config "german", split "train.9h" -----------
# CUDA_VISIBLE_DEVICES="0" python run_speech_recognition_ctc.py \
#   --dataset_name="multilingual_librispeech" \
#   --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
#   --train_split_name="train.9h" \
#   --dataset_config_name="german" \
#   --output_dir="./wav2vec2-xlsr-53-300m-mls-german-ft" \
#   --overwrite_output_dir \
#   --num_train_epochs="100" \
#   --per_device_train_batch_size="32" \
#   --gradient_accumulation_steps="1" \
#   --learning_rate="1e-4" \
#   --activation_dropout="0.1" \
#   --warmup_steps="1000" \
#   --evaluation_strategy="steps" \
#   --text_column_name="text" \
#   --save_steps="1000" \
#   --eval_steps="500" \
#   --logging_steps="10" \
#   --layerdrop="0.1" \
#   --hidden_dropout="0.0" \
#   --save_total_limit="1" \
#   --mask_time_prob="0.75" \
#   --mask_feature_prob="0.25" \
#   --mask_feature_length="64" \
#   --freeze_feature_extractor \
#   --fp16 \
#   --group_by_length \
#   --gradient_checkpointing \
#   --use_auth_token \
#   --do_train --do_eval \
#   --push_to_hub