Create train.sh
Browse files
train.sh
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/bash
#SBATCH --job-name ED
#SBATCH --account OPEN-28-58
#SBATCH --partition qgpu
#SBATCH --nodes=4
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node 8
#SBATCH --cpus-per-task=128
#SBATCH --time 2-00:00:00
#SBATCH --output=/mnt/proj1/open-28-58/lakoc/huggingface_asr/outputs/ebranchformer_english_small_normalized.out

# Multi-node SLURM launcher for an encoder-decoder ASR training run
# (eBranchformer encoder + GPT-2 decoder). One task per node; every task
# runs the single-node worker script with the shared argument list below.

# Fail fast on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

EXPERIMENT="ebranchformer_english_small_normalized"
PROJECT="regularizations_english_corpus"
WORK_DIR="/mnt/proj1/open-28-58/lakoc/huggingface_asr"
RECIPE_DIR="${WORK_DIR}/recipes/ebranchformer_english"
EXPERIMENT_PATH="${WORK_DIR}/experiments/${EXPERIMENT}"
HF_HOME="/scratch/project/open-28-57/lakoc/huggingface_cache"

# Hyper-parameters forwarded verbatim to the worker script.
args=(
  # General training arguments
  --output_dir="$EXPERIMENT_PATH"
  --per_device_train_batch_size="64"
  --per_device_eval_batch_size="8"
  --dataloader_num_workers="24"
  --num_train_epochs="400"
  --group_by_length="True"
  --bf16
  --do_train
  --do_evaluate
  --joint_decoding_during_training
  --load_best_model_at_end
  --metric_for_best_model="eval_wer"

  # Optimizer related arguments
  --optim="adamw_torch"
  --learning_rate="1e-3"
  --warmup_steps="40000"
  --early_stopping_patience="10"
  --weight_decay="1e-6"
  --max_grad_norm="0.5"
  --lsm_factor="0.1"
  --mask_unks
  --gradient_accumulation_steps="1"

  # Logging, saving and evaluation related arguments
  --report_to="wandb"
  --logging_steps="10"
  --save_strategy="epoch"
  --evaluation_strategy="epoch"
  --wandb_predictions_to_save=500
  --greater_is_better="False"
  --save_total_limit="5"
  --track_ctc_loss

  # Data related arguments
  --max_duration_in_seconds="20.0"
  --min_duration_in_seconds="0.2"
  --length_column_name="input_len"
  --remove_unused_columns="False"
  --preprocessing_num_workers="32"
  --dataset_name="/scratch/project/open-28-57/lakoc/processed_dataset_full"
  --writer_batch_size="500"
  --test_splits wsj_test fisher_swbd_dev voxpopuli_test tedlium3_test librispeech_test.clean librispeech_test.other commonvoice_en_test fleurs_test

  # Preprocessing related arguments
  --data_preprocessing_config="${RECIPE_DIR}/data_preprocessing.json"

  # Model related arguments
  --from_encoder_decoder_config
  --tokenizer_name="Lakoc/english_corpus_uni5000_normalized"
  --feature_extractor_name="Lakoc/log_80mel_extractor_16k"
  --base_encoder_model="Lakoc/fisher_ebranchformer_enc_12_layers_fixed"
  --base_decoder_model="Lakoc/gpt2_tiny_decoder_6_layers"
  --ctc_weight="0.3"
  --decoder_pos_emb_fixed
  --expect_2d_input

  # Generation related arguments
  --num_beams="1"
  --max_length="512"
  --predict_with_generate
  --decoding_ctc_weight="0"
)

# Rendezvous topology: PARENT is this (rank-0) node, CHILDREN are the
# remaining allocated nodes. Assignments are split from exports so a
# failing command substitution is not masked (SC2155).
PARENT=$(/bin/hostname -s)
export PARENT
export MPORT=13000
CHILDREN=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | grep -v "$PARENT")
export CHILDREN
export HOSTLIST="$PARENT $CHILDREN"
export WORLD_SIZE=$SLURM_NTASKS

# Switch to the training environment; deactivate may legitimately fail
# when no environment is active, so don't let it abort the job.
conda deactivate || true
source activate loco_asr

mkdir -p "$EXPERIMENT_PATH"

# One worker per node; each receives the experiment metadata positionally
# followed by the full hyper-parameter list.
srun --cpus-per-task "$SLURM_CPUS_ON_NODE" --gpus-per-task "$SLURM_GPUS_ON_NODE" \
  "${WORK_DIR}/recipes/multinode_training/start_single_node_job.sh" \
  "$EXPERIMENT" "$PROJECT" "$WORK_DIR" "$RECIPE_DIR" "$HF_HOME" "${args[@]}"