| # Running on cnode7-018 |
| # Started at Tue Nov 12 14:22:15 CST 2024 |
| # SLURMD_NODENAME=cnode7-018 |
| # SLURM_CLUSTER_NAME=slurm |
| # SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf |
| # SLURM_CPUS_ON_NODE=224 |
| # SLURM_CPUS_PER_TASK=8 |
| # SLURM_EXPORT_ENV=PATH |
| # SLURM_GET_USER_ENV=1 |
| # SLURM_GPUS_ON_NODE=8 |
| # SLURM_GPUS_PER_NODE=8 |
| # SLURM_GTIDS=0 |
| # SLURM_JOBID=3520 |
| # SLURM_JOB_CPUS_PER_NODE='224(x3)' |
| # SLURM_JOB_END_TIME=1762928535 |
| # SLURM_JOB_GID=1026 |
| # SLURM_JOB_GPUS=0,1,2,3,4,5,6,7 |
| # SLURM_JOB_ID=3520 |
| # SLURM_JOB_NAME=exp_owsm/s2t_train_4b_ds_raw_bpe50000/train.log |
| # SLURM_JOB_NODELIST='cnode7-[018-020]' |
| # SLURM_JOB_NUM_NODES=3 |
| # SLURM_JOB_PARTITION=sa |
| # SLURM_JOB_QOS=normal |
| # SLURM_JOB_START_TIME=1731392535 |
| # SLURM_JOB_UID=1026 |
| # SLURM_JOB_USER=williamchen |
| # SLURM_LOCALID=0 |
| # SLURM_MEM_PER_NODE=2048000 |
| # SLURM_NNODES=3 |
| # SLURM_NODEID=0 |
| # SLURM_NODELIST='cnode7-[018-020]' |
| # SLURM_NODE_ALIASES='(null)' |
| # SLURM_OPEN_MODE=a |
| # SLURM_PRIO_PROCESS=0 |
| # SLURM_PROCID=0 |
| # SLURM_SUBMIT_DIR=/mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1 |
| # SLURM_SUBMIT_HOST=154-T2-P1-NVR |
| # SLURM_TASKS_PER_NODE='28(x3)' |
| # SLURM_TASK_PID=3536759 |
| # SLURM_TOPOLOGY_ADDR=cnode7-018 |
| # SLURM_TOPOLOGY_ADDR_PATTERN=node |
| # SLURM_WORKING_CLUSTER=slurm:154-T2-P1-NVR:6817:9984:109 |
| # srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_4b_ds_raw_bpe50000 --config conf/train_4b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_prev_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_ctc_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_shape --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_4b_ds_raw_bpe50000/.dist_init_09b79df8-b95e-491d-8d35-6a67b36ca9ff |
| [2024-11-12 14:22:22,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
| [2024-11-12 14:22:22,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
| [2024-11-12 14:22:23,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
| Traceback (most recent call last): |
| File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main |
| return _run_code(code, main_globals, None, |
| File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code |
| exec(code, run_globals) |
| File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 2, in <module> |
| from espnet2.tasks.s2t import S2TTask |
| File "/mnt/home/williamchen/espnet/espnet2/tasks/s2t.py", line 17, in <module> |
| from espnet2.asr.decoder.transformer_decoder import ( |
| File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 30, in <module> |
| from deepspeed.checkpointing import checkpoint |
| ModuleNotFoundError: No module named 'deepspeed.checkpointing' |
| Traceback (most recent call last): |
| File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main |
| return _run_code(code, main_globals, None, |
| File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code |
| exec(code, run_globals) |
| File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 2, in <module> |
| from espnet2.tasks.s2t import S2TTask |
| File "/mnt/home/williamchen/espnet/espnet2/tasks/s2t.py", line 17, in <module> |
| from espnet2.asr.decoder.transformer_decoder import ( |
| File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 30, in <module> |
| from deepspeed.checkpointing import checkpoint |
| ModuleNotFoundError: No module named 'deepspeed.checkpointing' |
| Traceback (most recent call last): |
| File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main |
| return _run_code(code, main_globals, None, |
| File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code |
| exec(code, run_globals) |
| File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 2, in <module> |
| from espnet2.tasks.s2t import S2TTask |
| File "/mnt/home/williamchen/espnet/espnet2/tasks/s2t.py", line 17, in <module> |
| from espnet2.asr.decoder.transformer_decoder import ( |
| File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 30, in <module> |
| from deepspeed.checkpointing import checkpoint |
| ModuleNotFoundError: No module named 'deepspeed.checkpointing' |
| srun: error: cnode7-019: task 1: Exited with exit code 1 |
| srun: error: cnode7-018: task 0: Exited with exit code 1 |
| srun: error: cnode7-020: task 2: Exited with exit code 1 |
| # Accounting: begin_time=1731392535 |
| # Accounting: end_time=1731392546 |
| # Accounting: time=11 threads=1 |
| # Finished at Tue Nov 12 14:22:26 CST 2024 with status 1 |
|
|