wanchichen's picture
Add files using upload-large-folder tool
5895765 verified
# Running on cnode7-018
# Started at Tue Nov 12 14:22:15 CST 2024
# SLURMD_NODENAME=cnode7-018
# SLURM_CLUSTER_NAME=slurm
# SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf
# SLURM_CPUS_ON_NODE=224
# SLURM_CPUS_PER_TASK=8
# SLURM_EXPORT_ENV=PATH
# SLURM_GET_USER_ENV=1
# SLURM_GPUS_ON_NODE=8
# SLURM_GPUS_PER_NODE=8
# SLURM_GTIDS=0
# SLURM_JOBID=3520
# SLURM_JOB_CPUS_PER_NODE='224(x3)'
# SLURM_JOB_END_TIME=1762928535
# SLURM_JOB_GID=1026
# SLURM_JOB_GPUS=0,1,2,3,4,5,6,7
# SLURM_JOB_ID=3520
# SLURM_JOB_NAME=exp_owsm/s2t_train_4b_ds_raw_bpe50000/train.log
# SLURM_JOB_NODELIST='cnode7-[018-020]'
# SLURM_JOB_NUM_NODES=3
# SLURM_JOB_PARTITION=sa
# SLURM_JOB_QOS=normal
# SLURM_JOB_START_TIME=1731392535
# SLURM_JOB_UID=1026
# SLURM_JOB_USER=williamchen
# SLURM_LOCALID=0
# SLURM_MEM_PER_NODE=2048000
# SLURM_NNODES=3
# SLURM_NODEID=0
# SLURM_NODELIST='cnode7-[018-020]'
# SLURM_NODE_ALIASES='(null)'
# SLURM_OPEN_MODE=a
# SLURM_PRIO_PROCESS=0
# SLURM_PROCID=0
# SLURM_SUBMIT_DIR=/mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1
# SLURM_SUBMIT_HOST=154-T2-P1-NVR
# SLURM_TASKS_PER_NODE='28(x3)'
# SLURM_TASK_PID=3536759
# SLURM_TOPOLOGY_ADDR=cnode7-018
# SLURM_TOPOLOGY_ADDR_PATTERN=node
# SLURM_WORKING_CLUSTER=slurm:154-T2-P1-NVR:6817:9984:109
# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_4b_ds_raw_bpe50000 --config conf/train_4b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_prev_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_ctc_shape --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/text_shape --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_4b_ds_raw_bpe50000/.dist_init_09b79df8-b95e-491d-8d35-6a67b36ca9ff
[2024-11-12 14:22:22,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-12 14:22:22,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-12 14:22:23,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 2, in <module>
from espnet2.tasks.s2t import S2TTask
File "/mnt/home/williamchen/espnet/espnet2/tasks/s2t.py", line 17, in <module>
from espnet2.asr.decoder.transformer_decoder import (
File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 30, in <module>
from deepspeed.checkpointing import checkpoint
ModuleNotFoundError: No module named 'deepspeed.checkpointing'
Traceback (most recent call last):
File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 2, in <module>
from espnet2.tasks.s2t import S2TTask
File "/mnt/home/williamchen/espnet/espnet2/tasks/s2t.py", line 17, in <module>
from espnet2.asr.decoder.transformer_decoder import (
File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 30, in <module>
from deepspeed.checkpointing import checkpoint
ModuleNotFoundError: No module named 'deepspeed.checkpointing'
Traceback (most recent call last):
File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 2, in <module>
from espnet2.tasks.s2t import S2TTask
File "/mnt/home/williamchen/espnet/espnet2/tasks/s2t.py", line 17, in <module>
from espnet2.asr.decoder.transformer_decoder import (
File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 30, in <module>
from deepspeed.checkpointing import checkpoint
ModuleNotFoundError: No module named 'deepspeed.checkpointing'
srun: error: cnode7-019: task 1: Exited with exit code 1
srun: error: cnode7-018: task 0: Exited with exit code 1
srun: error: cnode7-020: task 2: Exited with exit code 1
# Accounting: begin_time=1731392535
# Accounting: end_time=1731392546
# Accounting: time=11 threads=1
# Finished at Tue Nov 12 14:22:26 CST 2024 with status 1