Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- wandb/run-20240804_140603-q9i5g6sv/files/config.yaml +335 -0
- wandb/run-20240804_140603-q9i5g6sv/files/output.log +130 -0
- wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt +271 -0
- wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json +215 -0
- wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json +1 -0
- wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log +186 -0
- wandb/run-20240804_140603-q9i5g6sv/logs/debug.log +30 -0
- wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb +0 -0
- wandb/run-20240804_142250-6p58tz1g/files/config.yaml +335 -0
- wandb/run-20240804_142250-6p58tz1g/files/output.log +135 -0
- wandb/run-20240804_142250-6p58tz1g/files/requirements.txt +271 -0
- wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json +215 -0
- wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json +1 -0
- wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log +186 -0
- wandb/run-20240804_142250-6p58tz1g/logs/debug.log +30 -0
- wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb +0 -0
- wandb/run-20240804_143607-h7fxlkpt/files/config.yaml +335 -0
- wandb/run-20240804_143607-h7fxlkpt/files/output.log +135 -0
- wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt +271 -0
- wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json +215 -0
- wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json +1 -0
- wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log +186 -0
- wandb/run-20240804_143607-h7fxlkpt/logs/debug.log +30 -0
- wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb +0 -0
- wandb/run-20240804_221132-o8ieoj9i/files/config.yaml +335 -0
- wandb/run-20240804_221132-o8ieoj9i/files/output.log +135 -0
- wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt +271 -0
- wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json +215 -0
- wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json +1 -0
- wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log +263 -0
- wandb/run-20240804_221132-o8ieoj9i/logs/debug.log +30 -0
- wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb +0 -0
- wandb/run-20240812_052853-n84i0o06/files/config.yaml +335 -0
- wandb/run-20240812_052853-n84i0o06/files/output.log +139 -0
- wandb/run-20240812_052853-n84i0o06/files/requirements.txt +271 -0
- wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json +215 -0
- wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json +1 -0
- wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log +384 -0
- wandb/run-20240812_052853-n84i0o06/logs/debug.log +30 -0
- wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb +0 -0
- wandb/run-20240812_063027-j1htzx7q/files/output.log +121 -0
- wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json +1 -0
- wandb/run-20240823_154448-v9m85jnt/files/config.yaml +321 -0
- wandb/run-20240823_154448-v9m85jnt/files/output.log +15 -0
- wandb/run-20240823_154448-v9m85jnt/files/requirements.txt +375 -0
- wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json +220 -0
- wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json +1 -0
- wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log +189 -0
- wandb/run-20240823_154448-v9m85jnt/logs/debug.log +28 -0
- wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb +0 -0
wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
sharding_strategy:
|
| 4 |
+
desc: null
|
| 5 |
+
value: FULL_SHARD
|
| 6 |
+
checkpoint_type:
|
| 7 |
+
desc: null
|
| 8 |
+
value: LOCAL_STATE_DICT
|
| 9 |
+
fsdp_activation_checkpointing:
|
| 10 |
+
desc: null
|
| 11 |
+
value: true
|
| 12 |
+
fsdp_cpu_offload:
|
| 13 |
+
desc: null
|
| 14 |
+
value: false
|
| 15 |
+
low_cpu_fsdp:
|
| 16 |
+
desc: null
|
| 17 |
+
value: false
|
| 18 |
+
no_meta_device:
|
| 19 |
+
desc: null
|
| 20 |
+
value: false
|
| 21 |
+
data_path:
|
| 22 |
+
desc: null
|
| 23 |
+
value: null
|
| 24 |
+
split:
|
| 25 |
+
desc: null
|
| 26 |
+
value: 969, 30, 1
|
| 27 |
+
train_data_path:
|
| 28 |
+
desc: null
|
| 29 |
+
value:
|
| 30 |
+
- '4013541'
|
| 31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 32 |
+
valid_data_path:
|
| 33 |
+
desc: null
|
| 34 |
+
value:
|
| 35 |
+
- '4013541'
|
| 36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 37 |
+
test_data_path:
|
| 38 |
+
desc: null
|
| 39 |
+
value:
|
| 40 |
+
- '4013541'
|
| 41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 42 |
+
data_cache_path:
|
| 43 |
+
desc: null
|
| 44 |
+
value: null
|
| 45 |
+
vocab_size:
|
| 46 |
+
desc: null
|
| 47 |
+
value: null
|
| 48 |
+
vocab_file:
|
| 49 |
+
desc: null
|
| 50 |
+
value: null
|
| 51 |
+
merge_file:
|
| 52 |
+
desc: null
|
| 53 |
+
value: null
|
| 54 |
+
seq_length:
|
| 55 |
+
desc: null
|
| 56 |
+
value: 512
|
| 57 |
+
num_workers:
|
| 58 |
+
desc: null
|
| 59 |
+
value: 2
|
| 60 |
+
tokenizer_type:
|
| 61 |
+
desc: null
|
| 62 |
+
value: Llama2Tokenizer
|
| 63 |
+
tokenizer_model:
|
| 64 |
+
desc: null
|
| 65 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
|
| 66 |
+
reset_position_ids:
|
| 67 |
+
desc: null
|
| 68 |
+
value: false
|
| 69 |
+
reset_attention_mask:
|
| 70 |
+
desc: null
|
| 71 |
+
value: false
|
| 72 |
+
eod_mask_loss:
|
| 73 |
+
desc: null
|
| 74 |
+
value: false
|
| 75 |
+
retro_return_doc_ids:
|
| 76 |
+
desc: null
|
| 77 |
+
value: false
|
| 78 |
+
short_seq_prob:
|
| 79 |
+
desc: null
|
| 80 |
+
value: 0.1
|
| 81 |
+
vocab_extra_ids:
|
| 82 |
+
desc: null
|
| 83 |
+
value: 0
|
| 84 |
+
seed:
|
| 85 |
+
desc: null
|
| 86 |
+
value: 1234
|
| 87 |
+
use_mpi:
|
| 88 |
+
desc: null
|
| 89 |
+
value: false
|
| 90 |
+
wandb_entity:
|
| 91 |
+
desc: null
|
| 92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
| 93 |
+
wandb_name:
|
| 94 |
+
desc: null
|
| 95 |
+
value: tiny-llama_train_2024-08-04-14:05:53
|
| 96 |
+
wandb_project:
|
| 97 |
+
desc: null
|
| 98 |
+
value: llm_tutorial
|
| 99 |
+
quantization:
|
| 100 |
+
desc: null
|
| 101 |
+
value: false
|
| 102 |
+
use_freeze_layers:
|
| 103 |
+
desc: null
|
| 104 |
+
value: false
|
| 105 |
+
freeze_layers:
|
| 106 |
+
desc: null
|
| 107 |
+
value: null
|
| 108 |
+
bf16:
|
| 109 |
+
desc: null
|
| 110 |
+
value: true
|
| 111 |
+
fp16:
|
| 112 |
+
desc: null
|
| 113 |
+
value: false
|
| 114 |
+
mixed_precision:
|
| 115 |
+
desc: null
|
| 116 |
+
value: true
|
| 117 |
+
param_dtype:
|
| 118 |
+
desc: null
|
| 119 |
+
value: null
|
| 120 |
+
load:
|
| 121 |
+
desc: null
|
| 122 |
+
value: /work/llm_recipes/models/tiny-llama
|
| 123 |
+
save:
|
| 124 |
+
desc: null
|
| 125 |
+
value: /work/llm_recipes/models/tiny-llama
|
| 126 |
+
base_model:
|
| 127 |
+
desc: null
|
| 128 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
| 129 |
+
use_better_transformer:
|
| 130 |
+
desc: null
|
| 131 |
+
value: false
|
| 132 |
+
grad_clip_norm:
|
| 133 |
+
desc: null
|
| 134 |
+
value: 1.0
|
| 135 |
+
eval_interval:
|
| 136 |
+
desc: null
|
| 137 |
+
value: 200
|
| 138 |
+
save_interval:
|
| 139 |
+
desc: null
|
| 140 |
+
value: 200
|
| 141 |
+
eval_iters:
|
| 142 |
+
desc: null
|
| 143 |
+
value: 10
|
| 144 |
+
optimizer:
|
| 145 |
+
desc: null
|
| 146 |
+
value: adam
|
| 147 |
+
lr:
|
| 148 |
+
desc: null
|
| 149 |
+
value: 2.0e-05
|
| 150 |
+
lr_decay_style:
|
| 151 |
+
desc: null
|
| 152 |
+
value: cosine
|
| 153 |
+
lr_decay_iters:
|
| 154 |
+
desc: null
|
| 155 |
+
value: 2000
|
| 156 |
+
lr_warmup_iters:
|
| 157 |
+
desc: null
|
| 158 |
+
value: 500
|
| 159 |
+
min_lr:
|
| 160 |
+
desc: null
|
| 161 |
+
value: 1.0e-06
|
| 162 |
+
train_iters:
|
| 163 |
+
desc: null
|
| 164 |
+
value: 2000
|
| 165 |
+
train_samples:
|
| 166 |
+
desc: null
|
| 167 |
+
value: null
|
| 168 |
+
global_batch_size:
|
| 169 |
+
desc: null
|
| 170 |
+
value: 320
|
| 171 |
+
micro_batch_size:
|
| 172 |
+
desc: null
|
| 173 |
+
value: 8
|
| 174 |
+
make_vocab_size_divisible_by:
|
| 175 |
+
desc: null
|
| 176 |
+
value: 128
|
| 177 |
+
sliding_window_size:
|
| 178 |
+
desc: null
|
| 179 |
+
value: 4096
|
| 180 |
+
skip_batch:
|
| 181 |
+
desc: null
|
| 182 |
+
value: null
|
| 183 |
+
no_save_optimizer_state:
|
| 184 |
+
desc: null
|
| 185 |
+
value: false
|
| 186 |
+
continual_pretraining:
|
| 187 |
+
desc: null
|
| 188 |
+
value: false
|
| 189 |
+
instruction_tuning:
|
| 190 |
+
desc: null
|
| 191 |
+
value: false
|
| 192 |
+
direct_preference_optimization:
|
| 193 |
+
desc: null
|
| 194 |
+
value: false
|
| 195 |
+
attention_dropout:
|
| 196 |
+
desc: null
|
| 197 |
+
value: 0.1
|
| 198 |
+
hidden_dropout:
|
| 199 |
+
desc: null
|
| 200 |
+
value: 0.1
|
| 201 |
+
weight_decay:
|
| 202 |
+
desc: null
|
| 203 |
+
value: 0.1
|
| 204 |
+
adam_beta1:
|
| 205 |
+
desc: null
|
| 206 |
+
value: 0.9
|
| 207 |
+
adam_beta2:
|
| 208 |
+
desc: null
|
| 209 |
+
value: 0.95
|
| 210 |
+
adam_eps:
|
| 211 |
+
desc: null
|
| 212 |
+
value: 1.0e-06
|
| 213 |
+
hf_transformer_model_dir:
|
| 214 |
+
desc: null
|
| 215 |
+
value: null
|
| 216 |
+
instruction_train_data_path:
|
| 217 |
+
desc: null
|
| 218 |
+
value: null
|
| 219 |
+
instruction_valid_data_path:
|
| 220 |
+
desc: null
|
| 221 |
+
value: null
|
| 222 |
+
epoch:
|
| 223 |
+
desc: null
|
| 224 |
+
value: null
|
| 225 |
+
instruction_dataset_size:
|
| 226 |
+
desc: null
|
| 227 |
+
value: null
|
| 228 |
+
save_sampler_state:
|
| 229 |
+
desc: null
|
| 230 |
+
value: false
|
| 231 |
+
label_smoothing:
|
| 232 |
+
desc: null
|
| 233 |
+
value: 0.0
|
| 234 |
+
save_n_checkpoints:
|
| 235 |
+
desc: null
|
| 236 |
+
value: 10
|
| 237 |
+
hf_repo_id:
|
| 238 |
+
desc: null
|
| 239 |
+
value: koichi12/tiny-llama
|
| 240 |
+
create_public_hf_repo:
|
| 241 |
+
desc: null
|
| 242 |
+
value: false
|
| 243 |
+
upload_all_checkpoints_to_hf:
|
| 244 |
+
desc: null
|
| 245 |
+
value: false
|
| 246 |
+
hf_upload_retry_limit:
|
| 247 |
+
desc: null
|
| 248 |
+
value: 2
|
| 249 |
+
exit_duration_in_mins:
|
| 250 |
+
desc: null
|
| 251 |
+
value: null
|
| 252 |
+
source_key:
|
| 253 |
+
desc: null
|
| 254 |
+
value: null
|
| 255 |
+
target_key:
|
| 256 |
+
desc: null
|
| 257 |
+
value: null
|
| 258 |
+
attn_implementation:
|
| 259 |
+
desc: null
|
| 260 |
+
value: flash_attention_2
|
| 261 |
+
efficient_instruction_tuning:
|
| 262 |
+
desc: null
|
| 263 |
+
value: false
|
| 264 |
+
remove_padding_masking:
|
| 265 |
+
desc: null
|
| 266 |
+
value: false
|
| 267 |
+
save_start_iter:
|
| 268 |
+
desc: null
|
| 269 |
+
value: null
|
| 270 |
+
rank:
|
| 271 |
+
desc: null
|
| 272 |
+
value: 0
|
| 273 |
+
world_size:
|
| 274 |
+
desc: null
|
| 275 |
+
value: 1
|
| 276 |
+
padded_vocab_size:
|
| 277 |
+
desc: null
|
| 278 |
+
value: 32000
|
| 279 |
+
gradient_accumulation_steps:
|
| 280 |
+
desc: null
|
| 281 |
+
value: 40
|
| 282 |
+
_wandb:
|
| 283 |
+
desc: null
|
| 284 |
+
value:
|
| 285 |
+
python_version: 3.10.12
|
| 286 |
+
cli_version: 0.16.3
|
| 287 |
+
framework: huggingface
|
| 288 |
+
huggingface_version: 4.43.3
|
| 289 |
+
is_jupyter_run: false
|
| 290 |
+
is_kaggle_kernel: false
|
| 291 |
+
start_time: 1722747963.684337
|
| 292 |
+
t:
|
| 293 |
+
1:
|
| 294 |
+
- 1
|
| 295 |
+
- 11
|
| 296 |
+
- 49
|
| 297 |
+
- 55
|
| 298 |
+
- 71
|
| 299 |
+
2:
|
| 300 |
+
- 1
|
| 301 |
+
- 11
|
| 302 |
+
- 49
|
| 303 |
+
- 55
|
| 304 |
+
- 71
|
| 305 |
+
3:
|
| 306 |
+
- 13
|
| 307 |
+
- 16
|
| 308 |
+
- 23
|
| 309 |
+
4: 3.10.12
|
| 310 |
+
5: 0.16.3
|
| 311 |
+
6: 4.43.3
|
| 312 |
+
8:
|
| 313 |
+
- 5
|
| 314 |
+
13: linux-x86_64
|
| 315 |
+
activation_function:
|
| 316 |
+
desc: null
|
| 317 |
+
value: silu
|
| 318 |
+
hidden_size:
|
| 319 |
+
desc: null
|
| 320 |
+
value: 2048
|
| 321 |
+
model_type:
|
| 322 |
+
desc: null
|
| 323 |
+
value: llama
|
| 324 |
+
max_position_embeddings:
|
| 325 |
+
desc: null
|
| 326 |
+
value: 2048
|
| 327 |
+
num_attention_heads:
|
| 328 |
+
desc: null
|
| 329 |
+
value: 32
|
| 330 |
+
num_hidden_layers:
|
| 331 |
+
desc: null
|
| 332 |
+
value: 22
|
| 333 |
+
model_architecture:
|
| 334 |
+
desc: null
|
| 335 |
+
value: LlamaForCausalLM
|
wandb/run-20240804_140603-q9i5g6sv/files/output.log
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/tiny-llama.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 8 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 9 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 10 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
|
| 11 |
+
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
| 12 |
+
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
|
| 13 |
+
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
|
| 14 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
| 15 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
| 16 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
| 17 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
| 18 |
+
--> applying fsdp activation checkpointing...
|
| 19 |
+
> datasets target sizes (minimum size):
|
| 20 |
+
train: 640000
|
| 21 |
+
validation: 35200
|
| 22 |
+
test: 3200
|
| 23 |
+
> building train, validation, and test datasets for GPT ...
|
| 24 |
+
> finished creating GPT datasets ...
|
| 25 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 26 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 27 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
|
| 28 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 29 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 30 |
+
model info: FullyShardedDataParallel(
|
| 31 |
+
(_fsdp_wrapped_module): LlamaForCausalLM(
|
| 32 |
+
(model): LlamaModel(
|
| 33 |
+
(embed_tokens): Embedding(32000, 2048)
|
| 34 |
+
(layers): ModuleList(
|
| 35 |
+
(0-21): 22 x FullyShardedDataParallel(
|
| 36 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
| 37 |
+
(_checkpoint_wrapped_module): LlamaDecoderLayer(
|
| 38 |
+
(self_attn): LlamaFlashAttention2(
|
| 39 |
+
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
| 40 |
+
(k_proj): Linear(in_features=2048, out_features=256, bias=False)
|
| 41 |
+
(v_proj): Linear(in_features=2048, out_features=256, bias=False)
|
| 42 |
+
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
| 43 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
| 44 |
+
)
|
| 45 |
+
(mlp): LlamaMLP(
|
| 46 |
+
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
| 47 |
+
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
| 48 |
+
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
|
| 49 |
+
(act_fn): SiLU()
|
| 50 |
+
)
|
| 51 |
+
(input_layernorm): LlamaRMSNorm()
|
| 52 |
+
(post_attention_layernorm): LlamaRMSNorm()
|
| 53 |
+
)
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
(norm): LlamaRMSNorm()
|
| 58 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
| 59 |
+
)
|
| 60 |
+
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
|
| 61 |
+
)
|
| 62 |
+
)
|
| 63 |
+
model config: LlamaConfig {
|
| 64 |
+
"_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
| 65 |
+
"architectures": [
|
| 66 |
+
"LlamaForCausalLM"
|
| 67 |
+
],
|
| 68 |
+
"attention_bias": false,
|
| 69 |
+
"attention_dropout": 0.0,
|
| 70 |
+
"bos_token_id": 1,
|
| 71 |
+
"eos_token_id": 2,
|
| 72 |
+
"hidden_act": "silu",
|
| 73 |
+
"hidden_size": 2048,
|
| 74 |
+
"initializer_range": 0.02,
|
| 75 |
+
"intermediate_size": 5632,
|
| 76 |
+
"label_smoothing": 0.0,
|
| 77 |
+
"max_position_embeddings": 2048,
|
| 78 |
+
"mlp_bias": false,
|
| 79 |
+
"model_type": "llama",
|
| 80 |
+
"num_attention_heads": 32,
|
| 81 |
+
"num_hidden_layers": 22,
|
| 82 |
+
"num_key_value_heads": 4,
|
| 83 |
+
"pretraining_tp": 1,
|
| 84 |
+
"rms_norm_eps": 1e-05,
|
| 85 |
+
"rope_scaling": null,
|
| 86 |
+
"rope_theta": 10000.0,
|
| 87 |
+
"tie_word_embeddings": false,
|
| 88 |
+
"torch_dtype": "float32",
|
| 89 |
+
"transformers_version": "4.43.3",
|
| 90 |
+
"use_cache": false,
|
| 91 |
+
"vocab_size": 32000
|
| 92 |
+
}
|
| 93 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
| 94 |
+
warnings.warn(
|
| 95 |
+
Let split = None
|
| 96 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 97 |
+
Unable to save the indexes because path_to_cache is None
|
| 98 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 99 |
+
Unable to save the indexes because path_to_cache is None
|
| 100 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 101 |
+
Unable to save the indexes because path_to_cache is None
|
| 102 |
+
Traceback (most recent call last):
|
| 103 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 104 |
+
main()
|
| 105 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
| 106 |
+
train(
|
| 107 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
|
| 108 |
+
loss: torch.Tensor = model(**batch).loss
|
| 109 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 110 |
+
return self._call_impl(*args, **kwargs)
|
| 111 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 112 |
+
return forward_call(*args, **kwargs)
|
| 113 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
| 114 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
| 115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 116 |
+
return self._call_impl(*args, **kwargs)
|
| 117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 118 |
+
return forward_call(*args, **kwargs)
|
| 119 |
+
File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
|
| 120 |
+
outputs = self.model(
|
| 121 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 122 |
+
return self._call_impl(*args, **kwargs)
|
| 123 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 124 |
+
return forward_call(*args, **kwargs)
|
| 125 |
+
File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
|
| 126 |
+
cache_position = torch.arange(
|
| 127 |
+
RuntimeError: CUDA error: device-side assert triggered
|
| 128 |
+
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
| 129 |
+
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
|
| 130 |
+
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
accelerate==0.33.0
|
| 3 |
+
aiohttp==3.9.1
|
| 4 |
+
aiosignal==1.3.1
|
| 5 |
+
annotated-types==0.6.0
|
| 6 |
+
apex==0.1
|
| 7 |
+
appdirs==1.4.4
|
| 8 |
+
argon2-cffi-bindings==21.2.0
|
| 9 |
+
argon2-cffi==23.1.0
|
| 10 |
+
asttokens==2.4.1
|
| 11 |
+
astunparse==1.6.3
|
| 12 |
+
async-timeout==4.0.3
|
| 13 |
+
attrs==23.2.0
|
| 14 |
+
audioread==3.0.1
|
| 15 |
+
beautifulsoup4==4.12.3
|
| 16 |
+
bleach==6.1.0
|
| 17 |
+
blis==0.7.11
|
| 18 |
+
cachetools==5.3.2
|
| 19 |
+
catalogue==2.0.10
|
| 20 |
+
certifi==2024.2.2
|
| 21 |
+
cffi==1.16.0
|
| 22 |
+
charset-normalizer==3.3.2
|
| 23 |
+
click==8.1.7
|
| 24 |
+
cloudpathlib==0.16.0
|
| 25 |
+
cloudpickle==3.0.0
|
| 26 |
+
cmake==3.28.1
|
| 27 |
+
colorama==0.4.6
|
| 28 |
+
comm==0.2.1
|
| 29 |
+
confection==0.1.4
|
| 30 |
+
contourpy==1.2.0
|
| 31 |
+
cubinlinker==0.3.0+2.g405ac64
|
| 32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
| 33 |
+
cudf==23.12.0
|
| 34 |
+
cugraph-dgl==23.12.0
|
| 35 |
+
cugraph-service-client==23.12.0
|
| 36 |
+
cugraph-service-server==23.12.0
|
| 37 |
+
cugraph==23.12.0
|
| 38 |
+
cuml==23.12.0
|
| 39 |
+
cupy-cuda12x==12.3.0
|
| 40 |
+
cycler==0.12.1
|
| 41 |
+
cymem==2.0.8
|
| 42 |
+
cython==3.0.8
|
| 43 |
+
dask-cuda==23.12.0
|
| 44 |
+
dask-cudf==23.12.0
|
| 45 |
+
dask==2023.11.0
|
| 46 |
+
debugpy==1.8.1
|
| 47 |
+
decorator==5.1.1
|
| 48 |
+
defusedxml==0.7.1
|
| 49 |
+
distributed==2023.11.0
|
| 50 |
+
dm-tree==0.1.8
|
| 51 |
+
docker-pycreds==0.4.0
|
| 52 |
+
einops==0.7.0
|
| 53 |
+
exceptiongroup==1.2.0
|
| 54 |
+
execnet==2.0.2
|
| 55 |
+
executing==2.0.1
|
| 56 |
+
expecttest==0.1.3
|
| 57 |
+
fastjsonschema==2.19.1
|
| 58 |
+
fastrlock==0.8.2
|
| 59 |
+
filelock==3.13.1
|
| 60 |
+
flash-attn==2.4.2
|
| 61 |
+
fonttools==4.48.1
|
| 62 |
+
frozenlist==1.4.1
|
| 63 |
+
fsspec==2023.12.2
|
| 64 |
+
gast==0.5.4
|
| 65 |
+
gitdb==4.0.11
|
| 66 |
+
gitpython==3.1.43
|
| 67 |
+
google-auth-oauthlib==0.4.6
|
| 68 |
+
google-auth==2.27.0
|
| 69 |
+
graphsurgeon==0.4.6
|
| 70 |
+
grpcio==1.60.1
|
| 71 |
+
huggingface-hub==0.24.5
|
| 72 |
+
hypothesis==5.35.1
|
| 73 |
+
idna==3.6
|
| 74 |
+
importlib-metadata==7.0.1
|
| 75 |
+
iniconfig==2.0.0
|
| 76 |
+
intel-openmp==2021.4.0
|
| 77 |
+
ipadic==1.0.0
|
| 78 |
+
ipykernel==6.29.2
|
| 79 |
+
ipython-genutils==0.2.0
|
| 80 |
+
ipython==8.21.0
|
| 81 |
+
jedi==0.19.1
|
| 82 |
+
jinja2==3.1.3
|
| 83 |
+
joblib==1.3.2
|
| 84 |
+
json5==0.9.14
|
| 85 |
+
jsonnet==0.19.1
|
| 86 |
+
jsonschema-specifications==2023.12.1
|
| 87 |
+
jsonschema==4.21.1
|
| 88 |
+
jupyter-client==8.6.0
|
| 89 |
+
jupyter-core==5.7.1
|
| 90 |
+
jupyter-tensorboard==0.2.0
|
| 91 |
+
jupyterlab-pygments==0.3.0
|
| 92 |
+
jupyterlab-server==1.2.0
|
| 93 |
+
jupyterlab==2.3.2
|
| 94 |
+
jupytext==1.16.1
|
| 95 |
+
kiwisolver==1.4.5
|
| 96 |
+
langcodes==3.3.0
|
| 97 |
+
lazy-loader==0.3
|
| 98 |
+
librosa==0.10.1
|
| 99 |
+
llvmlite==0.40.1
|
| 100 |
+
locket==1.0.0
|
| 101 |
+
logzero==1.7.0
|
| 102 |
+
lxml==5.2.2
|
| 103 |
+
markdown-it-py==3.0.0
|
| 104 |
+
markdown==3.5.2
|
| 105 |
+
markupsafe==2.1.4
|
| 106 |
+
matplotlib-inline==0.1.6
|
| 107 |
+
matplotlib==3.8.2
|
| 108 |
+
mdit-py-plugins==0.4.0
|
| 109 |
+
mdurl==0.1.2
|
| 110 |
+
mecab-python3==1.0.6
|
| 111 |
+
mistune==3.0.2
|
| 112 |
+
mkl-devel==2021.1.1
|
| 113 |
+
mkl-include==2021.1.1
|
| 114 |
+
mkl==2021.1.1
|
| 115 |
+
mock==5.1.0
|
| 116 |
+
more-itertools==9.1.0
|
| 117 |
+
mpmath==1.3.0
|
| 118 |
+
msgpack==1.0.7
|
| 119 |
+
multidict==6.0.4
|
| 120 |
+
murmurhash==1.0.10
|
| 121 |
+
nbclient==0.9.0
|
| 122 |
+
nbconvert==7.16.0
|
| 123 |
+
nbformat==5.9.2
|
| 124 |
+
nest-asyncio==1.6.0
|
| 125 |
+
networkx==2.6.3
|
| 126 |
+
ninja==1.11.1.1
|
| 127 |
+
nltk==3.8.1
|
| 128 |
+
notebook==6.4.10
|
| 129 |
+
numba==0.57.1+1.g1ff679645
|
| 130 |
+
numpy==1.24.4
|
| 131 |
+
nvfuser==0.1.4a0+d0bb811
|
| 132 |
+
nvidia-dali-cuda120==1.34.0
|
| 133 |
+
nvidia-pyindex==1.0.9
|
| 134 |
+
nvtx==0.2.5
|
| 135 |
+
oauthlib==3.2.2
|
| 136 |
+
onnx==1.15.0rc2
|
| 137 |
+
opencv==4.7.0
|
| 138 |
+
optree==0.10.0
|
| 139 |
+
packaging==23.2
|
| 140 |
+
pandas==1.5.3
|
| 141 |
+
pandocfilters==1.5.1
|
| 142 |
+
parso==0.8.3
|
| 143 |
+
partd==1.4.1
|
| 144 |
+
peft==0.11.1
|
| 145 |
+
pexpect==4.9.0
|
| 146 |
+
pillow==10.2.0
|
| 147 |
+
pip==24.0
|
| 148 |
+
platformdirs==4.2.0
|
| 149 |
+
pluggy==1.4.0
|
| 150 |
+
ply==3.11
|
| 151 |
+
polygraphy==0.49.4
|
| 152 |
+
pooch==1.8.0
|
| 153 |
+
portalocker==2.10.1
|
| 154 |
+
preshed==3.0.9
|
| 155 |
+
prettytable==3.9.0
|
| 156 |
+
prometheus-client==0.19.0
|
| 157 |
+
prompt-toolkit==3.0.43
|
| 158 |
+
protobuf==4.24.4
|
| 159 |
+
psutil==5.9.4
|
| 160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
| 161 |
+
ptyprocess==0.7.0
|
| 162 |
+
pure-eval==0.2.2
|
| 163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
| 164 |
+
pyasn1-modules==0.3.0
|
| 165 |
+
pyasn1==0.5.1
|
| 166 |
+
pybind11-global==2.11.1
|
| 167 |
+
pybind11==2.11.1
|
| 168 |
+
pycocotools==2.0+nv0.8.0
|
| 169 |
+
pycparser==2.21
|
| 170 |
+
pydantic-core==2.16.2
|
| 171 |
+
pydantic==2.6.1
|
| 172 |
+
pygments==2.17.2
|
| 173 |
+
pylibcugraph==23.12.0
|
| 174 |
+
pylibcugraphops==23.12.0
|
| 175 |
+
pylibraft==23.12.0
|
| 176 |
+
pynvml==11.4.1
|
| 177 |
+
pyparsing==3.1.1
|
| 178 |
+
pytest-flakefinder==1.1.0
|
| 179 |
+
pytest-rerunfailures==13.0
|
| 180 |
+
pytest-shard==0.1.2
|
| 181 |
+
pytest-xdist==3.5.0
|
| 182 |
+
pytest==8.0.0
|
| 183 |
+
python-dateutil==2.8.2
|
| 184 |
+
python-dotenv==1.0.0
|
| 185 |
+
python-hostlist==1.23.0
|
| 186 |
+
pytorch-quantization==2.1.2
|
| 187 |
+
pytz==2023.3.post1
|
| 188 |
+
pyyaml==6.0.1
|
| 189 |
+
pyzmq==25.1.2
|
| 190 |
+
raft-dask==23.12.0
|
| 191 |
+
rapids-dask-dependency==23.12.1
|
| 192 |
+
referencing==0.33.0
|
| 193 |
+
regex==2023.12.25
|
| 194 |
+
requests-oauthlib==1.3.1
|
| 195 |
+
requests==2.31.0
|
| 196 |
+
rich==13.7.0
|
| 197 |
+
rmm==23.12.0
|
| 198 |
+
rpds-py==0.17.1
|
| 199 |
+
rsa==4.9
|
| 200 |
+
sacrebleu==2.4.0
|
| 201 |
+
safetensors==0.4.3
|
| 202 |
+
scikit-learn==1.2.0
|
| 203 |
+
scipy==1.12.0
|
| 204 |
+
send2trash==1.8.2
|
| 205 |
+
sentencepiece==0.1.99
|
| 206 |
+
sentry-sdk==2.12.0
|
| 207 |
+
setproctitle==1.3.3
|
| 208 |
+
setuptools==68.2.2
|
| 209 |
+
six==1.16.0
|
| 210 |
+
smart-open==6.4.0
|
| 211 |
+
smmap==5.0.1
|
| 212 |
+
sortedcontainers==2.4.0
|
| 213 |
+
soundfile==0.12.1
|
| 214 |
+
soupsieve==2.5
|
| 215 |
+
soxr==0.3.7
|
| 216 |
+
spacy-legacy==3.0.12
|
| 217 |
+
spacy-loggers==1.0.5
|
| 218 |
+
spacy==3.7.2
|
| 219 |
+
sphinx-glpi-theme==0.6
|
| 220 |
+
srsly==2.4.8
|
| 221 |
+
stack-data==0.6.3
|
| 222 |
+
sympy==1.12
|
| 223 |
+
tabulate==0.9.0
|
| 224 |
+
tbb==2021.11.0
|
| 225 |
+
tblib==3.0.0
|
| 226 |
+
tensorboard-data-server==0.6.1
|
| 227 |
+
tensorboard-plugin-wit==1.8.1
|
| 228 |
+
tensorboard==2.9.0
|
| 229 |
+
tensorrt==8.6.3
|
| 230 |
+
terminado==0.18.0
|
| 231 |
+
termplotlib==0.3.9
|
| 232 |
+
thinc==8.2.3
|
| 233 |
+
threadpoolctl==3.2.0
|
| 234 |
+
thriftpy2==0.4.17
|
| 235 |
+
tinycss2==1.2.1
|
| 236 |
+
tokenizers==0.19.1
|
| 237 |
+
toml==0.10.2
|
| 238 |
+
tomli==2.0.1
|
| 239 |
+
toolz==0.12.1
|
| 240 |
+
torch-tensorrt==2.3.0a0
|
| 241 |
+
torch==2.3.0a0+ebedce2
|
| 242 |
+
torchdata==0.7.1a0
|
| 243 |
+
torchtext==0.17.0a0
|
| 244 |
+
torchvision==0.18.0a0
|
| 245 |
+
tornado==6.4
|
| 246 |
+
tqdm==4.66.1
|
| 247 |
+
traitlets==5.9.0
|
| 248 |
+
transformer-engine==1.3.0+5b90b7f
|
| 249 |
+
transformers==4.43.3
|
| 250 |
+
treelite-runtime==3.9.1
|
| 251 |
+
treelite==3.9.1
|
| 252 |
+
triton==2.2.0+e28a256
|
| 253 |
+
typer==0.9.0
|
| 254 |
+
types-dataclasses==0.6.6
|
| 255 |
+
typing-extensions==4.9.0
|
| 256 |
+
ucx-py==0.35.0
|
| 257 |
+
uff==0.6.9
|
| 258 |
+
ujson==5.8.0
|
| 259 |
+
urllib3==1.26.18
|
| 260 |
+
wandb==0.16.3
|
| 261 |
+
wasabi==1.1.2
|
| 262 |
+
wcwidth==0.2.13
|
| 263 |
+
weasel==0.3.4
|
| 264 |
+
webencodings==0.5.1
|
| 265 |
+
werkzeug==3.0.1
|
| 266 |
+
wheel==0.42.0
|
| 267 |
+
xdoctest==1.0.2
|
| 268 |
+
xgboost==1.7.6
|
| 269 |
+
yarl==1.9.4
|
| 270 |
+
zict==3.0.0
|
| 271 |
+
zipp==3.17.0
|
wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "3.10.12",
|
| 4 |
+
"heartbeatAt": "2024-08-04T05:06:04.333644",
|
| 5 |
+
"startedAt": "2024-08-04T05:06:03.671763",
|
| 6 |
+
"docker": null,
|
| 7 |
+
"cuda": null,
|
| 8 |
+
"args": [
|
| 9 |
+
"--seq-length",
|
| 10 |
+
"512",
|
| 11 |
+
"--sliding-window-size",
|
| 12 |
+
"4096",
|
| 13 |
+
"--micro-batch-size",
|
| 14 |
+
"8",
|
| 15 |
+
"--global-batch-size",
|
| 16 |
+
"320",
|
| 17 |
+
"--train-iters",
|
| 18 |
+
"2000",
|
| 19 |
+
"--tokenizer-type",
|
| 20 |
+
"Llama2Tokenizer",
|
| 21 |
+
"--tokenizer-model",
|
| 22 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
|
| 23 |
+
"--train-data-path",
|
| 24 |
+
"4013541",
|
| 25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 26 |
+
"--valid-data-path",
|
| 27 |
+
"4013541",
|
| 28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 29 |
+
"--test-data-path",
|
| 30 |
+
"4013541",
|
| 31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 32 |
+
"--lr",
|
| 33 |
+
"2e-5",
|
| 34 |
+
"--min-lr",
|
| 35 |
+
"1e-6",
|
| 36 |
+
"--lr-decay-style",
|
| 37 |
+
"cosine",
|
| 38 |
+
"--lr-warmup-iters",
|
| 39 |
+
"500",
|
| 40 |
+
"--lr-decay-iters",
|
| 41 |
+
"2000",
|
| 42 |
+
"--weight-decay",
|
| 43 |
+
"0.1",
|
| 44 |
+
"--grad-clip-norm",
|
| 45 |
+
"1.0",
|
| 46 |
+
"--optimizer",
|
| 47 |
+
"adam",
|
| 48 |
+
"--adam-beta1",
|
| 49 |
+
"0.9",
|
| 50 |
+
"--adam-beta2",
|
| 51 |
+
"0.95",
|
| 52 |
+
"--adam-eps",
|
| 53 |
+
"1e-6",
|
| 54 |
+
"--save-interval",
|
| 55 |
+
"200",
|
| 56 |
+
"--eval-interval",
|
| 57 |
+
"200",
|
| 58 |
+
"--eval-iters",
|
| 59 |
+
"10",
|
| 60 |
+
"--bf16",
|
| 61 |
+
"--mixed-precision",
|
| 62 |
+
"--base-model",
|
| 63 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
| 64 |
+
"--save",
|
| 65 |
+
"/work/llm_recipes/models/tiny-llama",
|
| 66 |
+
"--load",
|
| 67 |
+
"/work/llm_recipes/models/tiny-llama",
|
| 68 |
+
"--fsdp-activation-checkpointing",
|
| 69 |
+
"--sharding-strategy",
|
| 70 |
+
"FULL_SHARD",
|
| 71 |
+
"--checkpoint-type",
|
| 72 |
+
"LOCAL_STATE_DICT",
|
| 73 |
+
"--save-n-checkpoints",
|
| 74 |
+
"10",
|
| 75 |
+
"--hf-upload-retry-limit",
|
| 76 |
+
"2",
|
| 77 |
+
"--hf-repo-id",
|
| 78 |
+
"koichi12/tiny-llama",
|
| 79 |
+
"--wandb-entity",
|
| 80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
| 81 |
+
"--wandb-project",
|
| 82 |
+
"llm_tutorial",
|
| 83 |
+
"--wandb-name",
|
| 84 |
+
"tiny-llama_train_2024-08-04-14:05:53"
|
| 85 |
+
],
|
| 86 |
+
"state": "running",
|
| 87 |
+
"program": "/project/examples/finetuning.py",
|
| 88 |
+
"codePathLocal": "examples/finetuning.py",
|
| 89 |
+
"codePath": "examples/finetuning.py",
|
| 90 |
+
"git": {
|
| 91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
| 92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
| 93 |
+
},
|
| 94 |
+
"email": null,
|
| 95 |
+
"root": "/project",
|
| 96 |
+
"host": "gpu-koiwa-00",
|
| 97 |
+
"username": "koiwa",
|
| 98 |
+
"executable": "/usr/bin/python",
|
| 99 |
+
"cpu_count": 18,
|
| 100 |
+
"cpu_count_logical": 18,
|
| 101 |
+
"cpu_freq": {
|
| 102 |
+
"current": 2400.0389999999993,
|
| 103 |
+
"min": 0.0,
|
| 104 |
+
"max": 0.0
|
| 105 |
+
},
|
| 106 |
+
"cpu_freq_per_core": [
|
| 107 |
+
{
|
| 108 |
+
"current": 2400.039,
|
| 109 |
+
"min": 0.0,
|
| 110 |
+
"max": 0.0
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"current": 2400.039,
|
| 114 |
+
"min": 0.0,
|
| 115 |
+
"max": 0.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"current": 2400.039,
|
| 119 |
+
"min": 0.0,
|
| 120 |
+
"max": 0.0
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"current": 2400.039,
|
| 124 |
+
"min": 0.0,
|
| 125 |
+
"max": 0.0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"current": 2400.039,
|
| 129 |
+
"min": 0.0,
|
| 130 |
+
"max": 0.0
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"current": 2400.039,
|
| 134 |
+
"min": 0.0,
|
| 135 |
+
"max": 0.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"current": 2400.039,
|
| 139 |
+
"min": 0.0,
|
| 140 |
+
"max": 0.0
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"current": 2400.039,
|
| 144 |
+
"min": 0.0,
|
| 145 |
+
"max": 0.0
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"current": 2400.039,
|
| 149 |
+
"min": 0.0,
|
| 150 |
+
"max": 0.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"current": 2400.039,
|
| 154 |
+
"min": 0.0,
|
| 155 |
+
"max": 0.0
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"current": 2400.039,
|
| 159 |
+
"min": 0.0,
|
| 160 |
+
"max": 0.0
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"current": 2400.039,
|
| 164 |
+
"min": 0.0,
|
| 165 |
+
"max": 0.0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"current": 2400.039,
|
| 169 |
+
"min": 0.0,
|
| 170 |
+
"max": 0.0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"current": 2400.039,
|
| 174 |
+
"min": 0.0,
|
| 175 |
+
"max": 0.0
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"current": 2400.039,
|
| 179 |
+
"min": 0.0,
|
| 180 |
+
"max": 0.0
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"current": 2400.039,
|
| 184 |
+
"min": 0.0,
|
| 185 |
+
"max": 0.0
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"current": 2400.039,
|
| 189 |
+
"min": 0.0,
|
| 190 |
+
"max": 0.0
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"current": 2400.039,
|
| 194 |
+
"min": 0.0,
|
| 195 |
+
"max": 0.0
|
| 196 |
+
}
|
| 197 |
+
],
|
| 198 |
+
"disk": {
|
| 199 |
+
"/": {
|
| 200 |
+
"total": 0.0625,
|
| 201 |
+
"used": 1.1444091796875e-05
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
| 205 |
+
"gpu_count": 1,
|
| 206 |
+
"gpu_devices": [
|
| 207 |
+
{
|
| 208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
| 209 |
+
"memory_total": 42949672960
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"memory": {
|
| 213 |
+
"total": 56.48781967163086
|
| 214 |
+
}
|
| 215 |
+
}
|
wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb": {"runtime": 4}}
|
wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 14:06:03,686 INFO StreamThr :9469 [internal.py:wandb_internal():86] W&B internal server running at pid: 9469, started at: 2024-08-04 14:06:03.685029
|
| 2 |
+
2024-08-04 14:06:03,687 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: status
|
| 3 |
+
2024-08-04 14:06:03,689 INFO WriterThread:9469 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
|
| 4 |
+
2024-08-04 14:06:03,690 DEBUG SenderThread:9469 [sender.py:send():382] send: header
|
| 5 |
+
2024-08-04 14:06:03,703 DEBUG SenderThread:9469 [sender.py:send():382] send: run
|
| 6 |
+
2024-08-04 14:06:04,218 INFO SenderThread:9469 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_140603-q9i5g6sv/files
|
| 7 |
+
2024-08-04 14:06:04,218 INFO SenderThread:9469 [sender.py:_start_run_threads():1136] run started: q9i5g6sv with start time 1722747963.684337
|
| 8 |
+
2024-08-04 14:06:04,223 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: check_version
|
| 9 |
+
2024-08-04 14:06:04,223 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: check_version
|
| 10 |
+
2024-08-04 14:06:04,313 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: run_start
|
| 11 |
+
2024-08-04 14:06:04,320 DEBUG HandlerThread:9469 [system_info.py:__init__():27] System info init
|
| 12 |
+
2024-08-04 14:06:04,320 DEBUG HandlerThread:9469 [system_info.py:__init__():42] System info init done
|
| 13 |
+
2024-08-04 14:06:04,320 INFO HandlerThread:9469 [system_monitor.py:start():194] Starting system monitor
|
| 14 |
+
2024-08-04 14:06:04,320 INFO SystemMonitor:9469 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
| 15 |
+
2024-08-04 14:06:04,320 INFO HandlerThread:9469 [system_monitor.py:probe():214] Collecting system info
|
| 16 |
+
2024-08-04 14:06:04,321 INFO SystemMonitor:9469 [interfaces.py:start():190] Started cpu monitoring
|
| 17 |
+
2024-08-04 14:06:04,321 INFO SystemMonitor:9469 [interfaces.py:start():190] Started disk monitoring
|
| 18 |
+
2024-08-04 14:06:04,322 INFO SystemMonitor:9469 [interfaces.py:start():190] Started gpu monitoring
|
| 19 |
+
2024-08-04 14:06:04,322 INFO SystemMonitor:9469 [interfaces.py:start():190] Started memory monitoring
|
| 20 |
+
2024-08-04 14:06:04,323 INFO SystemMonitor:9469 [interfaces.py:start():190] Started network monitoring
|
| 21 |
+
2024-08-04 14:06:04,333 DEBUG HandlerThread:9469 [system_info.py:probe():151] Probing system
|
| 22 |
+
2024-08-04 14:06:04,335 DEBUG HandlerThread:9469 [system_info.py:_probe_git():136] Probing git
|
| 23 |
+
2024-08-04 14:06:04,347 DEBUG HandlerThread:9469 [system_info.py:_probe_git():144] Probing git done
|
| 24 |
+
2024-08-04 14:06:04,347 DEBUG HandlerThread:9469 [system_info.py:probe():199] Probing system done
|
| 25 |
+
2024-08-04 14:06:04,347 DEBUG HandlerThread:9469 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:06:04.333644', 'startedAt': '2024-08-04T05:06:03.671763', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:05:53'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
|
| 26 |
+
2024-08-04 14:06:04,347 INFO HandlerThread:9469 [system_monitor.py:probe():224] Finished collecting system info
|
| 27 |
+
2024-08-04 14:06:04,347 INFO HandlerThread:9469 [system_monitor.py:probe():227] Publishing system info
|
| 28 |
+
2024-08-04 14:06:04,349 INFO HandlerThread:9469 [system_monitor.py:probe():229] Finished publishing system info
|
| 29 |
+
2024-08-04 14:06:04,354 DEBUG SenderThread:9469 [sender.py:send():382] send: files
|
| 30 |
+
2024-08-04 14:06:04,354 INFO SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
| 31 |
+
2024-08-04 14:06:04,364 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: python_packages
|
| 32 |
+
2024-08-04 14:06:04,364 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: stop_status
|
| 33 |
+
2024-08-04 14:06:04,364 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: internal_messages
|
| 34 |
+
2024-08-04 14:06:04,364 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: python_packages
|
| 35 |
+
2024-08-04 14:06:04,366 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: stop_status
|
| 36 |
+
2024-08-04 14:06:04,605 DEBUG SenderThread:9469 [sender.py:send():382] send: telemetry
|
| 37 |
+
2024-08-04 14:06:04,996 INFO wandb-upload_0:9469 [upload_job.py:push():131] Uploaded file /tmp/tmpz1emajybwandb/prws540s-wandb-metadata.json
|
| 38 |
+
2024-08-04 14:06:05,220 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
|
| 39 |
+
2024-08-04 14:06:05,220 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json
|
| 40 |
+
2024-08-04 14:06:05,220 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
|
| 41 |
+
2024-08-04 14:06:07,221 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
|
| 42 |
+
2024-08-04 14:06:07,604 DEBUG SenderThread:9469 [sender.py:send():382] send: config
|
| 43 |
+
2024-08-04 14:06:07,605 DEBUG SenderThread:9469 [sender.py:send():382] send: config
|
| 44 |
+
2024-08-04 14:06:08,222 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
|
| 45 |
+
2024-08-04 14:06:08,620 DEBUG SenderThread:9469 [sender.py:send():382] send: exit
|
| 46 |
+
2024-08-04 14:06:08,620 INFO SenderThread:9469 [sender.py:send_exit():589] handling exit code: 1
|
| 47 |
+
2024-08-04 14:06:08,620 INFO SenderThread:9469 [sender.py:send_exit():591] handling runtime: 4
|
| 48 |
+
2024-08-04 14:06:08,621 INFO SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 49 |
+
2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:send_exit():597] send defer
|
| 50 |
+
2024-08-04 14:06:08,622 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 51 |
+
2024-08-04 14:06:08,622 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 0
|
| 52 |
+
2024-08-04 14:06:08,622 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 53 |
+
2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 0
|
| 54 |
+
2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 1
|
| 55 |
+
2024-08-04 14:06:08,622 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 56 |
+
2024-08-04 14:06:08,622 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 1
|
| 57 |
+
2024-08-04 14:06:08,622 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 58 |
+
2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 1
|
| 59 |
+
2024-08-04 14:06:08,623 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 2
|
| 60 |
+
2024-08-04 14:06:08,623 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 61 |
+
2024-08-04 14:06:08,623 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 2
|
| 62 |
+
2024-08-04 14:06:08,623 INFO HandlerThread:9469 [system_monitor.py:finish():203] Stopping system monitor
|
| 63 |
+
2024-08-04 14:06:08,623 DEBUG SystemMonitor:9469 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
| 64 |
+
2024-08-04 14:06:08,623 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined cpu monitor
|
| 65 |
+
2024-08-04 14:06:08,623 DEBUG SystemMonitor:9469 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
| 66 |
+
2024-08-04 14:06:08,623 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined disk monitor
|
| 67 |
+
2024-08-04 14:06:08,624 DEBUG SystemMonitor:9469 [system_monitor.py:_start():183] Publishing last batch of metrics
|
| 68 |
+
2024-08-04 14:06:08,656 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined gpu monitor
|
| 69 |
+
2024-08-04 14:06:08,656 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined memory monitor
|
| 70 |
+
2024-08-04 14:06:08,656 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined network monitor
|
| 71 |
+
2024-08-04 14:06:08,657 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 72 |
+
2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 2
|
| 73 |
+
2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 3
|
| 74 |
+
2024-08-04 14:06:08,657 DEBUG SenderThread:9469 [sender.py:send():382] send: stats
|
| 75 |
+
2024-08-04 14:06:08,657 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 76 |
+
2024-08-04 14:06:08,657 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 3
|
| 77 |
+
2024-08-04 14:06:08,657 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 78 |
+
2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 3
|
| 79 |
+
2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 4
|
| 80 |
+
2024-08-04 14:06:08,657 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 81 |
+
2024-08-04 14:06:08,657 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 4
|
| 82 |
+
2024-08-04 14:06:08,658 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 83 |
+
2024-08-04 14:06:08,658 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 4
|
| 84 |
+
2024-08-04 14:06:08,658 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 5
|
| 85 |
+
2024-08-04 14:06:08,658 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 86 |
+
2024-08-04 14:06:08,658 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 5
|
| 87 |
+
2024-08-04 14:06:08,658 DEBUG SenderThread:9469 [sender.py:send():382] send: summary
|
| 88 |
+
2024-08-04 14:06:08,659 INFO SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 89 |
+
2024-08-04 14:06:08,659 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 90 |
+
2024-08-04 14:06:08,659 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 5
|
| 91 |
+
2024-08-04 14:06:08,659 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 6
|
| 92 |
+
2024-08-04 14:06:08,659 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 93 |
+
2024-08-04 14:06:08,659 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 6
|
| 94 |
+
2024-08-04 14:06:08,659 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 95 |
+
2024-08-04 14:06:08,660 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 6
|
| 96 |
+
2024-08-04 14:06:08,662 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: status_report
|
| 97 |
+
2024-08-04 14:06:08,848 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 7
|
| 98 |
+
2024-08-04 14:06:08,849 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 99 |
+
2024-08-04 14:06:08,849 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 7
|
| 100 |
+
2024-08-04 14:06:08,849 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 101 |
+
2024-08-04 14:06:08,849 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 7
|
| 102 |
+
2024-08-04 14:06:09,223 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
|
| 103 |
+
2024-08-04 14:06:09,223 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
|
| 104 |
+
2024-08-04 14:06:09,223 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
|
| 105 |
+
2024-08-04 14:06:09,360 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 8
|
| 106 |
+
2024-08-04 14:06:09,361 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 107 |
+
2024-08-04 14:06:09,361 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 8
|
| 108 |
+
2024-08-04 14:06:09,361 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 109 |
+
2024-08-04 14:06:09,361 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 8
|
| 110 |
+
2024-08-04 14:06:09,361 INFO SenderThread:9469 [job_builder.py:build():296] Attempting to build job artifact
|
| 111 |
+
2024-08-04 14:06:09,362 INFO SenderThread:9469 [job_builder.py:_get_source_type():426] is repo sourced job
|
| 112 |
+
2024-08-04 14:06:09,376 INFO SenderThread:9469 [job_builder.py:build():402] adding wandb-job metadata file
|
| 113 |
+
2024-08-04 14:06:09,384 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 9
|
| 114 |
+
2024-08-04 14:06:09,384 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 115 |
+
2024-08-04 14:06:09,384 DEBUG SenderThread:9469 [sender.py:send():382] send: artifact
|
| 116 |
+
2024-08-04 14:06:09,384 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 9
|
| 117 |
+
2024-08-04 14:06:09,620 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
|
| 118 |
+
2024-08-04 14:06:10,224 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
|
| 119 |
+
2024-08-04 14:06:10,240 INFO SenderThread:9469 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
| 120 |
+
2024-08-04 14:06:10,240 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 121 |
+
2024-08-04 14:06:10,240 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 9
|
| 122 |
+
2024-08-04 14:06:10,240 INFO SenderThread:9469 [dir_watcher.py:finish():358] shutting down directory watcher
|
| 123 |
+
2024-08-04 14:06:11,225 INFO SenderThread:9469 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_140603-q9i5g6sv/files
|
| 124 |
+
2024-08-04 14:06:11,225 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt requirements.txt
|
| 125 |
+
2024-08-04 14:06:11,225 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml config.yaml
|
| 126 |
+
2024-08-04 14:06:11,227 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json wandb-metadata.json
|
| 127 |
+
2024-08-04 14:06:11,227 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json wandb-summary.json
|
| 128 |
+
2024-08-04 14:06:11,228 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log output.log
|
| 129 |
+
2024-08-04 14:06:11,230 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 10
|
| 130 |
+
2024-08-04 14:06:11,230 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
|
| 131 |
+
2024-08-04 14:06:11,230 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 132 |
+
2024-08-04 14:06:11,232 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 10
|
| 133 |
+
2024-08-04 14:06:11,232 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 134 |
+
2024-08-04 14:06:11,232 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 10
|
| 135 |
+
2024-08-04 14:06:11,232 INFO SenderThread:9469 [file_pusher.py:finish():172] shutting down file pusher
|
| 136 |
+
2024-08-04 14:06:11,620 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
|
| 137 |
+
2024-08-04 14:06:11,621 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
|
| 138 |
+
2024-08-04 14:06:11,713 INFO wandb-upload_0:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
|
| 139 |
+
2024-08-04 14:06:11,733 INFO wandb-upload_1:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
|
| 140 |
+
2024-08-04 14:06:11,829 INFO wandb-upload_2:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
|
| 141 |
+
2024-08-04 14:06:11,833 INFO wandb-upload_3:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
|
| 142 |
+
2024-08-04 14:06:12,033 INFO Thread-11 (_thread_body):9469 [sender.py:transition_state():617] send defer: 11
|
| 143 |
+
2024-08-04 14:06:12,034 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 144 |
+
2024-08-04 14:06:12,034 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 11
|
| 145 |
+
2024-08-04 14:06:12,034 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 146 |
+
2024-08-04 14:06:12,034 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 11
|
| 147 |
+
2024-08-04 14:06:12,034 INFO SenderThread:9469 [file_pusher.py:join():178] waiting for file pusher
|
| 148 |
+
2024-08-04 14:06:12,034 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 12
|
| 149 |
+
2024-08-04 14:06:12,034 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 150 |
+
2024-08-04 14:06:12,034 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 12
|
| 151 |
+
2024-08-04 14:06:12,035 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 152 |
+
2024-08-04 14:06:12,035 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 12
|
| 153 |
+
2024-08-04 14:06:12,035 INFO SenderThread:9469 [file_stream.py:finish():595] file stream finish called
|
| 154 |
+
2024-08-04 14:06:12,204 INFO SenderThread:9469 [file_stream.py:finish():599] file stream finish is done
|
| 155 |
+
2024-08-04 14:06:12,204 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 13
|
| 156 |
+
2024-08-04 14:06:12,205 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 157 |
+
2024-08-04 14:06:12,205 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 13
|
| 158 |
+
2024-08-04 14:06:12,205 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 159 |
+
2024-08-04 14:06:12,205 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 13
|
| 160 |
+
2024-08-04 14:06:12,205 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 14
|
| 161 |
+
2024-08-04 14:06:12,205 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
|
| 162 |
+
2024-08-04 14:06:12,205 DEBUG SenderThread:9469 [sender.py:send():382] send: final
|
| 163 |
+
2024-08-04 14:06:12,205 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 14
|
| 164 |
+
2024-08-04 14:06:12,205 DEBUG SenderThread:9469 [sender.py:send():382] send: footer
|
| 165 |
+
2024-08-04 14:06:12,206 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
|
| 166 |
+
2024-08-04 14:06:12,206 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 14
|
| 167 |
+
2024-08-04 14:06:12,206 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
|
| 168 |
+
2024-08-04 14:06:12,206 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
|
| 169 |
+
2024-08-04 14:06:12,206 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
|
| 170 |
+
2024-08-04 14:06:12,207 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
|
| 171 |
+
2024-08-04 14:06:12,207 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: server_info
|
| 172 |
+
2024-08-04 14:06:12,207 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: get_summary
|
| 173 |
+
2024-08-04 14:06:12,207 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: server_info
|
| 174 |
+
2024-08-04 14:06:12,208 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: sampled_history
|
| 175 |
+
2024-08-04 14:06:12,209 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: internal_messages
|
| 176 |
+
2024-08-04 14:06:12,209 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: job_info
|
| 177 |
+
2024-08-04 14:06:12,360 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: job_info
|
| 178 |
+
2024-08-04 14:06:12,360 INFO MainThread:9469 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
| 179 |
+
2024-08-04 14:06:12,360 INFO MainThread:9469 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
| 180 |
+
2024-08-04 14:06:12,360 INFO MainThread:9469 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
| 181 |
+
2024-08-04 14:06:12,360 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: shutdown
|
| 182 |
+
2024-08-04 14:06:12,361 INFO HandlerThread:9469 [handler.py:finish():869] shutting down handler
|
| 183 |
+
2024-08-04 14:06:13,210 INFO WriterThread:9469 [datastore.py:close():296] close: /project/wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
|
| 184 |
+
2024-08-04 14:06:13,360 INFO SenderThread:9469 [sender.py:finish():1572] shutting down sender
|
| 185 |
+
2024-08-04 14:06:13,360 INFO SenderThread:9469 [file_pusher.py:finish():172] shutting down file pusher
|
| 186 |
+
2024-08-04 14:06:13,360 INFO SenderThread:9469 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_140603-q9i5g6sv/logs/debug.log
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 14:06:03,677 INFO MainThread:9398 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
| 2 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Configure stats pid to 9398
|
| 3 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
| 4 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
| 5 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
|
| 6 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
| 7 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
| 8 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_140603-q9i5g6sv/logs/debug.log
|
| 9 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log
|
| 10 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():566] calling init triggers
|
| 11 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
| 12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:05:53', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
|
| 13 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():616] starting backend
|
| 14 |
+
2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():620] setting up manager
|
| 15 |
+
2024-08-04 14:06:03,683 INFO MainThread:9398 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 16 |
+
2024-08-04 14:06:03,684 INFO MainThread:9398 [wandb_init.py:init():628] backend started and connected
|
| 17 |
+
2024-08-04 14:06:03,689 INFO MainThread:9398 [wandb_init.py:init():720] updated telemetry
|
| 18 |
+
2024-08-04 14:06:03,699 INFO MainThread:9398 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
| 19 |
+
2024-08-04 14:06:04,223 INFO MainThread:9398 [wandb_run.py:_on_init():2262] communicating current version
|
| 20 |
+
2024-08-04 14:06:04,307 INFO MainThread:9398 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
| 21 |
+
|
| 22 |
+
2024-08-04 14:06:04,307 INFO MainThread:9398 [wandb_init.py:init():804] starting run threads in backend
|
| 23 |
+
2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_console_start():2241] atexit reg
|
| 24 |
+
2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
| 25 |
+
2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
| 26 |
+
2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_redirect():2186] Redirects installed.
|
| 27 |
+
2024-08-04 14:06:04,364 INFO MainThread:9398 [wandb_init.py:init():847] run started, returning control to user process
|
| 28 |
+
2024-08-04 14:06:07,603 INFO MainThread:9398 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
|
| 29 |
+
2024-08-04 14:06:07,604 INFO MainThread:9398 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
| 30 |
+
2024-08-04 14:06:13,361 WARNING MsgRouterThr:9398 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
ADDED
|
Binary file (20.7 kB). View file
|
|
|
wandb/run-20240804_142250-6p58tz1g/files/config.yaml
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
sharding_strategy:
|
| 4 |
+
desc: null
|
| 5 |
+
value: FULL_SHARD
|
| 6 |
+
checkpoint_type:
|
| 7 |
+
desc: null
|
| 8 |
+
value: LOCAL_STATE_DICT
|
| 9 |
+
fsdp_activation_checkpointing:
|
| 10 |
+
desc: null
|
| 11 |
+
value: true
|
| 12 |
+
fsdp_cpu_offload:
|
| 13 |
+
desc: null
|
| 14 |
+
value: false
|
| 15 |
+
low_cpu_fsdp:
|
| 16 |
+
desc: null
|
| 17 |
+
value: false
|
| 18 |
+
no_meta_device:
|
| 19 |
+
desc: null
|
| 20 |
+
value: false
|
| 21 |
+
data_path:
|
| 22 |
+
desc: null
|
| 23 |
+
value: null
|
| 24 |
+
split:
|
| 25 |
+
desc: null
|
| 26 |
+
value: 969, 30, 1
|
| 27 |
+
train_data_path:
|
| 28 |
+
desc: null
|
| 29 |
+
value:
|
| 30 |
+
- '4013541'
|
| 31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 32 |
+
valid_data_path:
|
| 33 |
+
desc: null
|
| 34 |
+
value:
|
| 35 |
+
- '4013541'
|
| 36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 37 |
+
test_data_path:
|
| 38 |
+
desc: null
|
| 39 |
+
value:
|
| 40 |
+
- '4013541'
|
| 41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 42 |
+
data_cache_path:
|
| 43 |
+
desc: null
|
| 44 |
+
value: null
|
| 45 |
+
vocab_size:
|
| 46 |
+
desc: null
|
| 47 |
+
value: null
|
| 48 |
+
vocab_file:
|
| 49 |
+
desc: null
|
| 50 |
+
value: null
|
| 51 |
+
merge_file:
|
| 52 |
+
desc: null
|
| 53 |
+
value: null
|
| 54 |
+
seq_length:
|
| 55 |
+
desc: null
|
| 56 |
+
value: 512
|
| 57 |
+
num_workers:
|
| 58 |
+
desc: null
|
| 59 |
+
value: 2
|
| 60 |
+
tokenizer_type:
|
| 61 |
+
desc: null
|
| 62 |
+
value: Llama2Tokenizer
|
| 63 |
+
tokenizer_model:
|
| 64 |
+
desc: null
|
| 65 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
|
| 66 |
+
reset_position_ids:
|
| 67 |
+
desc: null
|
| 68 |
+
value: false
|
| 69 |
+
reset_attention_mask:
|
| 70 |
+
desc: null
|
| 71 |
+
value: false
|
| 72 |
+
eod_mask_loss:
|
| 73 |
+
desc: null
|
| 74 |
+
value: false
|
| 75 |
+
retro_return_doc_ids:
|
| 76 |
+
desc: null
|
| 77 |
+
value: false
|
| 78 |
+
short_seq_prob:
|
| 79 |
+
desc: null
|
| 80 |
+
value: 0.1
|
| 81 |
+
vocab_extra_ids:
|
| 82 |
+
desc: null
|
| 83 |
+
value: 0
|
| 84 |
+
seed:
|
| 85 |
+
desc: null
|
| 86 |
+
value: 1234
|
| 87 |
+
use_mpi:
|
| 88 |
+
desc: null
|
| 89 |
+
value: false
|
| 90 |
+
wandb_entity:
|
| 91 |
+
desc: null
|
| 92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
| 93 |
+
wandb_name:
|
| 94 |
+
desc: null
|
| 95 |
+
value: tiny-llama_train_2024-08-04-14:22:39
|
| 96 |
+
wandb_project:
|
| 97 |
+
desc: null
|
| 98 |
+
value: llm_tutorial
|
| 99 |
+
quantization:
|
| 100 |
+
desc: null
|
| 101 |
+
value: false
|
| 102 |
+
use_freeze_layers:
|
| 103 |
+
desc: null
|
| 104 |
+
value: false
|
| 105 |
+
freeze_layers:
|
| 106 |
+
desc: null
|
| 107 |
+
value: null
|
| 108 |
+
bf16:
|
| 109 |
+
desc: null
|
| 110 |
+
value: true
|
| 111 |
+
fp16:
|
| 112 |
+
desc: null
|
| 113 |
+
value: false
|
| 114 |
+
mixed_precision:
|
| 115 |
+
desc: null
|
| 116 |
+
value: true
|
| 117 |
+
param_dtype:
|
| 118 |
+
desc: null
|
| 119 |
+
value: null
|
| 120 |
+
load:
|
| 121 |
+
desc: null
|
| 122 |
+
value: /work/llm_recipes/models/tiny-llama
|
| 123 |
+
save:
|
| 124 |
+
desc: null
|
| 125 |
+
value: /work/llm_recipes/models/tiny-llama
|
| 126 |
+
base_model:
|
| 127 |
+
desc: null
|
| 128 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
| 129 |
+
use_better_transformer:
|
| 130 |
+
desc: null
|
| 131 |
+
value: false
|
| 132 |
+
grad_clip_norm:
|
| 133 |
+
desc: null
|
| 134 |
+
value: 1.0
|
| 135 |
+
eval_interval:
|
| 136 |
+
desc: null
|
| 137 |
+
value: 200
|
| 138 |
+
save_interval:
|
| 139 |
+
desc: null
|
| 140 |
+
value: 200
|
| 141 |
+
eval_iters:
|
| 142 |
+
desc: null
|
| 143 |
+
value: 10
|
| 144 |
+
optimizer:
|
| 145 |
+
desc: null
|
| 146 |
+
value: adam
|
| 147 |
+
lr:
|
| 148 |
+
desc: null
|
| 149 |
+
value: 2.0e-05
|
| 150 |
+
lr_decay_style:
|
| 151 |
+
desc: null
|
| 152 |
+
value: cosine
|
| 153 |
+
lr_decay_iters:
|
| 154 |
+
desc: null
|
| 155 |
+
value: 2000
|
| 156 |
+
lr_warmup_iters:
|
| 157 |
+
desc: null
|
| 158 |
+
value: 500
|
| 159 |
+
min_lr:
|
| 160 |
+
desc: null
|
| 161 |
+
value: 1.0e-06
|
| 162 |
+
train_iters:
|
| 163 |
+
desc: null
|
| 164 |
+
value: 2000
|
| 165 |
+
train_samples:
|
| 166 |
+
desc: null
|
| 167 |
+
value: null
|
| 168 |
+
global_batch_size:
|
| 169 |
+
desc: null
|
| 170 |
+
value: 320
|
| 171 |
+
micro_batch_size:
|
| 172 |
+
desc: null
|
| 173 |
+
value: 8
|
| 174 |
+
make_vocab_size_divisible_by:
|
| 175 |
+
desc: null
|
| 176 |
+
value: 128
|
| 177 |
+
sliding_window_size:
|
| 178 |
+
desc: null
|
| 179 |
+
value: 4096
|
| 180 |
+
skip_batch:
|
| 181 |
+
desc: null
|
| 182 |
+
value: null
|
| 183 |
+
no_save_optimizer_state:
|
| 184 |
+
desc: null
|
| 185 |
+
value: false
|
| 186 |
+
continual_pretraining:
|
| 187 |
+
desc: null
|
| 188 |
+
value: false
|
| 189 |
+
instruction_tuning:
|
| 190 |
+
desc: null
|
| 191 |
+
value: false
|
| 192 |
+
direct_preference_optimization:
|
| 193 |
+
desc: null
|
| 194 |
+
value: false
|
| 195 |
+
attention_dropout:
|
| 196 |
+
desc: null
|
| 197 |
+
value: 0.1
|
| 198 |
+
hidden_dropout:
|
| 199 |
+
desc: null
|
| 200 |
+
value: 0.1
|
| 201 |
+
weight_decay:
|
| 202 |
+
desc: null
|
| 203 |
+
value: 0.1
|
| 204 |
+
adam_beta1:
|
| 205 |
+
desc: null
|
| 206 |
+
value: 0.9
|
| 207 |
+
adam_beta2:
|
| 208 |
+
desc: null
|
| 209 |
+
value: 0.95
|
| 210 |
+
adam_eps:
|
| 211 |
+
desc: null
|
| 212 |
+
value: 1.0e-06
|
| 213 |
+
hf_transformer_model_dir:
|
| 214 |
+
desc: null
|
| 215 |
+
value: null
|
| 216 |
+
instruction_train_data_path:
|
| 217 |
+
desc: null
|
| 218 |
+
value: null
|
| 219 |
+
instruction_valid_data_path:
|
| 220 |
+
desc: null
|
| 221 |
+
value: null
|
| 222 |
+
epoch:
|
| 223 |
+
desc: null
|
| 224 |
+
value: null
|
| 225 |
+
instruction_dataset_size:
|
| 226 |
+
desc: null
|
| 227 |
+
value: null
|
| 228 |
+
save_sampler_state:
|
| 229 |
+
desc: null
|
| 230 |
+
value: false
|
| 231 |
+
label_smoothing:
|
| 232 |
+
desc: null
|
| 233 |
+
value: 0.0
|
| 234 |
+
save_n_checkpoints:
|
| 235 |
+
desc: null
|
| 236 |
+
value: 10
|
| 237 |
+
hf_repo_id:
|
| 238 |
+
desc: null
|
| 239 |
+
value: koichi12/tiny-llama
|
| 240 |
+
create_public_hf_repo:
|
| 241 |
+
desc: null
|
| 242 |
+
value: false
|
| 243 |
+
upload_all_checkpoints_to_hf:
|
| 244 |
+
desc: null
|
| 245 |
+
value: false
|
| 246 |
+
hf_upload_retry_limit:
|
| 247 |
+
desc: null
|
| 248 |
+
value: 2
|
| 249 |
+
exit_duration_in_mins:
|
| 250 |
+
desc: null
|
| 251 |
+
value: null
|
| 252 |
+
source_key:
|
| 253 |
+
desc: null
|
| 254 |
+
value: null
|
| 255 |
+
target_key:
|
| 256 |
+
desc: null
|
| 257 |
+
value: null
|
| 258 |
+
attn_implementation:
|
| 259 |
+
desc: null
|
| 260 |
+
value: flash_attention_2
|
| 261 |
+
efficient_instruction_tuning:
|
| 262 |
+
desc: null
|
| 263 |
+
value: false
|
| 264 |
+
remove_padding_masking:
|
| 265 |
+
desc: null
|
| 266 |
+
value: false
|
| 267 |
+
save_start_iter:
|
| 268 |
+
desc: null
|
| 269 |
+
value: null
|
| 270 |
+
rank:
|
| 271 |
+
desc: null
|
| 272 |
+
value: 0
|
| 273 |
+
world_size:
|
| 274 |
+
desc: null
|
| 275 |
+
value: 1
|
| 276 |
+
padded_vocab_size:
|
| 277 |
+
desc: null
|
| 278 |
+
value: 32000
|
| 279 |
+
gradient_accumulation_steps:
|
| 280 |
+
desc: null
|
| 281 |
+
value: 40
|
| 282 |
+
_wandb:
|
| 283 |
+
desc: null
|
| 284 |
+
value:
|
| 285 |
+
python_version: 3.10.12
|
| 286 |
+
cli_version: 0.16.3
|
| 287 |
+
framework: huggingface
|
| 288 |
+
huggingface_version: 4.43.3
|
| 289 |
+
is_jupyter_run: false
|
| 290 |
+
is_kaggle_kernel: false
|
| 291 |
+
start_time: 1722748970.443993
|
| 292 |
+
t:
|
| 293 |
+
1:
|
| 294 |
+
- 1
|
| 295 |
+
- 11
|
| 296 |
+
- 49
|
| 297 |
+
- 55
|
| 298 |
+
- 71
|
| 299 |
+
2:
|
| 300 |
+
- 1
|
| 301 |
+
- 11
|
| 302 |
+
- 49
|
| 303 |
+
- 55
|
| 304 |
+
- 71
|
| 305 |
+
3:
|
| 306 |
+
- 13
|
| 307 |
+
- 16
|
| 308 |
+
- 23
|
| 309 |
+
4: 3.10.12
|
| 310 |
+
5: 0.16.3
|
| 311 |
+
6: 4.43.3
|
| 312 |
+
8:
|
| 313 |
+
- 5
|
| 314 |
+
13: linux-x86_64
|
| 315 |
+
activation_function:
|
| 316 |
+
desc: null
|
| 317 |
+
value: silu
|
| 318 |
+
hidden_size:
|
| 319 |
+
desc: null
|
| 320 |
+
value: 2048
|
| 321 |
+
model_type:
|
| 322 |
+
desc: null
|
| 323 |
+
value: llama
|
| 324 |
+
max_position_embeddings:
|
| 325 |
+
desc: null
|
| 326 |
+
value: 2048
|
| 327 |
+
num_attention_heads:
|
| 328 |
+
desc: null
|
| 329 |
+
value: 32
|
| 330 |
+
num_hidden_layers:
|
| 331 |
+
desc: null
|
| 332 |
+
value: 22
|
| 333 |
+
model_architecture:
|
| 334 |
+
desc: null
|
| 335 |
+
value: LlamaForCausalLM
|
wandb/run-20240804_142250-6p58tz1g/files/output.log
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/tiny-llama.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 8 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 9 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 10 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
|
| 11 |
+
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
| 12 |
+
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
|
| 13 |
+
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
|
| 14 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
| 15 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
| 16 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
| 17 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
| 18 |
+
warnings.warn(
|
| 19 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
| 20 |
+
--> applying fsdp activation checkpointing...
|
| 21 |
+
> datasets target sizes (minimum size):
|
| 22 |
+
train: 640000
|
| 23 |
+
validation: 35200
|
| 24 |
+
test: 3200
|
| 25 |
+
> building train, validation, and test datasets for GPT ...
|
| 26 |
+
> finished creating GPT datasets ...
|
| 27 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 28 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 29 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
|
| 30 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 31 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 32 |
+
model info: FullyShardedDataParallel(
|
| 33 |
+
(_fsdp_wrapped_module): LlamaForCausalLM(
|
| 34 |
+
(model): LlamaModel(
|
| 35 |
+
(embed_tokens): Embedding(32000, 2048)
|
| 36 |
+
(layers): ModuleList(
|
| 37 |
+
(0-21): 22 x FullyShardedDataParallel(
|
| 38 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
| 39 |
+
(_checkpoint_wrapped_module): LlamaDecoderLayer(
|
| 40 |
+
(self_attn): LlamaFlashAttention2(
|
| 41 |
+
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
| 42 |
+
(k_proj): Linear(in_features=2048, out_features=256, bias=False)
|
| 43 |
+
(v_proj): Linear(in_features=2048, out_features=256, bias=False)
|
| 44 |
+
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
| 45 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
| 46 |
+
)
|
| 47 |
+
(mlp): LlamaMLP(
|
| 48 |
+
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
| 49 |
+
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
| 50 |
+
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
|
| 51 |
+
(act_fn): SiLU()
|
| 52 |
+
)
|
| 53 |
+
(input_layernorm): LlamaRMSNorm()
|
| 54 |
+
(post_attention_layernorm): LlamaRMSNorm()
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
)
|
| 58 |
+
)
|
| 59 |
+
(norm): LlamaRMSNorm()
|
| 60 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
| 61 |
+
)
|
| 62 |
+
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
|
| 63 |
+
)
|
| 64 |
+
)
|
| 65 |
+
model config: LlamaConfig {
|
| 66 |
+
"_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
| 67 |
+
"architectures": [
|
| 68 |
+
"LlamaForCausalLM"
|
| 69 |
+
],
|
| 70 |
+
"attention_bias": false,
|
| 71 |
+
"attention_dropout": 0.0,
|
| 72 |
+
"bos_token_id": 1,
|
| 73 |
+
"eos_token_id": 2,
|
| 74 |
+
"hidden_act": "silu",
|
| 75 |
+
"hidden_size": 2048,
|
| 76 |
+
"initializer_range": 0.02,
|
| 77 |
+
"intermediate_size": 5632,
|
| 78 |
+
"label_smoothing": 0.0,
|
| 79 |
+
"max_position_embeddings": 2048,
|
| 80 |
+
"mlp_bias": false,
|
| 81 |
+
"model_type": "llama",
|
| 82 |
+
"num_attention_heads": 32,
|
| 83 |
+
"num_hidden_layers": 22,
|
| 84 |
+
"num_key_value_heads": 4,
|
| 85 |
+
"pretraining_tp": 1,
|
| 86 |
+
"rms_norm_eps": 1e-05,
|
| 87 |
+
"rope_scaling": null,
|
| 88 |
+
"rope_theta": 10000.0,
|
| 89 |
+
"tie_word_embeddings": false,
|
| 90 |
+
"torch_dtype": "float32",
|
| 91 |
+
"transformers_version": "4.43.3",
|
| 92 |
+
"use_cache": false,
|
| 93 |
+
"vocab_size": 32000
|
| 94 |
+
}
|
| 95 |
+
Let split = None
|
| 96 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 97 |
+
Unable to save the indexes because path_to_cache is None
|
| 98 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 99 |
+
Unable to save the indexes because path_to_cache is None
|
| 100 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 101 |
+
Unable to save the indexes because path_to_cache is None
|
| 102 |
+
Traceback (most recent call last):
|
| 103 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 104 |
+
main()
|
| 105 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
| 106 |
+
train(
|
| 107 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
|
| 108 |
+
batch = next(train_dataloader)
|
| 109 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
|
| 110 |
+
for x in iter:
|
| 111 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
|
| 112 |
+
data = self._next_data()
|
| 113 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
|
| 114 |
+
return self._process_data(data)
|
| 115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
|
| 116 |
+
data.reraise()
|
| 117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
|
| 118 |
+
raise exception
|
| 119 |
+
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
|
| 120 |
+
Original Traceback (most recent call last):
|
| 121 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
|
| 122 |
+
data = fetcher.fetch(index)
|
| 123 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
|
| 124 |
+
return self.collate_fn(data)
|
| 125 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
|
| 126 |
+
return collate(batch, collate_fn_map=default_collate_fn_map)
|
| 127 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
|
| 128 |
+
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
| 129 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
|
| 130 |
+
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
| 131 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
|
| 132 |
+
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
|
| 133 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
|
| 134 |
+
return torch.stack(batch, 0, out=out)
|
| 135 |
+
RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
|
wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
accelerate==0.33.0
|
| 3 |
+
aiohttp==3.9.1
|
| 4 |
+
aiosignal==1.3.1
|
| 5 |
+
annotated-types==0.6.0
|
| 6 |
+
apex==0.1
|
| 7 |
+
appdirs==1.4.4
|
| 8 |
+
argon2-cffi-bindings==21.2.0
|
| 9 |
+
argon2-cffi==23.1.0
|
| 10 |
+
asttokens==2.4.1
|
| 11 |
+
astunparse==1.6.3
|
| 12 |
+
async-timeout==4.0.3
|
| 13 |
+
attrs==23.2.0
|
| 14 |
+
audioread==3.0.1
|
| 15 |
+
beautifulsoup4==4.12.3
|
| 16 |
+
bleach==6.1.0
|
| 17 |
+
blis==0.7.11
|
| 18 |
+
cachetools==5.3.2
|
| 19 |
+
catalogue==2.0.10
|
| 20 |
+
certifi==2024.2.2
|
| 21 |
+
cffi==1.16.0
|
| 22 |
+
charset-normalizer==3.3.2
|
| 23 |
+
click==8.1.7
|
| 24 |
+
cloudpathlib==0.16.0
|
| 25 |
+
cloudpickle==3.0.0
|
| 26 |
+
cmake==3.28.1
|
| 27 |
+
colorama==0.4.6
|
| 28 |
+
comm==0.2.1
|
| 29 |
+
confection==0.1.4
|
| 30 |
+
contourpy==1.2.0
|
| 31 |
+
cubinlinker==0.3.0+2.g405ac64
|
| 32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
| 33 |
+
cudf==23.12.0
|
| 34 |
+
cugraph-dgl==23.12.0
|
| 35 |
+
cugraph-service-client==23.12.0
|
| 36 |
+
cugraph-service-server==23.12.0
|
| 37 |
+
cugraph==23.12.0
|
| 38 |
+
cuml==23.12.0
|
| 39 |
+
cupy-cuda12x==12.3.0
|
| 40 |
+
cycler==0.12.1
|
| 41 |
+
cymem==2.0.8
|
| 42 |
+
cython==3.0.8
|
| 43 |
+
dask-cuda==23.12.0
|
| 44 |
+
dask-cudf==23.12.0
|
| 45 |
+
dask==2023.11.0
|
| 46 |
+
debugpy==1.8.1
|
| 47 |
+
decorator==5.1.1
|
| 48 |
+
defusedxml==0.7.1
|
| 49 |
+
distributed==2023.11.0
|
| 50 |
+
dm-tree==0.1.8
|
| 51 |
+
docker-pycreds==0.4.0
|
| 52 |
+
einops==0.7.0
|
| 53 |
+
exceptiongroup==1.2.0
|
| 54 |
+
execnet==2.0.2
|
| 55 |
+
executing==2.0.1
|
| 56 |
+
expecttest==0.1.3
|
| 57 |
+
fastjsonschema==2.19.1
|
| 58 |
+
fastrlock==0.8.2
|
| 59 |
+
filelock==3.13.1
|
| 60 |
+
flash-attn==2.4.2
|
| 61 |
+
fonttools==4.48.1
|
| 62 |
+
frozenlist==1.4.1
|
| 63 |
+
fsspec==2023.12.2
|
| 64 |
+
gast==0.5.4
|
| 65 |
+
gitdb==4.0.11
|
| 66 |
+
gitpython==3.1.43
|
| 67 |
+
google-auth-oauthlib==0.4.6
|
| 68 |
+
google-auth==2.27.0
|
| 69 |
+
graphsurgeon==0.4.6
|
| 70 |
+
grpcio==1.60.1
|
| 71 |
+
huggingface-hub==0.24.5
|
| 72 |
+
hypothesis==5.35.1
|
| 73 |
+
idna==3.6
|
| 74 |
+
importlib-metadata==7.0.1
|
| 75 |
+
iniconfig==2.0.0
|
| 76 |
+
intel-openmp==2021.4.0
|
| 77 |
+
ipadic==1.0.0
|
| 78 |
+
ipykernel==6.29.2
|
| 79 |
+
ipython-genutils==0.2.0
|
| 80 |
+
ipython==8.21.0
|
| 81 |
+
jedi==0.19.1
|
| 82 |
+
jinja2==3.1.3
|
| 83 |
+
joblib==1.3.2
|
| 84 |
+
json5==0.9.14
|
| 85 |
+
jsonnet==0.19.1
|
| 86 |
+
jsonschema-specifications==2023.12.1
|
| 87 |
+
jsonschema==4.21.1
|
| 88 |
+
jupyter-client==8.6.0
|
| 89 |
+
jupyter-core==5.7.1
|
| 90 |
+
jupyter-tensorboard==0.2.0
|
| 91 |
+
jupyterlab-pygments==0.3.0
|
| 92 |
+
jupyterlab-server==1.2.0
|
| 93 |
+
jupyterlab==2.3.2
|
| 94 |
+
jupytext==1.16.1
|
| 95 |
+
kiwisolver==1.4.5
|
| 96 |
+
langcodes==3.3.0
|
| 97 |
+
lazy-loader==0.3
|
| 98 |
+
librosa==0.10.1
|
| 99 |
+
llvmlite==0.40.1
|
| 100 |
+
locket==1.0.0
|
| 101 |
+
logzero==1.7.0
|
| 102 |
+
lxml==5.2.2
|
| 103 |
+
markdown-it-py==3.0.0
|
| 104 |
+
markdown==3.5.2
|
| 105 |
+
markupsafe==2.1.4
|
| 106 |
+
matplotlib-inline==0.1.6
|
| 107 |
+
matplotlib==3.8.2
|
| 108 |
+
mdit-py-plugins==0.4.0
|
| 109 |
+
mdurl==0.1.2
|
| 110 |
+
mecab-python3==1.0.6
|
| 111 |
+
mistune==3.0.2
|
| 112 |
+
mkl-devel==2021.1.1
|
| 113 |
+
mkl-include==2021.1.1
|
| 114 |
+
mkl==2021.1.1
|
| 115 |
+
mock==5.1.0
|
| 116 |
+
more-itertools==9.1.0
|
| 117 |
+
mpmath==1.3.0
|
| 118 |
+
msgpack==1.0.7
|
| 119 |
+
multidict==6.0.4
|
| 120 |
+
murmurhash==1.0.10
|
| 121 |
+
nbclient==0.9.0
|
| 122 |
+
nbconvert==7.16.0
|
| 123 |
+
nbformat==5.9.2
|
| 124 |
+
nest-asyncio==1.6.0
|
| 125 |
+
networkx==2.6.3
|
| 126 |
+
ninja==1.11.1.1
|
| 127 |
+
nltk==3.8.1
|
| 128 |
+
notebook==6.4.10
|
| 129 |
+
numba==0.57.1+1.g1ff679645
|
| 130 |
+
numpy==1.24.4
|
| 131 |
+
nvfuser==0.1.4a0+d0bb811
|
| 132 |
+
nvidia-dali-cuda120==1.34.0
|
| 133 |
+
nvidia-pyindex==1.0.9
|
| 134 |
+
nvtx==0.2.5
|
| 135 |
+
oauthlib==3.2.2
|
| 136 |
+
onnx==1.15.0rc2
|
| 137 |
+
opencv==4.7.0
|
| 138 |
+
optree==0.10.0
|
| 139 |
+
packaging==23.2
|
| 140 |
+
pandas==1.5.3
|
| 141 |
+
pandocfilters==1.5.1
|
| 142 |
+
parso==0.8.3
|
| 143 |
+
partd==1.4.1
|
| 144 |
+
peft==0.11.1
|
| 145 |
+
pexpect==4.9.0
|
| 146 |
+
pillow==10.2.0
|
| 147 |
+
pip==24.0
|
| 148 |
+
platformdirs==4.2.0
|
| 149 |
+
pluggy==1.4.0
|
| 150 |
+
ply==3.11
|
| 151 |
+
polygraphy==0.49.4
|
| 152 |
+
pooch==1.8.0
|
| 153 |
+
portalocker==2.10.1
|
| 154 |
+
preshed==3.0.9
|
| 155 |
+
prettytable==3.9.0
|
| 156 |
+
prometheus-client==0.19.0
|
| 157 |
+
prompt-toolkit==3.0.43
|
| 158 |
+
protobuf==4.24.4
|
| 159 |
+
psutil==5.9.4
|
| 160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
| 161 |
+
ptyprocess==0.7.0
|
| 162 |
+
pure-eval==0.2.2
|
| 163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
| 164 |
+
pyasn1-modules==0.3.0
|
| 165 |
+
pyasn1==0.5.1
|
| 166 |
+
pybind11-global==2.11.1
|
| 167 |
+
pybind11==2.11.1
|
| 168 |
+
pycocotools==2.0+nv0.8.0
|
| 169 |
+
pycparser==2.21
|
| 170 |
+
pydantic-core==2.16.2
|
| 171 |
+
pydantic==2.6.1
|
| 172 |
+
pygments==2.17.2
|
| 173 |
+
pylibcugraph==23.12.0
|
| 174 |
+
pylibcugraphops==23.12.0
|
| 175 |
+
pylibraft==23.12.0
|
| 176 |
+
pynvml==11.4.1
|
| 177 |
+
pyparsing==3.1.1
|
| 178 |
+
pytest-flakefinder==1.1.0
|
| 179 |
+
pytest-rerunfailures==13.0
|
| 180 |
+
pytest-shard==0.1.2
|
| 181 |
+
pytest-xdist==3.5.0
|
| 182 |
+
pytest==8.0.0
|
| 183 |
+
python-dateutil==2.8.2
|
| 184 |
+
python-dotenv==1.0.0
|
| 185 |
+
python-hostlist==1.23.0
|
| 186 |
+
pytorch-quantization==2.1.2
|
| 187 |
+
pytz==2023.3.post1
|
| 188 |
+
pyyaml==6.0.1
|
| 189 |
+
pyzmq==25.1.2
|
| 190 |
+
raft-dask==23.12.0
|
| 191 |
+
rapids-dask-dependency==23.12.1
|
| 192 |
+
referencing==0.33.0
|
| 193 |
+
regex==2023.12.25
|
| 194 |
+
requests-oauthlib==1.3.1
|
| 195 |
+
requests==2.31.0
|
| 196 |
+
rich==13.7.0
|
| 197 |
+
rmm==23.12.0
|
| 198 |
+
rpds-py==0.17.1
|
| 199 |
+
rsa==4.9
|
| 200 |
+
sacrebleu==2.4.0
|
| 201 |
+
safetensors==0.4.3
|
| 202 |
+
scikit-learn==1.2.0
|
| 203 |
+
scipy==1.12.0
|
| 204 |
+
send2trash==1.8.2
|
| 205 |
+
sentencepiece==0.1.99
|
| 206 |
+
sentry-sdk==2.12.0
|
| 207 |
+
setproctitle==1.3.3
|
| 208 |
+
setuptools==68.2.2
|
| 209 |
+
six==1.16.0
|
| 210 |
+
smart-open==6.4.0
|
| 211 |
+
smmap==5.0.1
|
| 212 |
+
sortedcontainers==2.4.0
|
| 213 |
+
soundfile==0.12.1
|
| 214 |
+
soupsieve==2.5
|
| 215 |
+
soxr==0.3.7
|
| 216 |
+
spacy-legacy==3.0.12
|
| 217 |
+
spacy-loggers==1.0.5
|
| 218 |
+
spacy==3.7.2
|
| 219 |
+
sphinx-glpi-theme==0.6
|
| 220 |
+
srsly==2.4.8
|
| 221 |
+
stack-data==0.6.3
|
| 222 |
+
sympy==1.12
|
| 223 |
+
tabulate==0.9.0
|
| 224 |
+
tbb==2021.11.0
|
| 225 |
+
tblib==3.0.0
|
| 226 |
+
tensorboard-data-server==0.6.1
|
| 227 |
+
tensorboard-plugin-wit==1.8.1
|
| 228 |
+
tensorboard==2.9.0
|
| 229 |
+
tensorrt==8.6.3
|
| 230 |
+
terminado==0.18.0
|
| 231 |
+
termplotlib==0.3.9
|
| 232 |
+
thinc==8.2.3
|
| 233 |
+
threadpoolctl==3.2.0
|
| 234 |
+
thriftpy2==0.4.17
|
| 235 |
+
tinycss2==1.2.1
|
| 236 |
+
tokenizers==0.19.1
|
| 237 |
+
toml==0.10.2
|
| 238 |
+
tomli==2.0.1
|
| 239 |
+
toolz==0.12.1
|
| 240 |
+
torch-tensorrt==2.3.0a0
|
| 241 |
+
torch==2.3.0a0+ebedce2
|
| 242 |
+
torchdata==0.7.1a0
|
| 243 |
+
torchtext==0.17.0a0
|
| 244 |
+
torchvision==0.18.0a0
|
| 245 |
+
tornado==6.4
|
| 246 |
+
tqdm==4.66.1
|
| 247 |
+
traitlets==5.9.0
|
| 248 |
+
transformer-engine==1.3.0+5b90b7f
|
| 249 |
+
transformers==4.43.3
|
| 250 |
+
treelite-runtime==3.9.1
|
| 251 |
+
treelite==3.9.1
|
| 252 |
+
triton==2.2.0+e28a256
|
| 253 |
+
typer==0.9.0
|
| 254 |
+
types-dataclasses==0.6.6
|
| 255 |
+
typing-extensions==4.9.0
|
| 256 |
+
ucx-py==0.35.0
|
| 257 |
+
uff==0.6.9
|
| 258 |
+
ujson==5.8.0
|
| 259 |
+
urllib3==1.26.18
|
| 260 |
+
wandb==0.16.3
|
| 261 |
+
wasabi==1.1.2
|
| 262 |
+
wcwidth==0.2.13
|
| 263 |
+
weasel==0.3.4
|
| 264 |
+
webencodings==0.5.1
|
| 265 |
+
werkzeug==3.0.1
|
| 266 |
+
wheel==0.42.0
|
| 267 |
+
xdoctest==1.0.2
|
| 268 |
+
xgboost==1.7.6
|
| 269 |
+
yarl==1.9.4
|
| 270 |
+
zict==3.0.0
|
| 271 |
+
zipp==3.17.0
|
wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "3.10.12",
|
| 4 |
+
"heartbeatAt": "2024-08-04T05:22:51.055103",
|
| 5 |
+
"startedAt": "2024-08-04T05:22:50.431050",
|
| 6 |
+
"docker": null,
|
| 7 |
+
"cuda": null,
|
| 8 |
+
"args": [
|
| 9 |
+
"--seq-length",
|
| 10 |
+
"512",
|
| 11 |
+
"--sliding-window-size",
|
| 12 |
+
"4096",
|
| 13 |
+
"--micro-batch-size",
|
| 14 |
+
"8",
|
| 15 |
+
"--global-batch-size",
|
| 16 |
+
"320",
|
| 17 |
+
"--train-iters",
|
| 18 |
+
"2000",
|
| 19 |
+
"--tokenizer-type",
|
| 20 |
+
"Llama2Tokenizer",
|
| 21 |
+
"--tokenizer-model",
|
| 22 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
|
| 23 |
+
"--train-data-path",
|
| 24 |
+
"4013541",
|
| 25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 26 |
+
"--valid-data-path",
|
| 27 |
+
"4013541",
|
| 28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 29 |
+
"--test-data-path",
|
| 30 |
+
"4013541",
|
| 31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 32 |
+
"--lr",
|
| 33 |
+
"2e-5",
|
| 34 |
+
"--min-lr",
|
| 35 |
+
"1e-6",
|
| 36 |
+
"--lr-decay-style",
|
| 37 |
+
"cosine",
|
| 38 |
+
"--lr-warmup-iters",
|
| 39 |
+
"500",
|
| 40 |
+
"--lr-decay-iters",
|
| 41 |
+
"2000",
|
| 42 |
+
"--weight-decay",
|
| 43 |
+
"0.1",
|
| 44 |
+
"--grad-clip-norm",
|
| 45 |
+
"1.0",
|
| 46 |
+
"--optimizer",
|
| 47 |
+
"adam",
|
| 48 |
+
"--adam-beta1",
|
| 49 |
+
"0.9",
|
| 50 |
+
"--adam-beta2",
|
| 51 |
+
"0.95",
|
| 52 |
+
"--adam-eps",
|
| 53 |
+
"1e-6",
|
| 54 |
+
"--save-interval",
|
| 55 |
+
"200",
|
| 56 |
+
"--eval-interval",
|
| 57 |
+
"200",
|
| 58 |
+
"--eval-iters",
|
| 59 |
+
"10",
|
| 60 |
+
"--bf16",
|
| 61 |
+
"--mixed-precision",
|
| 62 |
+
"--base-model",
|
| 63 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
| 64 |
+
"--save",
|
| 65 |
+
"/work/llm_recipes/models/tiny-llama",
|
| 66 |
+
"--load",
|
| 67 |
+
"/work/llm_recipes/models/tiny-llama",
|
| 68 |
+
"--fsdp-activation-checkpointing",
|
| 69 |
+
"--sharding-strategy",
|
| 70 |
+
"FULL_SHARD",
|
| 71 |
+
"--checkpoint-type",
|
| 72 |
+
"LOCAL_STATE_DICT",
|
| 73 |
+
"--save-n-checkpoints",
|
| 74 |
+
"10",
|
| 75 |
+
"--hf-upload-retry-limit",
|
| 76 |
+
"2",
|
| 77 |
+
"--hf-repo-id",
|
| 78 |
+
"koichi12/tiny-llama",
|
| 79 |
+
"--wandb-entity",
|
| 80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
| 81 |
+
"--wandb-project",
|
| 82 |
+
"llm_tutorial",
|
| 83 |
+
"--wandb-name",
|
| 84 |
+
"tiny-llama_train_2024-08-04-14:22:39"
|
| 85 |
+
],
|
| 86 |
+
"state": "running",
|
| 87 |
+
"program": "/project/examples/finetuning.py",
|
| 88 |
+
"codePathLocal": "examples/finetuning.py",
|
| 89 |
+
"codePath": "examples/finetuning.py",
|
| 90 |
+
"git": {
|
| 91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
| 92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
| 93 |
+
},
|
| 94 |
+
"email": null,
|
| 95 |
+
"root": "/project",
|
| 96 |
+
"host": "gpu-koiwa-00",
|
| 97 |
+
"username": "koiwa",
|
| 98 |
+
"executable": "/usr/bin/python",
|
| 99 |
+
"cpu_count": 18,
|
| 100 |
+
"cpu_count_logical": 18,
|
| 101 |
+
"cpu_freq": {
|
| 102 |
+
"current": 2400.0389999999993,
|
| 103 |
+
"min": 0.0,
|
| 104 |
+
"max": 0.0
|
| 105 |
+
},
|
| 106 |
+
"cpu_freq_per_core": [
|
| 107 |
+
{
|
| 108 |
+
"current": 2400.039,
|
| 109 |
+
"min": 0.0,
|
| 110 |
+
"max": 0.0
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"current": 2400.039,
|
| 114 |
+
"min": 0.0,
|
| 115 |
+
"max": 0.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"current": 2400.039,
|
| 119 |
+
"min": 0.0,
|
| 120 |
+
"max": 0.0
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"current": 2400.039,
|
| 124 |
+
"min": 0.0,
|
| 125 |
+
"max": 0.0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"current": 2400.039,
|
| 129 |
+
"min": 0.0,
|
| 130 |
+
"max": 0.0
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"current": 2400.039,
|
| 134 |
+
"min": 0.0,
|
| 135 |
+
"max": 0.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"current": 2400.039,
|
| 139 |
+
"min": 0.0,
|
| 140 |
+
"max": 0.0
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"current": 2400.039,
|
| 144 |
+
"min": 0.0,
|
| 145 |
+
"max": 0.0
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"current": 2400.039,
|
| 149 |
+
"min": 0.0,
|
| 150 |
+
"max": 0.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"current": 2400.039,
|
| 154 |
+
"min": 0.0,
|
| 155 |
+
"max": 0.0
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"current": 2400.039,
|
| 159 |
+
"min": 0.0,
|
| 160 |
+
"max": 0.0
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"current": 2400.039,
|
| 164 |
+
"min": 0.0,
|
| 165 |
+
"max": 0.0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"current": 2400.039,
|
| 169 |
+
"min": 0.0,
|
| 170 |
+
"max": 0.0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"current": 2400.039,
|
| 174 |
+
"min": 0.0,
|
| 175 |
+
"max": 0.0
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"current": 2400.039,
|
| 179 |
+
"min": 0.0,
|
| 180 |
+
"max": 0.0
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"current": 2400.039,
|
| 184 |
+
"min": 0.0,
|
| 185 |
+
"max": 0.0
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"current": 2400.039,
|
| 189 |
+
"min": 0.0,
|
| 190 |
+
"max": 0.0
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"current": 2400.039,
|
| 194 |
+
"min": 0.0,
|
| 195 |
+
"max": 0.0
|
| 196 |
+
}
|
| 197 |
+
],
|
| 198 |
+
"disk": {
|
| 199 |
+
"/": {
|
| 200 |
+
"total": 0.0625,
|
| 201 |
+
"used": 1.1444091796875e-05
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
| 205 |
+
"gpu_count": 1,
|
| 206 |
+
"gpu_devices": [
|
| 207 |
+
{
|
| 208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
| 209 |
+
"memory_total": 42949672960
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"memory": {
|
| 213 |
+
"total": 56.48781967163086
|
| 214 |
+
}
|
| 215 |
+
}
|
wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb": {"runtime": 2}}
|
wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 14:22:50,445 INFO StreamThr :10451 [internal.py:wandb_internal():86] W&B internal server running at pid: 10451, started at: 2024-08-04 14:22:50.444819
|
| 2 |
+
2024-08-04 14:22:50,447 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: status
|
| 3 |
+
2024-08-04 14:22:50,449 INFO WriterThread:10451 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
|
| 4 |
+
2024-08-04 14:22:50,450 DEBUG SenderThread:10451 [sender.py:send():382] send: header
|
| 5 |
+
2024-08-04 14:22:50,463 DEBUG SenderThread:10451 [sender.py:send():382] send: run
|
| 6 |
+
2024-08-04 14:22:50,941 INFO SenderThread:10451 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_142250-6p58tz1g/files
|
| 7 |
+
2024-08-04 14:22:50,941 INFO SenderThread:10451 [sender.py:_start_run_threads():1136] run started: 6p58tz1g with start time 1722748970.443993
|
| 8 |
+
2024-08-04 14:22:50,946 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: check_version
|
| 9 |
+
2024-08-04 14:22:50,946 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: check_version
|
| 10 |
+
2024-08-04 14:22:51,034 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: run_start
|
| 11 |
+
2024-08-04 14:22:51,041 DEBUG HandlerThread:10451 [system_info.py:__init__():27] System info init
|
| 12 |
+
2024-08-04 14:22:51,041 DEBUG HandlerThread:10451 [system_info.py:__init__():42] System info init done
|
| 13 |
+
2024-08-04 14:22:51,041 INFO HandlerThread:10451 [system_monitor.py:start():194] Starting system monitor
|
| 14 |
+
2024-08-04 14:22:51,041 INFO SystemMonitor:10451 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
| 15 |
+
2024-08-04 14:22:51,042 INFO HandlerThread:10451 [system_monitor.py:probe():214] Collecting system info
|
| 16 |
+
2024-08-04 14:22:51,042 INFO SystemMonitor:10451 [interfaces.py:start():190] Started cpu monitoring
|
| 17 |
+
2024-08-04 14:22:51,043 INFO SystemMonitor:10451 [interfaces.py:start():190] Started disk monitoring
|
| 18 |
+
2024-08-04 14:22:51,044 INFO SystemMonitor:10451 [interfaces.py:start():190] Started gpu monitoring
|
| 19 |
+
2024-08-04 14:22:51,044 INFO SystemMonitor:10451 [interfaces.py:start():190] Started memory monitoring
|
| 20 |
+
2024-08-04 14:22:51,045 INFO SystemMonitor:10451 [interfaces.py:start():190] Started network monitoring
|
| 21 |
+
2024-08-04 14:22:51,055 DEBUG HandlerThread:10451 [system_info.py:probe():151] Probing system
|
| 22 |
+
2024-08-04 14:22:51,059 DEBUG HandlerThread:10451 [system_info.py:_probe_git():136] Probing git
|
| 23 |
+
2024-08-04 14:22:51,071 DEBUG HandlerThread:10451 [system_info.py:_probe_git():144] Probing git done
|
| 24 |
+
2024-08-04 14:22:51,071 DEBUG HandlerThread:10451 [system_info.py:probe():199] Probing system done
|
| 25 |
+
2024-08-04 14:22:51,071 DEBUG HandlerThread:10451 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:22:51.055103', 'startedAt': '2024-08-04T05:22:50.431050', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:22:39'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
|
| 26 |
+
2024-08-04 14:22:51,072 INFO HandlerThread:10451 [system_monitor.py:probe():224] Finished collecting system info
|
| 27 |
+
2024-08-04 14:22:51,072 INFO HandlerThread:10451 [system_monitor.py:probe():227] Publishing system info
|
| 28 |
+
2024-08-04 14:22:51,073 INFO HandlerThread:10451 [system_monitor.py:probe():229] Finished publishing system info
|
| 29 |
+
2024-08-04 14:22:51,079 DEBUG SenderThread:10451 [sender.py:send():382] send: files
|
| 30 |
+
2024-08-04 14:22:51,079 INFO SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
| 31 |
+
2024-08-04 14:22:51,089 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: python_packages
|
| 32 |
+
2024-08-04 14:22:51,089 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: stop_status
|
| 33 |
+
2024-08-04 14:22:51,089 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: python_packages
|
| 34 |
+
2024-08-04 14:22:51,090 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: internal_messages
|
| 35 |
+
2024-08-04 14:22:51,091 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: stop_status
|
| 36 |
+
2024-08-04 14:22:51,412 DEBUG SenderThread:10451 [sender.py:send():382] send: telemetry
|
| 37 |
+
2024-08-04 14:22:51,834 INFO wandb-upload_0:10451 [upload_job.py:push():131] Uploaded file /tmp/tmpvai5nc9ewandb/lc3l5ghh-wandb-metadata.json
|
| 38 |
+
2024-08-04 14:22:51,943 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
|
| 39 |
+
2024-08-04 14:22:51,943 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json
|
| 40 |
+
2024-08-04 14:22:51,943 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
|
| 41 |
+
2024-08-04 14:22:53,535 DEBUG SenderThread:10451 [sender.py:send():382] send: config
|
| 42 |
+
2024-08-04 14:22:53,536 DEBUG SenderThread:10451 [sender.py:send():382] send: config
|
| 43 |
+
2024-08-04 14:22:53,643 DEBUG SenderThread:10451 [sender.py:send():382] send: exit
|
| 44 |
+
2024-08-04 14:22:53,643 INFO SenderThread:10451 [sender.py:send_exit():589] handling exit code: 1
|
| 45 |
+
2024-08-04 14:22:53,643 INFO SenderThread:10451 [sender.py:send_exit():591] handling runtime: 2
|
| 46 |
+
2024-08-04 14:22:53,644 INFO SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 47 |
+
2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:send_exit():597] send defer
|
| 48 |
+
2024-08-04 14:22:53,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 49 |
+
2024-08-04 14:22:53,645 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 0
|
| 50 |
+
2024-08-04 14:22:53,645 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 51 |
+
2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 0
|
| 52 |
+
2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 1
|
| 53 |
+
2024-08-04 14:22:53,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 54 |
+
2024-08-04 14:22:53,645 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 1
|
| 55 |
+
2024-08-04 14:22:53,645 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 56 |
+
2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 1
|
| 57 |
+
2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 2
|
| 58 |
+
2024-08-04 14:22:53,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 59 |
+
2024-08-04 14:22:53,645 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 2
|
| 60 |
+
2024-08-04 14:22:53,645 INFO HandlerThread:10451 [system_monitor.py:finish():203] Stopping system monitor
|
| 61 |
+
2024-08-04 14:22:53,646 DEBUG SystemMonitor:10451 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
| 62 |
+
2024-08-04 14:22:53,646 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined cpu monitor
|
| 63 |
+
2024-08-04 14:22:53,646 DEBUG SystemMonitor:10451 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
| 64 |
+
2024-08-04 14:22:53,646 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined disk monitor
|
| 65 |
+
2024-08-04 14:22:53,646 DEBUG SystemMonitor:10451 [system_monitor.py:_start():183] Publishing last batch of metrics
|
| 66 |
+
2024-08-04 14:22:53,679 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined gpu monitor
|
| 67 |
+
2024-08-04 14:22:53,679 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined memory monitor
|
| 68 |
+
2024-08-04 14:22:53,679 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined network monitor
|
| 69 |
+
2024-08-04 14:22:53,680 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 70 |
+
2024-08-04 14:22:53,680 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 2
|
| 71 |
+
2024-08-04 14:22:53,680 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 3
|
| 72 |
+
2024-08-04 14:22:53,680 DEBUG SenderThread:10451 [sender.py:send():382] send: stats
|
| 73 |
+
2024-08-04 14:22:53,680 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 74 |
+
2024-08-04 14:22:53,680 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 3
|
| 75 |
+
2024-08-04 14:22:53,680 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 76 |
+
2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 3
|
| 77 |
+
2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 4
|
| 78 |
+
2024-08-04 14:22:53,681 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 79 |
+
2024-08-04 14:22:53,681 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 4
|
| 80 |
+
2024-08-04 14:22:53,681 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 81 |
+
2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 4
|
| 82 |
+
2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 5
|
| 83 |
+
2024-08-04 14:22:53,681 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 84 |
+
2024-08-04 14:22:53,681 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 5
|
| 85 |
+
2024-08-04 14:22:53,681 DEBUG SenderThread:10451 [sender.py:send():382] send: summary
|
| 86 |
+
2024-08-04 14:22:53,682 INFO SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 87 |
+
2024-08-04 14:22:53,682 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 88 |
+
2024-08-04 14:22:53,682 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 5
|
| 89 |
+
2024-08-04 14:22:53,682 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 6
|
| 90 |
+
2024-08-04 14:22:53,683 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 91 |
+
2024-08-04 14:22:53,683 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 6
|
| 92 |
+
2024-08-04 14:22:53,683 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 93 |
+
2024-08-04 14:22:53,683 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 6
|
| 94 |
+
2024-08-04 14:22:53,685 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: status_report
|
| 95 |
+
2024-08-04 14:22:53,891 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 7
|
| 96 |
+
2024-08-04 14:22:53,891 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 97 |
+
2024-08-04 14:22:53,891 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 7
|
| 98 |
+
2024-08-04 14:22:53,892 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 99 |
+
2024-08-04 14:22:53,892 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 7
|
| 100 |
+
2024-08-04 14:22:53,944 INFO Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml
|
| 101 |
+
2024-08-04 14:22:53,944 INFO Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
|
| 102 |
+
2024-08-04 14:22:53,944 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
|
| 103 |
+
2024-08-04 14:22:54,643 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
|
| 104 |
+
2024-08-04 14:22:55,782 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 8
|
| 105 |
+
2024-08-04 14:22:55,783 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
|
| 106 |
+
2024-08-04 14:22:55,783 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 107 |
+
2024-08-04 14:22:55,783 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 8
|
| 108 |
+
2024-08-04 14:22:55,783 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 109 |
+
2024-08-04 14:22:55,783 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 8
|
| 110 |
+
2024-08-04 14:22:55,783 INFO SenderThread:10451 [job_builder.py:build():296] Attempting to build job artifact
|
| 111 |
+
2024-08-04 14:22:55,784 INFO SenderThread:10451 [job_builder.py:_get_source_type():426] is repo sourced job
|
| 112 |
+
2024-08-04 14:22:55,883 INFO SenderThread:10451 [job_builder.py:build():402] adding wandb-job metadata file
|
| 113 |
+
2024-08-04 14:22:55,891 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 9
|
| 114 |
+
2024-08-04 14:22:55,892 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 115 |
+
2024-08-04 14:22:55,892 DEBUG SenderThread:10451 [sender.py:send():382] send: artifact
|
| 116 |
+
2024-08-04 14:22:55,892 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 9
|
| 117 |
+
2024-08-04 14:22:55,945 INFO Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
|
| 118 |
+
2024-08-04 14:22:56,644 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
|
| 119 |
+
2024-08-04 14:22:57,777 INFO SenderThread:10451 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
| 120 |
+
2024-08-04 14:22:57,777 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 121 |
+
2024-08-04 14:22:57,777 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 9
|
| 122 |
+
2024-08-04 14:22:57,777 INFO SenderThread:10451 [dir_watcher.py:finish():358] shutting down directory watcher
|
| 123 |
+
2024-08-04 14:22:57,946 INFO SenderThread:10451 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_142250-6p58tz1g/files
|
| 124 |
+
2024-08-04 14:22:57,946 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt requirements.txt
|
| 125 |
+
2024-08-04 14:22:57,947 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml config.yaml
|
| 126 |
+
2024-08-04 14:22:57,948 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json wandb-metadata.json
|
| 127 |
+
2024-08-04 14:22:57,948 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json wandb-summary.json
|
| 128 |
+
2024-08-04 14:22:57,950 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log output.log
|
| 129 |
+
2024-08-04 14:22:57,952 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 10
|
| 130 |
+
2024-08-04 14:22:57,952 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
|
| 131 |
+
2024-08-04 14:22:57,952 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 132 |
+
2024-08-04 14:22:57,952 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 10
|
| 133 |
+
2024-08-04 14:22:57,954 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 134 |
+
2024-08-04 14:22:57,954 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 10
|
| 135 |
+
2024-08-04 14:22:57,954 INFO SenderThread:10451 [file_pusher.py:finish():172] shutting down file pusher
|
| 136 |
+
2024-08-04 14:22:58,363 INFO wandb-upload_1:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml
|
| 137 |
+
2024-08-04 14:22:58,459 INFO wandb-upload_0:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
|
| 138 |
+
2024-08-04 14:22:58,506 INFO wandb-upload_2:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
|
| 139 |
+
2024-08-04 14:22:58,525 INFO wandb-upload_3:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
|
| 140 |
+
2024-08-04 14:22:58,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
|
| 141 |
+
2024-08-04 14:22:58,645 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
|
| 142 |
+
2024-08-04 14:22:58,725 INFO Thread-11 (_thread_body):10451 [sender.py:transition_state():617] send defer: 11
|
| 143 |
+
2024-08-04 14:22:58,725 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 144 |
+
2024-08-04 14:22:58,725 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 11
|
| 145 |
+
2024-08-04 14:22:58,726 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 146 |
+
2024-08-04 14:22:58,726 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 11
|
| 147 |
+
2024-08-04 14:22:58,726 INFO SenderThread:10451 [file_pusher.py:join():178] waiting for file pusher
|
| 148 |
+
2024-08-04 14:22:58,726 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 12
|
| 149 |
+
2024-08-04 14:22:58,726 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 150 |
+
2024-08-04 14:22:58,726 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 12
|
| 151 |
+
2024-08-04 14:22:58,726 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 152 |
+
2024-08-04 14:22:58,726 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 12
|
| 153 |
+
2024-08-04 14:22:58,726 INFO SenderThread:10451 [file_stream.py:finish():595] file stream finish called
|
| 154 |
+
2024-08-04 14:22:58,910 INFO SenderThread:10451 [file_stream.py:finish():599] file stream finish is done
|
| 155 |
+
2024-08-04 14:22:58,911 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 13
|
| 156 |
+
2024-08-04 14:22:58,911 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 157 |
+
2024-08-04 14:22:58,911 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 13
|
| 158 |
+
2024-08-04 14:22:58,911 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 159 |
+
2024-08-04 14:22:58,911 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 13
|
| 160 |
+
2024-08-04 14:22:58,911 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 14
|
| 161 |
+
2024-08-04 14:22:58,911 DEBUG SenderThread:10451 [sender.py:send():382] send: final
|
| 162 |
+
2024-08-04 14:22:58,911 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
|
| 163 |
+
2024-08-04 14:22:58,912 DEBUG SenderThread:10451 [sender.py:send():382] send: footer
|
| 164 |
+
2024-08-04 14:22:58,912 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 14
|
| 165 |
+
2024-08-04 14:22:58,912 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
|
| 166 |
+
2024-08-04 14:22:58,912 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 14
|
| 167 |
+
2024-08-04 14:22:58,912 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
|
| 168 |
+
2024-08-04 14:22:58,912 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
|
| 169 |
+
2024-08-04 14:22:58,913 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
|
| 170 |
+
2024-08-04 14:22:58,913 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
|
| 171 |
+
2024-08-04 14:22:58,913 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: server_info
|
| 172 |
+
2024-08-04 14:22:58,913 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: get_summary
|
| 173 |
+
2024-08-04 14:22:58,914 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: server_info
|
| 174 |
+
2024-08-04 14:22:58,915 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: sampled_history
|
| 175 |
+
2024-08-04 14:22:58,915 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: internal_messages
|
| 176 |
+
2024-08-04 14:22:58,916 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: job_info
|
| 177 |
+
2024-08-04 14:22:59,080 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: job_info
|
| 178 |
+
2024-08-04 14:22:59,081 INFO MainThread:10451 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
| 179 |
+
2024-08-04 14:22:59,081 INFO MainThread:10451 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
| 180 |
+
2024-08-04 14:22:59,081 INFO MainThread:10451 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
| 181 |
+
2024-08-04 14:22:59,081 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: shutdown
|
| 182 |
+
2024-08-04 14:22:59,081 INFO HandlerThread:10451 [handler.py:finish():869] shutting down handler
|
| 183 |
+
2024-08-04 14:22:59,916 INFO WriterThread:10451 [datastore.py:close():296] close: /project/wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
|
| 184 |
+
2024-08-04 14:23:00,081 INFO SenderThread:10451 [sender.py:finish():1572] shutting down sender
|
| 185 |
+
2024-08-04 14:23:00,081 INFO SenderThread:10451 [file_pusher.py:finish():172] shutting down file pusher
|
| 186 |
+
2024-08-04 14:23:00,081 INFO SenderThread:10451 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_142250-6p58tz1g/logs/debug.log
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
| 2 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Configure stats pid to 10380
|
| 3 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
| 4 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
| 5 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
|
| 6 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
| 7 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
| 8 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_142250-6p58tz1g/logs/debug.log
|
| 9 |
+
2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log
|
| 10 |
+
2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():566] calling init triggers
|
| 11 |
+
2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
| 12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:22:39', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
|
| 13 |
+
2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():616] starting backend
|
| 14 |
+
2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():620] setting up manager
|
| 15 |
+
2024-08-04 14:22:50,443 INFO MainThread:10380 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 16 |
+
2024-08-04 14:22:50,443 INFO MainThread:10380 [wandb_init.py:init():628] backend started and connected
|
| 17 |
+
2024-08-04 14:22:50,448 INFO MainThread:10380 [wandb_init.py:init():720] updated telemetry
|
| 18 |
+
2024-08-04 14:22:50,459 INFO MainThread:10380 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
| 19 |
+
2024-08-04 14:22:50,946 INFO MainThread:10380 [wandb_run.py:_on_init():2262] communicating current version
|
| 20 |
+
2024-08-04 14:22:51,027 INFO MainThread:10380 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
| 21 |
+
|
| 22 |
+
2024-08-04 14:22:51,027 INFO MainThread:10380 [wandb_init.py:init():804] starting run threads in backend
|
| 23 |
+
2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_console_start():2241] atexit reg
|
| 24 |
+
2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
| 25 |
+
2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
| 26 |
+
2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_redirect():2186] Redirects installed.
|
| 27 |
+
2024-08-04 14:22:51,090 INFO MainThread:10380 [wandb_init.py:init():847] run started, returning control to user process
|
| 28 |
+
2024-08-04 14:22:53,535 INFO MainThread:10380 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
|
| 29 |
+
2024-08-04 14:22:53,535 INFO MainThread:10380 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
| 30 |
+
2024-08-04 14:23:00,082 WARNING MsgRouterThr:10380 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
ADDED
|
Binary file (20.5 kB). View file
|
|
|
wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
sharding_strategy:
|
| 4 |
+
desc: null
|
| 5 |
+
value: FULL_SHARD
|
| 6 |
+
checkpoint_type:
|
| 7 |
+
desc: null
|
| 8 |
+
value: LOCAL_STATE_DICT
|
| 9 |
+
fsdp_activation_checkpointing:
|
| 10 |
+
desc: null
|
| 11 |
+
value: true
|
| 12 |
+
fsdp_cpu_offload:
|
| 13 |
+
desc: null
|
| 14 |
+
value: false
|
| 15 |
+
low_cpu_fsdp:
|
| 16 |
+
desc: null
|
| 17 |
+
value: false
|
| 18 |
+
no_meta_device:
|
| 19 |
+
desc: null
|
| 20 |
+
value: false
|
| 21 |
+
data_path:
|
| 22 |
+
desc: null
|
| 23 |
+
value: null
|
| 24 |
+
split:
|
| 25 |
+
desc: null
|
| 26 |
+
value: 969, 30, 1
|
| 27 |
+
train_data_path:
|
| 28 |
+
desc: null
|
| 29 |
+
value:
|
| 30 |
+
- '4013541'
|
| 31 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 32 |
+
valid_data_path:
|
| 33 |
+
desc: null
|
| 34 |
+
value:
|
| 35 |
+
- '4013541'
|
| 36 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 37 |
+
test_data_path:
|
| 38 |
+
desc: null
|
| 39 |
+
value:
|
| 40 |
+
- '4013541'
|
| 41 |
+
- /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
|
| 42 |
+
data_cache_path:
|
| 43 |
+
desc: null
|
| 44 |
+
value: null
|
| 45 |
+
vocab_size:
|
| 46 |
+
desc: null
|
| 47 |
+
value: null
|
| 48 |
+
vocab_file:
|
| 49 |
+
desc: null
|
| 50 |
+
value: null
|
| 51 |
+
merge_file:
|
| 52 |
+
desc: null
|
| 53 |
+
value: null
|
| 54 |
+
seq_length:
|
| 55 |
+
desc: null
|
| 56 |
+
value: 512
|
| 57 |
+
num_workers:
|
| 58 |
+
desc: null
|
| 59 |
+
value: 2
|
| 60 |
+
tokenizer_type:
|
| 61 |
+
desc: null
|
| 62 |
+
value: Llama2Tokenizer
|
| 63 |
+
tokenizer_model:
|
| 64 |
+
desc: null
|
| 65 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
|
| 66 |
+
reset_position_ids:
|
| 67 |
+
desc: null
|
| 68 |
+
value: false
|
| 69 |
+
reset_attention_mask:
|
| 70 |
+
desc: null
|
| 71 |
+
value: false
|
| 72 |
+
eod_mask_loss:
|
| 73 |
+
desc: null
|
| 74 |
+
value: false
|
| 75 |
+
retro_return_doc_ids:
|
| 76 |
+
desc: null
|
| 77 |
+
value: false
|
| 78 |
+
short_seq_prob:
|
| 79 |
+
desc: null
|
| 80 |
+
value: 0.1
|
| 81 |
+
vocab_extra_ids:
|
| 82 |
+
desc: null
|
| 83 |
+
value: 0
|
| 84 |
+
seed:
|
| 85 |
+
desc: null
|
| 86 |
+
value: 1234
|
| 87 |
+
use_mpi:
|
| 88 |
+
desc: null
|
| 89 |
+
value: false
|
| 90 |
+
wandb_entity:
|
| 91 |
+
desc: null
|
| 92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
| 93 |
+
wandb_name:
|
| 94 |
+
desc: null
|
| 95 |
+
value: tiny-llama_train_2024-08-04-14:35:56
|
| 96 |
+
wandb_project:
|
| 97 |
+
desc: null
|
| 98 |
+
value: llm_tutorial
|
| 99 |
+
quantization:
|
| 100 |
+
desc: null
|
| 101 |
+
value: false
|
| 102 |
+
use_freeze_layers:
|
| 103 |
+
desc: null
|
| 104 |
+
value: false
|
| 105 |
+
freeze_layers:
|
| 106 |
+
desc: null
|
| 107 |
+
value: null
|
| 108 |
+
bf16:
|
| 109 |
+
desc: null
|
| 110 |
+
value: true
|
| 111 |
+
fp16:
|
| 112 |
+
desc: null
|
| 113 |
+
value: false
|
| 114 |
+
mixed_precision:
|
| 115 |
+
desc: null
|
| 116 |
+
value: true
|
| 117 |
+
param_dtype:
|
| 118 |
+
desc: null
|
| 119 |
+
value: null
|
| 120 |
+
load:
|
| 121 |
+
desc: null
|
| 122 |
+
value: /work/llm_recipes/models/tiny-llama
|
| 123 |
+
save:
|
| 124 |
+
desc: null
|
| 125 |
+
value: /work/llm_recipes/models/tiny-llama
|
| 126 |
+
base_model:
|
| 127 |
+
desc: null
|
| 128 |
+
value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
| 129 |
+
use_better_transformer:
|
| 130 |
+
desc: null
|
| 131 |
+
value: false
|
| 132 |
+
grad_clip_norm:
|
| 133 |
+
desc: null
|
| 134 |
+
value: 1.0
|
| 135 |
+
eval_interval:
|
| 136 |
+
desc: null
|
| 137 |
+
value: 200
|
| 138 |
+
save_interval:
|
| 139 |
+
desc: null
|
| 140 |
+
value: 200
|
| 141 |
+
eval_iters:
|
| 142 |
+
desc: null
|
| 143 |
+
value: 10
|
| 144 |
+
optimizer:
|
| 145 |
+
desc: null
|
| 146 |
+
value: adam
|
| 147 |
+
lr:
|
| 148 |
+
desc: null
|
| 149 |
+
value: 2.0e-05
|
| 150 |
+
lr_decay_style:
|
| 151 |
+
desc: null
|
| 152 |
+
value: cosine
|
| 153 |
+
lr_decay_iters:
|
| 154 |
+
desc: null
|
| 155 |
+
value: 2000
|
| 156 |
+
lr_warmup_iters:
|
| 157 |
+
desc: null
|
| 158 |
+
value: 500
|
| 159 |
+
min_lr:
|
| 160 |
+
desc: null
|
| 161 |
+
value: 1.0e-06
|
| 162 |
+
train_iters:
|
| 163 |
+
desc: null
|
| 164 |
+
value: 2000
|
| 165 |
+
train_samples:
|
| 166 |
+
desc: null
|
| 167 |
+
value: null
|
| 168 |
+
global_batch_size:
|
| 169 |
+
desc: null
|
| 170 |
+
value: 320
|
| 171 |
+
micro_batch_size:
|
| 172 |
+
desc: null
|
| 173 |
+
value: 8
|
| 174 |
+
make_vocab_size_divisible_by:
|
| 175 |
+
desc: null
|
| 176 |
+
value: 128
|
| 177 |
+
sliding_window_size:
|
| 178 |
+
desc: null
|
| 179 |
+
value: 4096
|
| 180 |
+
skip_batch:
|
| 181 |
+
desc: null
|
| 182 |
+
value: null
|
| 183 |
+
no_save_optimizer_state:
|
| 184 |
+
desc: null
|
| 185 |
+
value: false
|
| 186 |
+
continual_pretraining:
|
| 187 |
+
desc: null
|
| 188 |
+
value: false
|
| 189 |
+
instruction_tuning:
|
| 190 |
+
desc: null
|
| 191 |
+
value: false
|
| 192 |
+
direct_preference_optimization:
|
| 193 |
+
desc: null
|
| 194 |
+
value: false
|
| 195 |
+
attention_dropout:
|
| 196 |
+
desc: null
|
| 197 |
+
value: 0.1
|
| 198 |
+
hidden_dropout:
|
| 199 |
+
desc: null
|
| 200 |
+
value: 0.1
|
| 201 |
+
weight_decay:
|
| 202 |
+
desc: null
|
| 203 |
+
value: 0.1
|
| 204 |
+
adam_beta1:
|
| 205 |
+
desc: null
|
| 206 |
+
value: 0.9
|
| 207 |
+
adam_beta2:
|
| 208 |
+
desc: null
|
| 209 |
+
value: 0.95
|
| 210 |
+
adam_eps:
|
| 211 |
+
desc: null
|
| 212 |
+
value: 1.0e-06
|
| 213 |
+
hf_transformer_model_dir:
|
| 214 |
+
desc: null
|
| 215 |
+
value: null
|
| 216 |
+
instruction_train_data_path:
|
| 217 |
+
desc: null
|
| 218 |
+
value: null
|
| 219 |
+
instruction_valid_data_path:
|
| 220 |
+
desc: null
|
| 221 |
+
value: null
|
| 222 |
+
epoch:
|
| 223 |
+
desc: null
|
| 224 |
+
value: null
|
| 225 |
+
instruction_dataset_size:
|
| 226 |
+
desc: null
|
| 227 |
+
value: null
|
| 228 |
+
save_sampler_state:
|
| 229 |
+
desc: null
|
| 230 |
+
value: false
|
| 231 |
+
label_smoothing:
|
| 232 |
+
desc: null
|
| 233 |
+
value: 0.0
|
| 234 |
+
save_n_checkpoints:
|
| 235 |
+
desc: null
|
| 236 |
+
value: 10
|
| 237 |
+
hf_repo_id:
|
| 238 |
+
desc: null
|
| 239 |
+
value: koichi12/tiny-llama
|
| 240 |
+
create_public_hf_repo:
|
| 241 |
+
desc: null
|
| 242 |
+
value: false
|
| 243 |
+
upload_all_checkpoints_to_hf:
|
| 244 |
+
desc: null
|
| 245 |
+
value: false
|
| 246 |
+
hf_upload_retry_limit:
|
| 247 |
+
desc: null
|
| 248 |
+
value: 2
|
| 249 |
+
exit_duration_in_mins:
|
| 250 |
+
desc: null
|
| 251 |
+
value: null
|
| 252 |
+
source_key:
|
| 253 |
+
desc: null
|
| 254 |
+
value: null
|
| 255 |
+
target_key:
|
| 256 |
+
desc: null
|
| 257 |
+
value: null
|
| 258 |
+
attn_implementation:
|
| 259 |
+
desc: null
|
| 260 |
+
value: flash_attention_2
|
| 261 |
+
efficient_instruction_tuning:
|
| 262 |
+
desc: null
|
| 263 |
+
value: false
|
| 264 |
+
remove_padding_masking:
|
| 265 |
+
desc: null
|
| 266 |
+
value: false
|
| 267 |
+
save_start_iter:
|
| 268 |
+
desc: null
|
| 269 |
+
value: null
|
| 270 |
+
rank:
|
| 271 |
+
desc: null
|
| 272 |
+
value: 0
|
| 273 |
+
world_size:
|
| 274 |
+
desc: null
|
| 275 |
+
value: 1
|
| 276 |
+
padded_vocab_size:
|
| 277 |
+
desc: null
|
| 278 |
+
value: 32000
|
| 279 |
+
gradient_accumulation_steps:
|
| 280 |
+
desc: null
|
| 281 |
+
value: 40
|
| 282 |
+
_wandb:
|
| 283 |
+
desc: null
|
| 284 |
+
value:
|
| 285 |
+
python_version: 3.10.12
|
| 286 |
+
cli_version: 0.16.3
|
| 287 |
+
framework: huggingface
|
| 288 |
+
huggingface_version: 4.43.3
|
| 289 |
+
is_jupyter_run: false
|
| 290 |
+
is_kaggle_kernel: false
|
| 291 |
+
start_time: 1722749767.220741
|
| 292 |
+
t:
|
| 293 |
+
1:
|
| 294 |
+
- 1
|
| 295 |
+
- 11
|
| 296 |
+
- 49
|
| 297 |
+
- 55
|
| 298 |
+
- 71
|
| 299 |
+
2:
|
| 300 |
+
- 1
|
| 301 |
+
- 11
|
| 302 |
+
- 49
|
| 303 |
+
- 55
|
| 304 |
+
- 71
|
| 305 |
+
3:
|
| 306 |
+
- 13
|
| 307 |
+
- 16
|
| 308 |
+
- 23
|
| 309 |
+
4: 3.10.12
|
| 310 |
+
5: 0.16.3
|
| 311 |
+
6: 4.43.3
|
| 312 |
+
8:
|
| 313 |
+
- 5
|
| 314 |
+
13: linux-x86_64
|
| 315 |
+
activation_function:
|
| 316 |
+
desc: null
|
| 317 |
+
value: silu
|
| 318 |
+
hidden_size:
|
| 319 |
+
desc: null
|
| 320 |
+
value: 2048
|
| 321 |
+
model_type:
|
| 322 |
+
desc: null
|
| 323 |
+
value: llama
|
| 324 |
+
max_position_embeddings:
|
| 325 |
+
desc: null
|
| 326 |
+
value: 2048
|
| 327 |
+
num_attention_heads:
|
| 328 |
+
desc: null
|
| 329 |
+
value: 32
|
| 330 |
+
num_hidden_layers:
|
| 331 |
+
desc: null
|
| 332 |
+
value: 22
|
| 333 |
+
model_architecture:
|
| 334 |
+
desc: null
|
| 335 |
+
value: LlamaForCausalLM
|
wandb/run-20240804_143607-h7fxlkpt/files/output.log
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/tiny-llama.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 8 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 9 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 10 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
|
| 11 |
+
--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
|
| 12 |
+
--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
|
| 13 |
+
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
|
| 14 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
| 15 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
| 16 |
+
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
|
| 17 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
| 18 |
+
warnings.warn(
|
| 19 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
| 20 |
+
--> applying fsdp activation checkpointing...
|
| 21 |
+
> datasets target sizes (minimum size):
|
| 22 |
+
train: 640000
|
| 23 |
+
validation: 35200
|
| 24 |
+
test: 3200
|
| 25 |
+
> building train, validation, and test datasets for GPT ...
|
| 26 |
+
> finished creating GPT datasets ...
|
| 27 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 28 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 29 |
+
No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
|
| 30 |
+
File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 31 |
+
Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
|
| 32 |
+
model info: FullyShardedDataParallel(
|
| 33 |
+
(_fsdp_wrapped_module): LlamaForCausalLM(
|
| 34 |
+
(model): LlamaModel(
|
| 35 |
+
(embed_tokens): Embedding(32000, 2048)
|
| 36 |
+
(layers): ModuleList(
|
| 37 |
+
(0-21): 22 x FullyShardedDataParallel(
|
| 38 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
| 39 |
+
(_checkpoint_wrapped_module): LlamaDecoderLayer(
|
| 40 |
+
(self_attn): LlamaFlashAttention2(
|
| 41 |
+
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
| 42 |
+
(k_proj): Linear(in_features=2048, out_features=256, bias=False)
|
| 43 |
+
(v_proj): Linear(in_features=2048, out_features=256, bias=False)
|
| 44 |
+
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
| 45 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
| 46 |
+
)
|
| 47 |
+
(mlp): LlamaMLP(
|
| 48 |
+
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
| 49 |
+
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
|
| 50 |
+
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
|
| 51 |
+
(act_fn): SiLU()
|
| 52 |
+
)
|
| 53 |
+
(input_layernorm): LlamaRMSNorm()
|
| 54 |
+
(post_attention_layernorm): LlamaRMSNorm()
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
)
|
| 58 |
+
)
|
| 59 |
+
(norm): LlamaRMSNorm()
|
| 60 |
+
(rotary_emb): LlamaRotaryEmbedding()
|
| 61 |
+
)
|
| 62 |
+
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
|
| 63 |
+
)
|
| 64 |
+
)
|
| 65 |
+
model config: LlamaConfig {
|
| 66 |
+
"_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
| 67 |
+
"architectures": [
|
| 68 |
+
"LlamaForCausalLM"
|
| 69 |
+
],
|
| 70 |
+
"attention_bias": false,
|
| 71 |
+
"attention_dropout": 0.0,
|
| 72 |
+
"bos_token_id": 1,
|
| 73 |
+
"eos_token_id": 2,
|
| 74 |
+
"hidden_act": "silu",
|
| 75 |
+
"hidden_size": 2048,
|
| 76 |
+
"initializer_range": 0.02,
|
| 77 |
+
"intermediate_size": 5632,
|
| 78 |
+
"label_smoothing": 0.0,
|
| 79 |
+
"max_position_embeddings": 2048,
|
| 80 |
+
"mlp_bias": false,
|
| 81 |
+
"model_type": "llama",
|
| 82 |
+
"num_attention_heads": 32,
|
| 83 |
+
"num_hidden_layers": 22,
|
| 84 |
+
"num_key_value_heads": 4,
|
| 85 |
+
"pretraining_tp": 1,
|
| 86 |
+
"rms_norm_eps": 1e-05,
|
| 87 |
+
"rope_scaling": null,
|
| 88 |
+
"rope_theta": 10000.0,
|
| 89 |
+
"tie_word_embeddings": false,
|
| 90 |
+
"torch_dtype": "float32",
|
| 91 |
+
"transformers_version": "4.43.3",
|
| 92 |
+
"use_cache": false,
|
| 93 |
+
"vocab_size": 32000
|
| 94 |
+
}
|
| 95 |
+
Let split = None
|
| 96 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 97 |
+
Unable to save the indexes because path_to_cache is None
|
| 98 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 99 |
+
Unable to save the indexes because path_to_cache is None
|
| 100 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 101 |
+
Unable to save the indexes because path_to_cache is None
|
| 102 |
+
Traceback (most recent call last):
|
| 103 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 104 |
+
main()
|
| 105 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
| 106 |
+
train(
|
| 107 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
|
| 108 |
+
batch = next(train_dataloader)
|
| 109 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
|
| 110 |
+
for x in iter:
|
| 111 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
|
| 112 |
+
data = self._next_data()
|
| 113 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
|
| 114 |
+
return self._process_data(data)
|
| 115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
|
| 116 |
+
data.reraise()
|
| 117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
|
| 118 |
+
raise exception
|
| 119 |
+
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
|
| 120 |
+
Original Traceback (most recent call last):
|
| 121 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
|
| 122 |
+
data = fetcher.fetch(index)
|
| 123 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
|
| 124 |
+
return self.collate_fn(data)
|
| 125 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
|
| 126 |
+
return collate(batch, collate_fn_map=default_collate_fn_map)
|
| 127 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
|
| 128 |
+
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
| 129 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
|
| 130 |
+
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
| 131 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
|
| 132 |
+
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
|
| 133 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
|
| 134 |
+
return torch.stack(batch, 0, out=out)
|
| 135 |
+
RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
|
wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
accelerate==0.33.0
|
| 3 |
+
aiohttp==3.9.1
|
| 4 |
+
aiosignal==1.3.1
|
| 5 |
+
annotated-types==0.6.0
|
| 6 |
+
apex==0.1
|
| 7 |
+
appdirs==1.4.4
|
| 8 |
+
argon2-cffi-bindings==21.2.0
|
| 9 |
+
argon2-cffi==23.1.0
|
| 10 |
+
asttokens==2.4.1
|
| 11 |
+
astunparse==1.6.3
|
| 12 |
+
async-timeout==4.0.3
|
| 13 |
+
attrs==23.2.0
|
| 14 |
+
audioread==3.0.1
|
| 15 |
+
beautifulsoup4==4.12.3
|
| 16 |
+
bleach==6.1.0
|
| 17 |
+
blis==0.7.11
|
| 18 |
+
cachetools==5.3.2
|
| 19 |
+
catalogue==2.0.10
|
| 20 |
+
certifi==2024.2.2
|
| 21 |
+
cffi==1.16.0
|
| 22 |
+
charset-normalizer==3.3.2
|
| 23 |
+
click==8.1.7
|
| 24 |
+
cloudpathlib==0.16.0
|
| 25 |
+
cloudpickle==3.0.0
|
| 26 |
+
cmake==3.28.1
|
| 27 |
+
colorama==0.4.6
|
| 28 |
+
comm==0.2.1
|
| 29 |
+
confection==0.1.4
|
| 30 |
+
contourpy==1.2.0
|
| 31 |
+
cubinlinker==0.3.0+2.g405ac64
|
| 32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
| 33 |
+
cudf==23.12.0
|
| 34 |
+
cugraph-dgl==23.12.0
|
| 35 |
+
cugraph-service-client==23.12.0
|
| 36 |
+
cugraph-service-server==23.12.0
|
| 37 |
+
cugraph==23.12.0
|
| 38 |
+
cuml==23.12.0
|
| 39 |
+
cupy-cuda12x==12.3.0
|
| 40 |
+
cycler==0.12.1
|
| 41 |
+
cymem==2.0.8
|
| 42 |
+
cython==3.0.8
|
| 43 |
+
dask-cuda==23.12.0
|
| 44 |
+
dask-cudf==23.12.0
|
| 45 |
+
dask==2023.11.0
|
| 46 |
+
debugpy==1.8.1
|
| 47 |
+
decorator==5.1.1
|
| 48 |
+
defusedxml==0.7.1
|
| 49 |
+
distributed==2023.11.0
|
| 50 |
+
dm-tree==0.1.8
|
| 51 |
+
docker-pycreds==0.4.0
|
| 52 |
+
einops==0.7.0
|
| 53 |
+
exceptiongroup==1.2.0
|
| 54 |
+
execnet==2.0.2
|
| 55 |
+
executing==2.0.1
|
| 56 |
+
expecttest==0.1.3
|
| 57 |
+
fastjsonschema==2.19.1
|
| 58 |
+
fastrlock==0.8.2
|
| 59 |
+
filelock==3.13.1
|
| 60 |
+
flash-attn==2.4.2
|
| 61 |
+
fonttools==4.48.1
|
| 62 |
+
frozenlist==1.4.1
|
| 63 |
+
fsspec==2023.12.2
|
| 64 |
+
gast==0.5.4
|
| 65 |
+
gitdb==4.0.11
|
| 66 |
+
gitpython==3.1.43
|
| 67 |
+
google-auth-oauthlib==0.4.6
|
| 68 |
+
google-auth==2.27.0
|
| 69 |
+
graphsurgeon==0.4.6
|
| 70 |
+
grpcio==1.60.1
|
| 71 |
+
huggingface-hub==0.24.5
|
| 72 |
+
hypothesis==5.35.1
|
| 73 |
+
idna==3.6
|
| 74 |
+
importlib-metadata==7.0.1
|
| 75 |
+
iniconfig==2.0.0
|
| 76 |
+
intel-openmp==2021.4.0
|
| 77 |
+
ipadic==1.0.0
|
| 78 |
+
ipykernel==6.29.2
|
| 79 |
+
ipython-genutils==0.2.0
|
| 80 |
+
ipython==8.21.0
|
| 81 |
+
jedi==0.19.1
|
| 82 |
+
jinja2==3.1.3
|
| 83 |
+
joblib==1.3.2
|
| 84 |
+
json5==0.9.14
|
| 85 |
+
jsonnet==0.19.1
|
| 86 |
+
jsonschema-specifications==2023.12.1
|
| 87 |
+
jsonschema==4.21.1
|
| 88 |
+
jupyter-client==8.6.0
|
| 89 |
+
jupyter-core==5.7.1
|
| 90 |
+
jupyter-tensorboard==0.2.0
|
| 91 |
+
jupyterlab-pygments==0.3.0
|
| 92 |
+
jupyterlab-server==1.2.0
|
| 93 |
+
jupyterlab==2.3.2
|
| 94 |
+
jupytext==1.16.1
|
| 95 |
+
kiwisolver==1.4.5
|
| 96 |
+
langcodes==3.3.0
|
| 97 |
+
lazy-loader==0.3
|
| 98 |
+
librosa==0.10.1
|
| 99 |
+
llvmlite==0.40.1
|
| 100 |
+
locket==1.0.0
|
| 101 |
+
logzero==1.7.0
|
| 102 |
+
lxml==5.2.2
|
| 103 |
+
markdown-it-py==3.0.0
|
| 104 |
+
markdown==3.5.2
|
| 105 |
+
markupsafe==2.1.4
|
| 106 |
+
matplotlib-inline==0.1.6
|
| 107 |
+
matplotlib==3.8.2
|
| 108 |
+
mdit-py-plugins==0.4.0
|
| 109 |
+
mdurl==0.1.2
|
| 110 |
+
mecab-python3==1.0.6
|
| 111 |
+
mistune==3.0.2
|
| 112 |
+
mkl-devel==2021.1.1
|
| 113 |
+
mkl-include==2021.1.1
|
| 114 |
+
mkl==2021.1.1
|
| 115 |
+
mock==5.1.0
|
| 116 |
+
more-itertools==9.1.0
|
| 117 |
+
mpmath==1.3.0
|
| 118 |
+
msgpack==1.0.7
|
| 119 |
+
multidict==6.0.4
|
| 120 |
+
murmurhash==1.0.10
|
| 121 |
+
nbclient==0.9.0
|
| 122 |
+
nbconvert==7.16.0
|
| 123 |
+
nbformat==5.9.2
|
| 124 |
+
nest-asyncio==1.6.0
|
| 125 |
+
networkx==2.6.3
|
| 126 |
+
ninja==1.11.1.1
|
| 127 |
+
nltk==3.8.1
|
| 128 |
+
notebook==6.4.10
|
| 129 |
+
numba==0.57.1+1.g1ff679645
|
| 130 |
+
numpy==1.24.4
|
| 131 |
+
nvfuser==0.1.4a0+d0bb811
|
| 132 |
+
nvidia-dali-cuda120==1.34.0
|
| 133 |
+
nvidia-pyindex==1.0.9
|
| 134 |
+
nvtx==0.2.5
|
| 135 |
+
oauthlib==3.2.2
|
| 136 |
+
onnx==1.15.0rc2
|
| 137 |
+
opencv==4.7.0
|
| 138 |
+
optree==0.10.0
|
| 139 |
+
packaging==23.2
|
| 140 |
+
pandas==1.5.3
|
| 141 |
+
pandocfilters==1.5.1
|
| 142 |
+
parso==0.8.3
|
| 143 |
+
partd==1.4.1
|
| 144 |
+
peft==0.11.1
|
| 145 |
+
pexpect==4.9.0
|
| 146 |
+
pillow==10.2.0
|
| 147 |
+
pip==24.0
|
| 148 |
+
platformdirs==4.2.0
|
| 149 |
+
pluggy==1.4.0
|
| 150 |
+
ply==3.11
|
| 151 |
+
polygraphy==0.49.4
|
| 152 |
+
pooch==1.8.0
|
| 153 |
+
portalocker==2.10.1
|
| 154 |
+
preshed==3.0.9
|
| 155 |
+
prettytable==3.9.0
|
| 156 |
+
prometheus-client==0.19.0
|
| 157 |
+
prompt-toolkit==3.0.43
|
| 158 |
+
protobuf==4.24.4
|
| 159 |
+
psutil==5.9.4
|
| 160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
| 161 |
+
ptyprocess==0.7.0
|
| 162 |
+
pure-eval==0.2.2
|
| 163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
| 164 |
+
pyasn1-modules==0.3.0
|
| 165 |
+
pyasn1==0.5.1
|
| 166 |
+
pybind11-global==2.11.1
|
| 167 |
+
pybind11==2.11.1
|
| 168 |
+
pycocotools==2.0+nv0.8.0
|
| 169 |
+
pycparser==2.21
|
| 170 |
+
pydantic-core==2.16.2
|
| 171 |
+
pydantic==2.6.1
|
| 172 |
+
pygments==2.17.2
|
| 173 |
+
pylibcugraph==23.12.0
|
| 174 |
+
pylibcugraphops==23.12.0
|
| 175 |
+
pylibraft==23.12.0
|
| 176 |
+
pynvml==11.4.1
|
| 177 |
+
pyparsing==3.1.1
|
| 178 |
+
pytest-flakefinder==1.1.0
|
| 179 |
+
pytest-rerunfailures==13.0
|
| 180 |
+
pytest-shard==0.1.2
|
| 181 |
+
pytest-xdist==3.5.0
|
| 182 |
+
pytest==8.0.0
|
| 183 |
+
python-dateutil==2.8.2
|
| 184 |
+
python-dotenv==1.0.0
|
| 185 |
+
python-hostlist==1.23.0
|
| 186 |
+
pytorch-quantization==2.1.2
|
| 187 |
+
pytz==2023.3.post1
|
| 188 |
+
pyyaml==6.0.1
|
| 189 |
+
pyzmq==25.1.2
|
| 190 |
+
raft-dask==23.12.0
|
| 191 |
+
rapids-dask-dependency==23.12.1
|
| 192 |
+
referencing==0.33.0
|
| 193 |
+
regex==2023.12.25
|
| 194 |
+
requests-oauthlib==1.3.1
|
| 195 |
+
requests==2.31.0
|
| 196 |
+
rich==13.7.0
|
| 197 |
+
rmm==23.12.0
|
| 198 |
+
rpds-py==0.17.1
|
| 199 |
+
rsa==4.9
|
| 200 |
+
sacrebleu==2.4.0
|
| 201 |
+
safetensors==0.4.3
|
| 202 |
+
scikit-learn==1.2.0
|
| 203 |
+
scipy==1.12.0
|
| 204 |
+
send2trash==1.8.2
|
| 205 |
+
sentencepiece==0.1.99
|
| 206 |
+
sentry-sdk==2.12.0
|
| 207 |
+
setproctitle==1.3.3
|
| 208 |
+
setuptools==68.2.2
|
| 209 |
+
six==1.16.0
|
| 210 |
+
smart-open==6.4.0
|
| 211 |
+
smmap==5.0.1
|
| 212 |
+
sortedcontainers==2.4.0
|
| 213 |
+
soundfile==0.12.1
|
| 214 |
+
soupsieve==2.5
|
| 215 |
+
soxr==0.3.7
|
| 216 |
+
spacy-legacy==3.0.12
|
| 217 |
+
spacy-loggers==1.0.5
|
| 218 |
+
spacy==3.7.2
|
| 219 |
+
sphinx-glpi-theme==0.6
|
| 220 |
+
srsly==2.4.8
|
| 221 |
+
stack-data==0.6.3
|
| 222 |
+
sympy==1.12
|
| 223 |
+
tabulate==0.9.0
|
| 224 |
+
tbb==2021.11.0
|
| 225 |
+
tblib==3.0.0
|
| 226 |
+
tensorboard-data-server==0.6.1
|
| 227 |
+
tensorboard-plugin-wit==1.8.1
|
| 228 |
+
tensorboard==2.9.0
|
| 229 |
+
tensorrt==8.6.3
|
| 230 |
+
terminado==0.18.0
|
| 231 |
+
termplotlib==0.3.9
|
| 232 |
+
thinc==8.2.3
|
| 233 |
+
threadpoolctl==3.2.0
|
| 234 |
+
thriftpy2==0.4.17
|
| 235 |
+
tinycss2==1.2.1
|
| 236 |
+
tokenizers==0.19.1
|
| 237 |
+
toml==0.10.2
|
| 238 |
+
tomli==2.0.1
|
| 239 |
+
toolz==0.12.1
|
| 240 |
+
torch-tensorrt==2.3.0a0
|
| 241 |
+
torch==2.3.0a0+ebedce2
|
| 242 |
+
torchdata==0.7.1a0
|
| 243 |
+
torchtext==0.17.0a0
|
| 244 |
+
torchvision==0.18.0a0
|
| 245 |
+
tornado==6.4
|
| 246 |
+
tqdm==4.66.1
|
| 247 |
+
traitlets==5.9.0
|
| 248 |
+
transformer-engine==1.3.0+5b90b7f
|
| 249 |
+
transformers==4.43.3
|
| 250 |
+
treelite-runtime==3.9.1
|
| 251 |
+
treelite==3.9.1
|
| 252 |
+
triton==2.2.0+e28a256
|
| 253 |
+
typer==0.9.0
|
| 254 |
+
types-dataclasses==0.6.6
|
| 255 |
+
typing-extensions==4.9.0
|
| 256 |
+
ucx-py==0.35.0
|
| 257 |
+
uff==0.6.9
|
| 258 |
+
ujson==5.8.0
|
| 259 |
+
urllib3==1.26.18
|
| 260 |
+
wandb==0.16.3
|
| 261 |
+
wasabi==1.1.2
|
| 262 |
+
wcwidth==0.2.13
|
| 263 |
+
weasel==0.3.4
|
| 264 |
+
webencodings==0.5.1
|
| 265 |
+
werkzeug==3.0.1
|
| 266 |
+
wheel==0.42.0
|
| 267 |
+
xdoctest==1.0.2
|
| 268 |
+
xgboost==1.7.6
|
| 269 |
+
yarl==1.9.4
|
| 270 |
+
zict==3.0.0
|
| 271 |
+
zipp==3.17.0
|
wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "3.10.12",
|
| 4 |
+
"heartbeatAt": "2024-08-04T05:36:07.811618",
|
| 5 |
+
"startedAt": "2024-08-04T05:36:07.207201",
|
| 6 |
+
"docker": null,
|
| 7 |
+
"cuda": null,
|
| 8 |
+
"args": [
|
| 9 |
+
"--seq-length",
|
| 10 |
+
"512",
|
| 11 |
+
"--sliding-window-size",
|
| 12 |
+
"4096",
|
| 13 |
+
"--micro-batch-size",
|
| 14 |
+
"8",
|
| 15 |
+
"--global-batch-size",
|
| 16 |
+
"320",
|
| 17 |
+
"--train-iters",
|
| 18 |
+
"2000",
|
| 19 |
+
"--tokenizer-type",
|
| 20 |
+
"Llama2Tokenizer",
|
| 21 |
+
"--tokenizer-model",
|
| 22 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
|
| 23 |
+
"--train-data-path",
|
| 24 |
+
"4013541",
|
| 25 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 26 |
+
"--valid-data-path",
|
| 27 |
+
"4013541",
|
| 28 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 29 |
+
"--test-data-path",
|
| 30 |
+
"4013541",
|
| 31 |
+
"/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 32 |
+
"--lr",
|
| 33 |
+
"2e-5",
|
| 34 |
+
"--min-lr",
|
| 35 |
+
"1e-6",
|
| 36 |
+
"--lr-decay-style",
|
| 37 |
+
"cosine",
|
| 38 |
+
"--lr-warmup-iters",
|
| 39 |
+
"500",
|
| 40 |
+
"--lr-decay-iters",
|
| 41 |
+
"2000",
|
| 42 |
+
"--weight-decay",
|
| 43 |
+
"0.1",
|
| 44 |
+
"--grad-clip-norm",
|
| 45 |
+
"1.0",
|
| 46 |
+
"--optimizer",
|
| 47 |
+
"adam",
|
| 48 |
+
"--adam-beta1",
|
| 49 |
+
"0.9",
|
| 50 |
+
"--adam-beta2",
|
| 51 |
+
"0.95",
|
| 52 |
+
"--adam-eps",
|
| 53 |
+
"1e-6",
|
| 54 |
+
"--save-interval",
|
| 55 |
+
"200",
|
| 56 |
+
"--eval-interval",
|
| 57 |
+
"200",
|
| 58 |
+
"--eval-iters",
|
| 59 |
+
"10",
|
| 60 |
+
"--bf16",
|
| 61 |
+
"--mixed-precision",
|
| 62 |
+
"--base-model",
|
| 63 |
+
"/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
|
| 64 |
+
"--save",
|
| 65 |
+
"/work/llm_recipes/models/tiny-llama",
|
| 66 |
+
"--load",
|
| 67 |
+
"/work/llm_recipes/models/tiny-llama",
|
| 68 |
+
"--fsdp-activation-checkpointing",
|
| 69 |
+
"--sharding-strategy",
|
| 70 |
+
"FULL_SHARD",
|
| 71 |
+
"--checkpoint-type",
|
| 72 |
+
"LOCAL_STATE_DICT",
|
| 73 |
+
"--save-n-checkpoints",
|
| 74 |
+
"10",
|
| 75 |
+
"--hf-upload-retry-limit",
|
| 76 |
+
"2",
|
| 77 |
+
"--hf-repo-id",
|
| 78 |
+
"koichi12/tiny-llama",
|
| 79 |
+
"--wandb-entity",
|
| 80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
| 81 |
+
"--wandb-project",
|
| 82 |
+
"llm_tutorial",
|
| 83 |
+
"--wandb-name",
|
| 84 |
+
"tiny-llama_train_2024-08-04-14:35:56"
|
| 85 |
+
],
|
| 86 |
+
"state": "running",
|
| 87 |
+
"program": "/project/examples/finetuning.py",
|
| 88 |
+
"codePathLocal": "examples/finetuning.py",
|
| 89 |
+
"codePath": "examples/finetuning.py",
|
| 90 |
+
"git": {
|
| 91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
| 92 |
+
"commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
|
| 93 |
+
},
|
| 94 |
+
"email": null,
|
| 95 |
+
"root": "/project",
|
| 96 |
+
"host": "gpu-koiwa-00",
|
| 97 |
+
"username": "koiwa",
|
| 98 |
+
"executable": "/usr/bin/python",
|
| 99 |
+
"cpu_count": 18,
|
| 100 |
+
"cpu_count_logical": 18,
|
| 101 |
+
"cpu_freq": {
|
| 102 |
+
"current": 2400.0389999999993,
|
| 103 |
+
"min": 0.0,
|
| 104 |
+
"max": 0.0
|
| 105 |
+
},
|
| 106 |
+
"cpu_freq_per_core": [
|
| 107 |
+
{
|
| 108 |
+
"current": 2400.039,
|
| 109 |
+
"min": 0.0,
|
| 110 |
+
"max": 0.0
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"current": 2400.039,
|
| 114 |
+
"min": 0.0,
|
| 115 |
+
"max": 0.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"current": 2400.039,
|
| 119 |
+
"min": 0.0,
|
| 120 |
+
"max": 0.0
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"current": 2400.039,
|
| 124 |
+
"min": 0.0,
|
| 125 |
+
"max": 0.0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"current": 2400.039,
|
| 129 |
+
"min": 0.0,
|
| 130 |
+
"max": 0.0
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"current": 2400.039,
|
| 134 |
+
"min": 0.0,
|
| 135 |
+
"max": 0.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"current": 2400.039,
|
| 139 |
+
"min": 0.0,
|
| 140 |
+
"max": 0.0
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"current": 2400.039,
|
| 144 |
+
"min": 0.0,
|
| 145 |
+
"max": 0.0
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"current": 2400.039,
|
| 149 |
+
"min": 0.0,
|
| 150 |
+
"max": 0.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"current": 2400.039,
|
| 154 |
+
"min": 0.0,
|
| 155 |
+
"max": 0.0
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"current": 2400.039,
|
| 159 |
+
"min": 0.0,
|
| 160 |
+
"max": 0.0
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"current": 2400.039,
|
| 164 |
+
"min": 0.0,
|
| 165 |
+
"max": 0.0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"current": 2400.039,
|
| 169 |
+
"min": 0.0,
|
| 170 |
+
"max": 0.0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"current": 2400.039,
|
| 174 |
+
"min": 0.0,
|
| 175 |
+
"max": 0.0
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"current": 2400.039,
|
| 179 |
+
"min": 0.0,
|
| 180 |
+
"max": 0.0
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"current": 2400.039,
|
| 184 |
+
"min": 0.0,
|
| 185 |
+
"max": 0.0
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"current": 2400.039,
|
| 189 |
+
"min": 0.0,
|
| 190 |
+
"max": 0.0
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"current": 2400.039,
|
| 194 |
+
"min": 0.0,
|
| 195 |
+
"max": 0.0
|
| 196 |
+
}
|
| 197 |
+
],
|
| 198 |
+
"disk": {
|
| 199 |
+
"/": {
|
| 200 |
+
"total": 0.0625,
|
| 201 |
+
"used": 1.1444091796875e-05
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
| 205 |
+
"gpu_count": 1,
|
| 206 |
+
"gpu_devices": [
|
| 207 |
+
{
|
| 208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
| 209 |
+
"memory_total": 42949672960
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"memory": {
|
| 213 |
+
"total": 56.48781967163086
|
| 214 |
+
}
|
| 215 |
+
}
|
wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb": {"runtime": 2}}
|
wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 14:36:07,222 INFO StreamThr :11584 [internal.py:wandb_internal():86] W&B internal server running at pid: 11584, started at: 2024-08-04 14:36:07.221438
|
| 2 |
+
2024-08-04 14:36:07,223 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: status
|
| 3 |
+
2024-08-04 14:36:07,225 INFO WriterThread:11584 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
|
| 4 |
+
2024-08-04 14:36:07,226 DEBUG SenderThread:11584 [sender.py:send():382] send: header
|
| 5 |
+
2024-08-04 14:36:07,240 DEBUG SenderThread:11584 [sender.py:send():382] send: run
|
| 6 |
+
2024-08-04 14:36:07,696 INFO SenderThread:11584 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_143607-h7fxlkpt/files
|
| 7 |
+
2024-08-04 14:36:07,696 INFO SenderThread:11584 [sender.py:_start_run_threads():1136] run started: h7fxlkpt with start time 1722749767.220741
|
| 8 |
+
2024-08-04 14:36:07,701 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: check_version
|
| 9 |
+
2024-08-04 14:36:07,701 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: check_version
|
| 10 |
+
2024-08-04 14:36:07,791 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: run_start
|
| 11 |
+
2024-08-04 14:36:07,798 DEBUG HandlerThread:11584 [system_info.py:__init__():27] System info init
|
| 12 |
+
2024-08-04 14:36:07,798 DEBUG HandlerThread:11584 [system_info.py:__init__():42] System info init done
|
| 13 |
+
2024-08-04 14:36:07,798 INFO HandlerThread:11584 [system_monitor.py:start():194] Starting system monitor
|
| 14 |
+
2024-08-04 14:36:07,798 INFO SystemMonitor:11584 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
| 15 |
+
2024-08-04 14:36:07,799 INFO HandlerThread:11584 [system_monitor.py:probe():214] Collecting system info
|
| 16 |
+
2024-08-04 14:36:07,799 INFO SystemMonitor:11584 [interfaces.py:start():190] Started cpu monitoring
|
| 17 |
+
2024-08-04 14:36:07,799 INFO SystemMonitor:11584 [interfaces.py:start():190] Started disk monitoring
|
| 18 |
+
2024-08-04 14:36:07,800 INFO SystemMonitor:11584 [interfaces.py:start():190] Started gpu monitoring
|
| 19 |
+
2024-08-04 14:36:07,801 INFO SystemMonitor:11584 [interfaces.py:start():190] Started memory monitoring
|
| 20 |
+
2024-08-04 14:36:07,802 INFO SystemMonitor:11584 [interfaces.py:start():190] Started network monitoring
|
| 21 |
+
2024-08-04 14:36:07,811 DEBUG HandlerThread:11584 [system_info.py:probe():151] Probing system
|
| 22 |
+
2024-08-04 14:36:07,813 DEBUG HandlerThread:11584 [system_info.py:_probe_git():136] Probing git
|
| 23 |
+
2024-08-04 14:36:07,825 DEBUG HandlerThread:11584 [system_info.py:_probe_git():144] Probing git done
|
| 24 |
+
2024-08-04 14:36:07,825 DEBUG HandlerThread:11584 [system_info.py:probe():199] Probing system done
|
| 25 |
+
2024-08-04 14:36:07,825 DEBUG HandlerThread:11584 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:36:07.811618', 'startedAt': '2024-08-04T05:36:07.207201', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:35:56'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
|
| 26 |
+
2024-08-04 14:36:07,825 INFO HandlerThread:11584 [system_monitor.py:probe():224] Finished collecting system info
|
| 27 |
+
2024-08-04 14:36:07,825 INFO HandlerThread:11584 [system_monitor.py:probe():227] Publishing system info
|
| 28 |
+
2024-08-04 14:36:07,827 INFO HandlerThread:11584 [system_monitor.py:probe():229] Finished publishing system info
|
| 29 |
+
2024-08-04 14:36:07,833 DEBUG SenderThread:11584 [sender.py:send():382] send: files
|
| 30 |
+
2024-08-04 14:36:07,833 INFO SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
| 31 |
+
2024-08-04 14:36:07,842 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: python_packages
|
| 32 |
+
2024-08-04 14:36:07,842 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: stop_status
|
| 33 |
+
2024-08-04 14:36:07,842 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: internal_messages
|
| 34 |
+
2024-08-04 14:36:07,843 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: python_packages
|
| 35 |
+
2024-08-04 14:36:07,845 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: stop_status
|
| 36 |
+
2024-08-04 14:36:08,168 DEBUG SenderThread:11584 [sender.py:send():382] send: telemetry
|
| 37 |
+
2024-08-04 14:36:08,499 INFO wandb-upload_0:11584 [upload_job.py:push():131] Uploaded file /tmp/tmp7k_0gn43wandb/ux980mno-wandb-metadata.json
|
| 38 |
+
2024-08-04 14:36:08,698 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
|
| 39 |
+
2024-08-04 14:36:08,698 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
|
| 40 |
+
2024-08-04 14:36:08,698 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json
|
| 41 |
+
2024-08-04 14:36:10,261 DEBUG SenderThread:11584 [sender.py:send():382] send: config
|
| 42 |
+
2024-08-04 14:36:10,262 DEBUG SenderThread:11584 [sender.py:send():382] send: config
|
| 43 |
+
2024-08-04 14:36:10,349 DEBUG SenderThread:11584 [sender.py:send():382] send: exit
|
| 44 |
+
2024-08-04 14:36:10,349 INFO SenderThread:11584 [sender.py:send_exit():589] handling exit code: 1
|
| 45 |
+
2024-08-04 14:36:10,349 INFO SenderThread:11584 [sender.py:send_exit():591] handling runtime: 2
|
| 46 |
+
2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 47 |
+
2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:send_exit():597] send defer
|
| 48 |
+
2024-08-04 14:36:10,351 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 49 |
+
2024-08-04 14:36:10,351 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 0
|
| 50 |
+
2024-08-04 14:36:10,351 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 51 |
+
2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 0
|
| 52 |
+
2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 1
|
| 53 |
+
2024-08-04 14:36:10,352 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 54 |
+
2024-08-04 14:36:10,352 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 1
|
| 55 |
+
2024-08-04 14:36:10,352 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 56 |
+
2024-08-04 14:36:10,352 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 1
|
| 57 |
+
2024-08-04 14:36:10,352 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 2
|
| 58 |
+
2024-08-04 14:36:10,352 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 59 |
+
2024-08-04 14:36:10,352 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 2
|
| 60 |
+
2024-08-04 14:36:10,352 INFO HandlerThread:11584 [system_monitor.py:finish():203] Stopping system monitor
|
| 61 |
+
2024-08-04 14:36:10,352 DEBUG SystemMonitor:11584 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
| 62 |
+
2024-08-04 14:36:10,352 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined cpu monitor
|
| 63 |
+
2024-08-04 14:36:10,352 DEBUG SystemMonitor:11584 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
| 64 |
+
2024-08-04 14:36:10,353 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined disk monitor
|
| 65 |
+
2024-08-04 14:36:10,353 DEBUG SystemMonitor:11584 [system_monitor.py:_start():183] Publishing last batch of metrics
|
| 66 |
+
2024-08-04 14:36:10,385 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined gpu monitor
|
| 67 |
+
2024-08-04 14:36:10,385 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined memory monitor
|
| 68 |
+
2024-08-04 14:36:10,386 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined network monitor
|
| 69 |
+
2024-08-04 14:36:10,386 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 70 |
+
2024-08-04 14:36:10,386 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 2
|
| 71 |
+
2024-08-04 14:36:10,386 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 3
|
| 72 |
+
2024-08-04 14:36:10,386 DEBUG SenderThread:11584 [sender.py:send():382] send: stats
|
| 73 |
+
2024-08-04 14:36:10,386 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 74 |
+
2024-08-04 14:36:10,386 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 3
|
| 75 |
+
2024-08-04 14:36:10,387 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 76 |
+
2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 3
|
| 77 |
+
2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 4
|
| 78 |
+
2024-08-04 14:36:10,387 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 79 |
+
2024-08-04 14:36:10,387 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 4
|
| 80 |
+
2024-08-04 14:36:10,387 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 81 |
+
2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 4
|
| 82 |
+
2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 5
|
| 83 |
+
2024-08-04 14:36:10,387 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 84 |
+
2024-08-04 14:36:10,387 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 5
|
| 85 |
+
2024-08-04 14:36:10,387 DEBUG SenderThread:11584 [sender.py:send():382] send: summary
|
| 86 |
+
2024-08-04 14:36:10,388 INFO SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 87 |
+
2024-08-04 14:36:10,388 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 88 |
+
2024-08-04 14:36:10,388 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 5
|
| 89 |
+
2024-08-04 14:36:10,388 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 6
|
| 90 |
+
2024-08-04 14:36:10,389 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 91 |
+
2024-08-04 14:36:10,389 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 6
|
| 92 |
+
2024-08-04 14:36:10,389 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 93 |
+
2024-08-04 14:36:10,389 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 6
|
| 94 |
+
2024-08-04 14:36:10,391 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: status_report
|
| 95 |
+
2024-08-04 14:36:10,576 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 7
|
| 96 |
+
2024-08-04 14:36:10,577 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 97 |
+
2024-08-04 14:36:10,577 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 7
|
| 98 |
+
2024-08-04 14:36:10,577 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 99 |
+
2024-08-04 14:36:10,577 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 7
|
| 100 |
+
2024-08-04 14:36:10,699 INFO Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
|
| 101 |
+
2024-08-04 14:36:10,699 INFO Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
|
| 102 |
+
2024-08-04 14:36:10,699 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
|
| 103 |
+
2024-08-04 14:36:11,349 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
|
| 104 |
+
2024-08-04 14:36:12,530 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 8
|
| 105 |
+
2024-08-04 14:36:12,530 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
|
| 106 |
+
2024-08-04 14:36:12,530 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 107 |
+
2024-08-04 14:36:12,531 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 8
|
| 108 |
+
2024-08-04 14:36:12,531 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 109 |
+
2024-08-04 14:36:12,531 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 8
|
| 110 |
+
2024-08-04 14:36:12,531 INFO SenderThread:11584 [job_builder.py:build():296] Attempting to build job artifact
|
| 111 |
+
2024-08-04 14:36:12,532 INFO SenderThread:11584 [job_builder.py:_get_source_type():426] is repo sourced job
|
| 112 |
+
2024-08-04 14:36:12,546 INFO SenderThread:11584 [job_builder.py:build():402] adding wandb-job metadata file
|
| 113 |
+
2024-08-04 14:36:12,554 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 9
|
| 114 |
+
2024-08-04 14:36:12,555 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 115 |
+
2024-08-04 14:36:12,555 DEBUG SenderThread:11584 [sender.py:send():382] send: artifact
|
| 116 |
+
2024-08-04 14:36:12,555 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 9
|
| 117 |
+
2024-08-04 14:36:12,700 INFO Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
|
| 118 |
+
2024-08-04 14:36:13,350 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
|
| 119 |
+
2024-08-04 14:36:13,435 INFO SenderThread:11584 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
| 120 |
+
2024-08-04 14:36:13,435 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 121 |
+
2024-08-04 14:36:13,435 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 9
|
| 122 |
+
2024-08-04 14:36:13,435 INFO SenderThread:11584 [dir_watcher.py:finish():358] shutting down directory watcher
|
| 123 |
+
2024-08-04 14:36:13,701 INFO SenderThread:11584 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_143607-h7fxlkpt/files
|
| 124 |
+
2024-08-04 14:36:13,701 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt requirements.txt
|
| 125 |
+
2024-08-04 14:36:13,702 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml config.yaml
|
| 126 |
+
2024-08-04 14:36:13,703 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json wandb-metadata.json
|
| 127 |
+
2024-08-04 14:36:13,703 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json wandb-summary.json
|
| 128 |
+
2024-08-04 14:36:13,705 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log output.log
|
| 129 |
+
2024-08-04 14:36:13,706 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 10
|
| 130 |
+
2024-08-04 14:36:13,707 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
|
| 131 |
+
2024-08-04 14:36:13,707 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 132 |
+
2024-08-04 14:36:13,707 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 10
|
| 133 |
+
2024-08-04 14:36:13,708 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 134 |
+
2024-08-04 14:36:13,708 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 10
|
| 135 |
+
2024-08-04 14:36:13,709 INFO SenderThread:11584 [file_pusher.py:finish():172] shutting down file pusher
|
| 136 |
+
2024-08-04 14:36:14,120 INFO wandb-upload_0:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
|
| 137 |
+
2024-08-04 14:36:14,203 INFO wandb-upload_1:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
|
| 138 |
+
2024-08-04 14:36:14,309 INFO wandb-upload_3:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
|
| 139 |
+
2024-08-04 14:36:14,324 INFO wandb-upload_2:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
|
| 140 |
+
2024-08-04 14:36:14,351 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
|
| 141 |
+
2024-08-04 14:36:14,351 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
|
| 142 |
+
2024-08-04 14:36:14,524 INFO Thread-11 (_thread_body):11584 [sender.py:transition_state():617] send defer: 11
|
| 143 |
+
2024-08-04 14:36:14,524 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 144 |
+
2024-08-04 14:36:14,524 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 11
|
| 145 |
+
2024-08-04 14:36:14,524 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 146 |
+
2024-08-04 14:36:14,524 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 11
|
| 147 |
+
2024-08-04 14:36:14,524 INFO SenderThread:11584 [file_pusher.py:join():178] waiting for file pusher
|
| 148 |
+
2024-08-04 14:36:14,525 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 12
|
| 149 |
+
2024-08-04 14:36:14,525 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 150 |
+
2024-08-04 14:36:14,525 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 12
|
| 151 |
+
2024-08-04 14:36:14,525 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 152 |
+
2024-08-04 14:36:14,525 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 12
|
| 153 |
+
2024-08-04 14:36:14,525 INFO SenderThread:11584 [file_stream.py:finish():595] file stream finish called
|
| 154 |
+
2024-08-04 14:36:14,732 INFO SenderThread:11584 [file_stream.py:finish():599] file stream finish is done
|
| 155 |
+
2024-08-04 14:36:14,732 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 13
|
| 156 |
+
2024-08-04 14:36:14,732 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 157 |
+
2024-08-04 14:36:14,732 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 13
|
| 158 |
+
2024-08-04 14:36:14,732 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 159 |
+
2024-08-04 14:36:14,732 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 13
|
| 160 |
+
2024-08-04 14:36:14,732 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 14
|
| 161 |
+
2024-08-04 14:36:14,732 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
|
| 162 |
+
2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send():382] send: final
|
| 163 |
+
2024-08-04 14:36:14,733 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 14
|
| 164 |
+
2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send():382] send: footer
|
| 165 |
+
2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
|
| 166 |
+
2024-08-04 14:36:14,733 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 14
|
| 167 |
+
2024-08-04 14:36:14,733 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
|
| 168 |
+
2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
|
| 169 |
+
2024-08-04 14:36:14,734 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
|
| 170 |
+
2024-08-04 14:36:14,734 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: server_info
|
| 171 |
+
2024-08-04 14:36:14,734 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
|
| 172 |
+
2024-08-04 14:36:14,734 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: server_info
|
| 173 |
+
2024-08-04 14:36:14,734 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: get_summary
|
| 174 |
+
2024-08-04 14:36:14,736 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: sampled_history
|
| 175 |
+
2024-08-04 14:36:14,736 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: internal_messages
|
| 176 |
+
2024-08-04 14:36:14,736 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: job_info
|
| 177 |
+
2024-08-04 14:36:14,893 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: job_info
|
| 178 |
+
2024-08-04 14:36:14,893 INFO MainThread:11584 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
| 179 |
+
2024-08-04 14:36:14,894 INFO MainThread:11584 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
| 180 |
+
2024-08-04 14:36:14,894 INFO MainThread:11584 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
| 181 |
+
2024-08-04 14:36:14,894 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: shutdown
|
| 182 |
+
2024-08-04 14:36:14,894 INFO HandlerThread:11584 [handler.py:finish():869] shutting down handler
|
| 183 |
+
2024-08-04 14:36:15,737 INFO WriterThread:11584 [datastore.py:close():296] close: /project/wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
|
| 184 |
+
2024-08-04 14:36:15,893 INFO SenderThread:11584 [sender.py:finish():1572] shutting down sender
|
| 185 |
+
2024-08-04 14:36:15,894 INFO SenderThread:11584 [file_pusher.py:finish():172] shutting down file pusher
|
| 186 |
+
2024-08-04 14:36:15,894 INFO SenderThread:11584 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_143607-h7fxlkpt/logs/debug.log
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 14:36:07,213 INFO MainThread:11513 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
| 2 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Configure stats pid to 11513
|
| 3 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
| 4 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
| 5 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
|
| 6 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
| 7 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
| 8 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_143607-h7fxlkpt/logs/debug.log
|
| 9 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log
|
| 10 |
+
2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_init.py:init():566] calling init triggers
|
| 11 |
+
2024-08-04 14:36:07,215 INFO MainThread:11513 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
| 12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:35:56', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
|
| 13 |
+
2024-08-04 14:36:07,215 INFO MainThread:11513 [wandb_init.py:init():616] starting backend
|
| 14 |
+
2024-08-04 14:36:07,215 INFO MainThread:11513 [wandb_init.py:init():620] setting up manager
|
| 15 |
+
2024-08-04 14:36:07,219 INFO MainThread:11513 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 16 |
+
2024-08-04 14:36:07,220 INFO MainThread:11513 [wandb_init.py:init():628] backend started and connected
|
| 17 |
+
2024-08-04 14:36:07,225 INFO MainThread:11513 [wandb_init.py:init():720] updated telemetry
|
| 18 |
+
2024-08-04 14:36:07,236 INFO MainThread:11513 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
| 19 |
+
2024-08-04 14:36:07,701 INFO MainThread:11513 [wandb_run.py:_on_init():2262] communicating current version
|
| 20 |
+
2024-08-04 14:36:07,784 INFO MainThread:11513 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
| 21 |
+
|
| 22 |
+
2024-08-04 14:36:07,784 INFO MainThread:11513 [wandb_init.py:init():804] starting run threads in backend
|
| 23 |
+
2024-08-04 14:36:07,841 INFO MainThread:11513 [wandb_run.py:_console_start():2241] atexit reg
|
| 24 |
+
2024-08-04 14:36:07,842 INFO MainThread:11513 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
| 25 |
+
2024-08-04 14:36:07,842 INFO MainThread:11513 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
| 26 |
+
2024-08-04 14:36:07,842 INFO MainThread:11513 [wandb_run.py:_redirect():2186] Redirects installed.
|
| 27 |
+
2024-08-04 14:36:07,843 INFO MainThread:11513 [wandb_init.py:init():847] run started, returning control to user process
|
| 28 |
+
2024-08-04 14:36:10,261 INFO MainThread:11513 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
|
| 29 |
+
2024-08-04 14:36:10,261 INFO MainThread:11513 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
| 30 |
+
2024-08-04 14:36:15,895 WARNING MsgRouterThr:11513 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
ADDED
|
Binary file (20.4 kB). View file
|
|
|
wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
sharding_strategy:
|
| 4 |
+
desc: null
|
| 5 |
+
value: FULL_SHARD
|
| 6 |
+
checkpoint_type:
|
| 7 |
+
desc: null
|
| 8 |
+
value: LOCAL_STATE_DICT
|
| 9 |
+
fsdp_activation_checkpointing:
|
| 10 |
+
desc: null
|
| 11 |
+
value: true
|
| 12 |
+
fsdp_cpu_offload:
|
| 13 |
+
desc: null
|
| 14 |
+
value: false
|
| 15 |
+
low_cpu_fsdp:
|
| 16 |
+
desc: null
|
| 17 |
+
value: false
|
| 18 |
+
no_meta_device:
|
| 19 |
+
desc: null
|
| 20 |
+
value: false
|
| 21 |
+
data_path:
|
| 22 |
+
desc: null
|
| 23 |
+
value: null
|
| 24 |
+
split:
|
| 25 |
+
desc: null
|
| 26 |
+
value: 969, 30, 1
|
| 27 |
+
train_data_path:
|
| 28 |
+
desc: null
|
| 29 |
+
value:
|
| 30 |
+
- '235289369'
|
| 31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
| 32 |
+
valid_data_path:
|
| 33 |
+
desc: null
|
| 34 |
+
value:
|
| 35 |
+
- '235289369'
|
| 36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
| 37 |
+
test_data_path:
|
| 38 |
+
desc: null
|
| 39 |
+
value:
|
| 40 |
+
- '235289369'
|
| 41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
|
| 42 |
+
data_cache_path:
|
| 43 |
+
desc: null
|
| 44 |
+
value: null
|
| 45 |
+
vocab_size:
|
| 46 |
+
desc: null
|
| 47 |
+
value: null
|
| 48 |
+
vocab_file:
|
| 49 |
+
desc: null
|
| 50 |
+
value: null
|
| 51 |
+
merge_file:
|
| 52 |
+
desc: null
|
| 53 |
+
value: null
|
| 54 |
+
seq_length:
|
| 55 |
+
desc: null
|
| 56 |
+
value: 4096
|
| 57 |
+
num_workers:
|
| 58 |
+
desc: null
|
| 59 |
+
value: 2
|
| 60 |
+
tokenizer_type:
|
| 61 |
+
desc: null
|
| 62 |
+
value: HFPreTrainedTokenizer
|
| 63 |
+
tokenizer_model:
|
| 64 |
+
desc: null
|
| 65 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
| 66 |
+
reset_position_ids:
|
| 67 |
+
desc: null
|
| 68 |
+
value: false
|
| 69 |
+
reset_attention_mask:
|
| 70 |
+
desc: null
|
| 71 |
+
value: false
|
| 72 |
+
eod_mask_loss:
|
| 73 |
+
desc: null
|
| 74 |
+
value: false
|
| 75 |
+
retro_return_doc_ids:
|
| 76 |
+
desc: null
|
| 77 |
+
value: false
|
| 78 |
+
short_seq_prob:
|
| 79 |
+
desc: null
|
| 80 |
+
value: 0.1
|
| 81 |
+
vocab_extra_ids:
|
| 82 |
+
desc: null
|
| 83 |
+
value: 0
|
| 84 |
+
seed:
|
| 85 |
+
desc: null
|
| 86 |
+
value: 1234
|
| 87 |
+
use_mpi:
|
| 88 |
+
desc: null
|
| 89 |
+
value: false
|
| 90 |
+
wandb_entity:
|
| 91 |
+
desc: null
|
| 92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
| 93 |
+
wandb_name:
|
| 94 |
+
desc: null
|
| 95 |
+
value: yans-sample-gemma-2-2b_train_2024-08-04-22:11:21
|
| 96 |
+
wandb_project:
|
| 97 |
+
desc: null
|
| 98 |
+
value: llm_tutorial
|
| 99 |
+
quantization:
|
| 100 |
+
desc: null
|
| 101 |
+
value: false
|
| 102 |
+
use_freeze_layers:
|
| 103 |
+
desc: null
|
| 104 |
+
value: false
|
| 105 |
+
freeze_layers:
|
| 106 |
+
desc: null
|
| 107 |
+
value: null
|
| 108 |
+
bf16:
|
| 109 |
+
desc: null
|
| 110 |
+
value: true
|
| 111 |
+
fp16:
|
| 112 |
+
desc: null
|
| 113 |
+
value: false
|
| 114 |
+
mixed_precision:
|
| 115 |
+
desc: null
|
| 116 |
+
value: true
|
| 117 |
+
param_dtype:
|
| 118 |
+
desc: null
|
| 119 |
+
value: null
|
| 120 |
+
load:
|
| 121 |
+
desc: null
|
| 122 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
| 123 |
+
save:
|
| 124 |
+
desc: null
|
| 125 |
+
value: /work/llm_recipes/models/yans-sample-gemma-2-2b
|
| 126 |
+
base_model:
|
| 127 |
+
desc: null
|
| 128 |
+
value: /share/pretrained_lm/google/gemma-2-2b
|
| 129 |
+
use_better_transformer:
|
| 130 |
+
desc: null
|
| 131 |
+
value: false
|
| 132 |
+
grad_clip_norm:
|
| 133 |
+
desc: null
|
| 134 |
+
value: 1.0
|
| 135 |
+
eval_interval:
|
| 136 |
+
desc: null
|
| 137 |
+
value: 200
|
| 138 |
+
save_interval:
|
| 139 |
+
desc: null
|
| 140 |
+
value: 200
|
| 141 |
+
eval_iters:
|
| 142 |
+
desc: null
|
| 143 |
+
value: 10
|
| 144 |
+
optimizer:
|
| 145 |
+
desc: null
|
| 146 |
+
value: anyprecision
|
| 147 |
+
lr:
|
| 148 |
+
desc: null
|
| 149 |
+
value: 2.0e-05
|
| 150 |
+
lr_decay_style:
|
| 151 |
+
desc: null
|
| 152 |
+
value: cosine
|
| 153 |
+
lr_decay_iters:
|
| 154 |
+
desc: null
|
| 155 |
+
value: 20000
|
| 156 |
+
lr_warmup_iters:
|
| 157 |
+
desc: null
|
| 158 |
+
value: 500
|
| 159 |
+
min_lr:
|
| 160 |
+
desc: null
|
| 161 |
+
value: 1.0e-06
|
| 162 |
+
train_iters:
|
| 163 |
+
desc: null
|
| 164 |
+
value: 20000
|
| 165 |
+
train_samples:
|
| 166 |
+
desc: null
|
| 167 |
+
value: null
|
| 168 |
+
global_batch_size:
|
| 169 |
+
desc: null
|
| 170 |
+
value: 320
|
| 171 |
+
micro_batch_size:
|
| 172 |
+
desc: null
|
| 173 |
+
value: 2
|
| 174 |
+
make_vocab_size_divisible_by:
|
| 175 |
+
desc: null
|
| 176 |
+
value: 128
|
| 177 |
+
sliding_window_size:
|
| 178 |
+
desc: null
|
| 179 |
+
value: 4096
|
| 180 |
+
skip_batch:
|
| 181 |
+
desc: null
|
| 182 |
+
value: null
|
| 183 |
+
no_save_optimizer_state:
|
| 184 |
+
desc: null
|
| 185 |
+
value: false
|
| 186 |
+
continual_pretraining:
|
| 187 |
+
desc: null
|
| 188 |
+
value: false
|
| 189 |
+
instruction_tuning:
|
| 190 |
+
desc: null
|
| 191 |
+
value: false
|
| 192 |
+
direct_preference_optimization:
|
| 193 |
+
desc: null
|
| 194 |
+
value: false
|
| 195 |
+
attention_dropout:
|
| 196 |
+
desc: null
|
| 197 |
+
value: 0.1
|
| 198 |
+
hidden_dropout:
|
| 199 |
+
desc: null
|
| 200 |
+
value: 0.1
|
| 201 |
+
weight_decay:
|
| 202 |
+
desc: null
|
| 203 |
+
value: 0.1
|
| 204 |
+
adam_beta1:
|
| 205 |
+
desc: null
|
| 206 |
+
value: 0.9
|
| 207 |
+
adam_beta2:
|
| 208 |
+
desc: null
|
| 209 |
+
value: 0.95
|
| 210 |
+
adam_eps:
|
| 211 |
+
desc: null
|
| 212 |
+
value: 1.0e-06
|
| 213 |
+
hf_transformer_model_dir:
|
| 214 |
+
desc: null
|
| 215 |
+
value: null
|
| 216 |
+
instruction_train_data_path:
|
| 217 |
+
desc: null
|
| 218 |
+
value: null
|
| 219 |
+
instruction_valid_data_path:
|
| 220 |
+
desc: null
|
| 221 |
+
value: null
|
| 222 |
+
epoch:
|
| 223 |
+
desc: null
|
| 224 |
+
value: null
|
| 225 |
+
instruction_dataset_size:
|
| 226 |
+
desc: null
|
| 227 |
+
value: null
|
| 228 |
+
save_sampler_state:
|
| 229 |
+
desc: null
|
| 230 |
+
value: false
|
| 231 |
+
label_smoothing:
|
| 232 |
+
desc: null
|
| 233 |
+
value: 0.0
|
| 234 |
+
save_n_checkpoints:
|
| 235 |
+
desc: null
|
| 236 |
+
value: 10
|
| 237 |
+
hf_repo_id:
|
| 238 |
+
desc: null
|
| 239 |
+
value: koichi12/yans-sample-gemma-2-2b
|
| 240 |
+
create_public_hf_repo:
|
| 241 |
+
desc: null
|
| 242 |
+
value: false
|
| 243 |
+
upload_all_checkpoints_to_hf:
|
| 244 |
+
desc: null
|
| 245 |
+
value: false
|
| 246 |
+
hf_upload_retry_limit:
|
| 247 |
+
desc: null
|
| 248 |
+
value: 2
|
| 249 |
+
exit_duration_in_mins:
|
| 250 |
+
desc: null
|
| 251 |
+
value: null
|
| 252 |
+
source_key:
|
| 253 |
+
desc: null
|
| 254 |
+
value: null
|
| 255 |
+
target_key:
|
| 256 |
+
desc: null
|
| 257 |
+
value: null
|
| 258 |
+
attn_implementation:
|
| 259 |
+
desc: null
|
| 260 |
+
value: flash_attention_2
|
| 261 |
+
efficient_instruction_tuning:
|
| 262 |
+
desc: null
|
| 263 |
+
value: false
|
| 264 |
+
remove_padding_masking:
|
| 265 |
+
desc: null
|
| 266 |
+
value: false
|
| 267 |
+
save_start_iter:
|
| 268 |
+
desc: null
|
| 269 |
+
value: null
|
| 270 |
+
rank:
|
| 271 |
+
desc: null
|
| 272 |
+
value: 0
|
| 273 |
+
world_size:
|
| 274 |
+
desc: null
|
| 275 |
+
value: 1
|
| 276 |
+
padded_vocab_size:
|
| 277 |
+
desc: null
|
| 278 |
+
value: 256000
|
| 279 |
+
gradient_accumulation_steps:
|
| 280 |
+
desc: null
|
| 281 |
+
value: 160
|
| 282 |
+
_wandb:
|
| 283 |
+
desc: null
|
| 284 |
+
value:
|
| 285 |
+
python_version: 3.10.12
|
| 286 |
+
cli_version: 0.16.3
|
| 287 |
+
framework: huggingface
|
| 288 |
+
huggingface_version: 4.43.3
|
| 289 |
+
is_jupyter_run: false
|
| 290 |
+
is_kaggle_kernel: false
|
| 291 |
+
start_time: 1722777092.265577
|
| 292 |
+
t:
|
| 293 |
+
1:
|
| 294 |
+
- 1
|
| 295 |
+
- 11
|
| 296 |
+
- 49
|
| 297 |
+
- 55
|
| 298 |
+
- 71
|
| 299 |
+
2:
|
| 300 |
+
- 1
|
| 301 |
+
- 11
|
| 302 |
+
- 49
|
| 303 |
+
- 55
|
| 304 |
+
- 71
|
| 305 |
+
3:
|
| 306 |
+
- 13
|
| 307 |
+
- 16
|
| 308 |
+
- 23
|
| 309 |
+
4: 3.10.12
|
| 310 |
+
5: 0.16.3
|
| 311 |
+
6: 4.43.3
|
| 312 |
+
8:
|
| 313 |
+
- 5
|
| 314 |
+
13: linux-x86_64
|
| 315 |
+
activation_function:
|
| 316 |
+
desc: null
|
| 317 |
+
value: gelu_pytorch_tanh
|
| 318 |
+
hidden_size:
|
| 319 |
+
desc: null
|
| 320 |
+
value: 2304
|
| 321 |
+
model_type:
|
| 322 |
+
desc: null
|
| 323 |
+
value: gemma2
|
| 324 |
+
max_position_embeddings:
|
| 325 |
+
desc: null
|
| 326 |
+
value: 4096
|
| 327 |
+
num_attention_heads:
|
| 328 |
+
desc: null
|
| 329 |
+
value: 8
|
| 330 |
+
num_hidden_layers:
|
| 331 |
+
desc: null
|
| 332 |
+
value: 26
|
| 333 |
+
model_architecture:
|
| 334 |
+
desc: null
|
| 335 |
+
value: Gemma2ForCausalLM
|
wandb/run-20240804_221132-o8ieoj9i/files/output.log
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 8 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [02:29<01:15, 75.36s/it]
|
| 12 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 13 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 14 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
|
| 15 |
+
--> Model /share/pretrained_lm/google/gemma-2-2b
|
| 16 |
+
--> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
|
| 17 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
| 18 |
+
--> applying fsdp activation checkpointing...
|
| 19 |
+
> datasets target sizes (minimum size):
|
| 20 |
+
train: 6400000
|
| 21 |
+
validation: 323200
|
| 22 |
+
test: 3200
|
| 23 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [02:38<00:00, 52.69s/it]
|
| 24 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
| 25 |
+
warnings.warn(
|
| 26 |
+
Let split = None
|
| 27 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 28 |
+
> finished creating GPT datasets ...
|
| 29 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 30 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 31 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
|
| 32 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 33 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 34 |
+
model info: FullyShardedDataParallel(
|
| 35 |
+
(_fsdp_wrapped_module): Gemma2ForCausalLM(
|
| 36 |
+
(model): Gemma2Model(
|
| 37 |
+
(embed_tokens): Embedding(256000, 2304, padding_idx=0)
|
| 38 |
+
(layers): ModuleList(
|
| 39 |
+
(0-25): 26 x FullyShardedDataParallel(
|
| 40 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
| 41 |
+
(_checkpoint_wrapped_module): Gemma2DecoderLayer(
|
| 42 |
+
(self_attn): Gemma2FlashAttention2(
|
| 43 |
+
(q_proj): Linear(in_features=2304, out_features=2048, bias=False)
|
| 44 |
+
(k_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
| 45 |
+
(v_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
| 46 |
+
(o_proj): Linear(in_features=2048, out_features=2304, bias=False)
|
| 47 |
+
(rotary_emb): Gemma2RotaryEmbedding()
|
| 48 |
+
)
|
| 49 |
+
(mlp): Gemma2MLP(
|
| 50 |
+
(gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
| 51 |
+
(up_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
| 52 |
+
(down_proj): Linear(in_features=9216, out_features=2304, bias=False)
|
| 53 |
+
(act_fn): PytorchGELUTanh()
|
| 54 |
+
)
|
| 55 |
+
(input_layernorm): Gemma2RMSNorm()
|
| 56 |
+
(post_attention_layernorm): Gemma2RMSNorm()
|
| 57 |
+
(pre_feedforward_layernorm): Gemma2RMSNorm()
|
| 58 |
+
(post_feedforward_layernorm): Gemma2RMSNorm()
|
| 59 |
+
)
|
| 60 |
+
)
|
| 61 |
+
)
|
| 62 |
+
)
|
| 63 |
+
(norm): Gemma2RMSNorm()
|
| 64 |
+
)
|
| 65 |
+
(lm_head): Linear(in_features=2304, out_features=256000, bias=False)
|
| 66 |
+
)
|
| 67 |
+
)
|
| 68 |
+
model config: Gemma2Config {
|
| 69 |
+
"_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
|
| 70 |
+
"architectures": [
|
| 71 |
+
"Gemma2ForCausalLM"
|
| 72 |
+
],
|
| 73 |
+
"attention_bias": false,
|
| 74 |
+
"attention_dropout": 0.0,
|
| 75 |
+
"attn_logit_softcapping": 50.0,
|
| 76 |
+
"bos_token_id": 2,
|
| 77 |
+
"cache_implementation": "hybrid",
|
| 78 |
+
"eos_token_id": 1,
|
| 79 |
+
"final_logit_softcapping": 30.0,
|
| 80 |
+
"head_dim": 256,
|
| 81 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 82 |
+
"hidden_activation": "gelu_pytorch_tanh",
|
| 83 |
+
"hidden_size": 2304,
|
| 84 |
+
"initializer_range": 0.02,
|
| 85 |
+
"intermediate_size": 9216,
|
| 86 |
+
"label_smoothing": 0.0,
|
| 87 |
+
"max_position_embeddings": 4096,
|
| 88 |
+
"model_type": "gemma2",
|
| 89 |
+
"num_attention_heads": 8,
|
| 90 |
+
"num_hidden_layers": 26,
|
| 91 |
+
"num_key_value_heads": 4,
|
| 92 |
+
"pad_token_id": 0,
|
| 93 |
+
"query_pre_attn_scalar": 256,
|
| 94 |
+
"rms_norm_eps": 1e-06,
|
| 95 |
+
"rope_theta": 10000.0,
|
| 96 |
+
"sliding_window": 4096,
|
| 97 |
+
"torch_dtype": "float32",
|
| 98 |
+
"transformers_version": "4.43.3",
|
| 99 |
+
"use_cache": false,
|
| 100 |
+
"vocab_size": 256000
|
| 101 |
+
}
|
| 102 |
+
Unable to save the indexes because path_to_cache is None
|
| 103 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 104 |
+
Unable to save the indexes because path_to_cache is None
|
| 105 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 106 |
+
Unable to save the indexes because path_to_cache is None
|
| 107 |
+
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
|
| 108 |
+
Traceback (most recent call last):
|
| 109 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 110 |
+
main()
|
| 111 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
| 112 |
+
train(
|
| 113 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
|
| 114 |
+
loss: torch.Tensor = model(**batch).loss
|
| 115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 116 |
+
return self._call_impl(*args, **kwargs)
|
| 117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 118 |
+
return forward_call(*args, **kwargs)
|
| 119 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
| 120 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
| 121 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 122 |
+
return self._call_impl(*args, **kwargs)
|
| 123 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 124 |
+
return forward_call(*args, **kwargs)
|
| 125 |
+
File "/project/lib/transformers/src/transformers/models/gemma2/modeling_gemma2.py", line 976, in forward
|
| 126 |
+
loss = loss_fct(shift_logits, shift_labels)
|
| 127 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 128 |
+
return self._call_impl(*args, **kwargs)
|
| 129 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 130 |
+
return forward_call(*args, **kwargs)
|
| 131 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py", line 1179, in forward
|
| 132 |
+
return F.cross_entropy(input, target, weight=self.weight,
|
| 133 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 3086, in cross_entropy
|
| 134 |
+
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
|
| 135 |
+
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.81 GiB. GPU 0 has a total capacity of 39.39 GiB of which 7.81 GiB is free. Including non-PyTorch memory, this process has 31.58 GiB memory in use. Of the allocated memory 30.38 GiB is allocated by PyTorch, and 385.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
accelerate==0.33.0
|
| 3 |
+
aiohttp==3.9.1
|
| 4 |
+
aiosignal==1.3.1
|
| 5 |
+
annotated-types==0.6.0
|
| 6 |
+
apex==0.1
|
| 7 |
+
appdirs==1.4.4
|
| 8 |
+
argon2-cffi-bindings==21.2.0
|
| 9 |
+
argon2-cffi==23.1.0
|
| 10 |
+
asttokens==2.4.1
|
| 11 |
+
astunparse==1.6.3
|
| 12 |
+
async-timeout==4.0.3
|
| 13 |
+
attrs==23.2.0
|
| 14 |
+
audioread==3.0.1
|
| 15 |
+
beautifulsoup4==4.12.3
|
| 16 |
+
bleach==6.1.0
|
| 17 |
+
blis==0.7.11
|
| 18 |
+
cachetools==5.3.2
|
| 19 |
+
catalogue==2.0.10
|
| 20 |
+
certifi==2024.2.2
|
| 21 |
+
cffi==1.16.0
|
| 22 |
+
charset-normalizer==3.3.2
|
| 23 |
+
click==8.1.7
|
| 24 |
+
cloudpathlib==0.16.0
|
| 25 |
+
cloudpickle==3.0.0
|
| 26 |
+
cmake==3.28.1
|
| 27 |
+
colorama==0.4.6
|
| 28 |
+
comm==0.2.1
|
| 29 |
+
confection==0.1.4
|
| 30 |
+
contourpy==1.2.0
|
| 31 |
+
cubinlinker==0.3.0+2.g405ac64
|
| 32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
| 33 |
+
cudf==23.12.0
|
| 34 |
+
cugraph-dgl==23.12.0
|
| 35 |
+
cugraph-service-client==23.12.0
|
| 36 |
+
cugraph-service-server==23.12.0
|
| 37 |
+
cugraph==23.12.0
|
| 38 |
+
cuml==23.12.0
|
| 39 |
+
cupy-cuda12x==12.3.0
|
| 40 |
+
cycler==0.12.1
|
| 41 |
+
cymem==2.0.8
|
| 42 |
+
cython==3.0.8
|
| 43 |
+
dask-cuda==23.12.0
|
| 44 |
+
dask-cudf==23.12.0
|
| 45 |
+
dask==2023.11.0
|
| 46 |
+
debugpy==1.8.1
|
| 47 |
+
decorator==5.1.1
|
| 48 |
+
defusedxml==0.7.1
|
| 49 |
+
distributed==2023.11.0
|
| 50 |
+
dm-tree==0.1.8
|
| 51 |
+
docker-pycreds==0.4.0
|
| 52 |
+
einops==0.7.0
|
| 53 |
+
exceptiongroup==1.2.0
|
| 54 |
+
execnet==2.0.2
|
| 55 |
+
executing==2.0.1
|
| 56 |
+
expecttest==0.1.3
|
| 57 |
+
fastjsonschema==2.19.1
|
| 58 |
+
fastrlock==0.8.2
|
| 59 |
+
filelock==3.13.1
|
| 60 |
+
flash-attn==2.4.2
|
| 61 |
+
fonttools==4.48.1
|
| 62 |
+
frozenlist==1.4.1
|
| 63 |
+
fsspec==2023.12.2
|
| 64 |
+
gast==0.5.4
|
| 65 |
+
gitdb==4.0.11
|
| 66 |
+
gitpython==3.1.43
|
| 67 |
+
google-auth-oauthlib==0.4.6
|
| 68 |
+
google-auth==2.27.0
|
| 69 |
+
graphsurgeon==0.4.6
|
| 70 |
+
grpcio==1.60.1
|
| 71 |
+
huggingface-hub==0.24.5
|
| 72 |
+
hypothesis==5.35.1
|
| 73 |
+
idna==3.6
|
| 74 |
+
importlib-metadata==7.0.1
|
| 75 |
+
iniconfig==2.0.0
|
| 76 |
+
intel-openmp==2021.4.0
|
| 77 |
+
ipadic==1.0.0
|
| 78 |
+
ipykernel==6.29.2
|
| 79 |
+
ipython-genutils==0.2.0
|
| 80 |
+
ipython==8.21.0
|
| 81 |
+
jedi==0.19.1
|
| 82 |
+
jinja2==3.1.3
|
| 83 |
+
joblib==1.3.2
|
| 84 |
+
json5==0.9.14
|
| 85 |
+
jsonnet==0.19.1
|
| 86 |
+
jsonschema-specifications==2023.12.1
|
| 87 |
+
jsonschema==4.21.1
|
| 88 |
+
jupyter-client==8.6.0
|
| 89 |
+
jupyter-core==5.7.1
|
| 90 |
+
jupyter-tensorboard==0.2.0
|
| 91 |
+
jupyterlab-pygments==0.3.0
|
| 92 |
+
jupyterlab-server==1.2.0
|
| 93 |
+
jupyterlab==2.3.2
|
| 94 |
+
jupytext==1.16.1
|
| 95 |
+
kiwisolver==1.4.5
|
| 96 |
+
langcodes==3.3.0
|
| 97 |
+
lazy-loader==0.3
|
| 98 |
+
librosa==0.10.1
|
| 99 |
+
llvmlite==0.40.1
|
| 100 |
+
locket==1.0.0
|
| 101 |
+
logzero==1.7.0
|
| 102 |
+
lxml==5.2.2
|
| 103 |
+
markdown-it-py==3.0.0
|
| 104 |
+
markdown==3.5.2
|
| 105 |
+
markupsafe==2.1.4
|
| 106 |
+
matplotlib-inline==0.1.6
|
| 107 |
+
matplotlib==3.8.2
|
| 108 |
+
mdit-py-plugins==0.4.0
|
| 109 |
+
mdurl==0.1.2
|
| 110 |
+
mecab-python3==1.0.6
|
| 111 |
+
mistune==3.0.2
|
| 112 |
+
mkl-devel==2021.1.1
|
| 113 |
+
mkl-include==2021.1.1
|
| 114 |
+
mkl==2021.1.1
|
| 115 |
+
mock==5.1.0
|
| 116 |
+
more-itertools==9.1.0
|
| 117 |
+
mpmath==1.3.0
|
| 118 |
+
msgpack==1.0.7
|
| 119 |
+
multidict==6.0.4
|
| 120 |
+
murmurhash==1.0.10
|
| 121 |
+
nbclient==0.9.0
|
| 122 |
+
nbconvert==7.16.0
|
| 123 |
+
nbformat==5.9.2
|
| 124 |
+
nest-asyncio==1.6.0
|
| 125 |
+
networkx==2.6.3
|
| 126 |
+
ninja==1.11.1.1
|
| 127 |
+
nltk==3.8.1
|
| 128 |
+
notebook==6.4.10
|
| 129 |
+
numba==0.57.1+1.g1ff679645
|
| 130 |
+
numpy==1.24.4
|
| 131 |
+
nvfuser==0.1.4a0+d0bb811
|
| 132 |
+
nvidia-dali-cuda120==1.34.0
|
| 133 |
+
nvidia-pyindex==1.0.9
|
| 134 |
+
nvtx==0.2.5
|
| 135 |
+
oauthlib==3.2.2
|
| 136 |
+
onnx==1.15.0rc2
|
| 137 |
+
opencv==4.7.0
|
| 138 |
+
optree==0.10.0
|
| 139 |
+
packaging==23.2
|
| 140 |
+
pandas==1.5.3
|
| 141 |
+
pandocfilters==1.5.1
|
| 142 |
+
parso==0.8.3
|
| 143 |
+
partd==1.4.1
|
| 144 |
+
peft==0.11.1
|
| 145 |
+
pexpect==4.9.0
|
| 146 |
+
pillow==10.2.0
|
| 147 |
+
pip==24.0
|
| 148 |
+
platformdirs==4.2.0
|
| 149 |
+
pluggy==1.4.0
|
| 150 |
+
ply==3.11
|
| 151 |
+
polygraphy==0.49.4
|
| 152 |
+
pooch==1.8.0
|
| 153 |
+
portalocker==2.10.1
|
| 154 |
+
preshed==3.0.9
|
| 155 |
+
prettytable==3.9.0
|
| 156 |
+
prometheus-client==0.19.0
|
| 157 |
+
prompt-toolkit==3.0.43
|
| 158 |
+
protobuf==4.24.4
|
| 159 |
+
psutil==5.9.4
|
| 160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
| 161 |
+
ptyprocess==0.7.0
|
| 162 |
+
pure-eval==0.2.2
|
| 163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
| 164 |
+
pyasn1-modules==0.3.0
|
| 165 |
+
pyasn1==0.5.1
|
| 166 |
+
pybind11-global==2.11.1
|
| 167 |
+
pybind11==2.11.1
|
| 168 |
+
pycocotools==2.0+nv0.8.0
|
| 169 |
+
pycparser==2.21
|
| 170 |
+
pydantic-core==2.16.2
|
| 171 |
+
pydantic==2.6.1
|
| 172 |
+
pygments==2.17.2
|
| 173 |
+
pylibcugraph==23.12.0
|
| 174 |
+
pylibcugraphops==23.12.0
|
| 175 |
+
pylibraft==23.12.0
|
| 176 |
+
pynvml==11.4.1
|
| 177 |
+
pyparsing==3.1.1
|
| 178 |
+
pytest-flakefinder==1.1.0
|
| 179 |
+
pytest-rerunfailures==13.0
|
| 180 |
+
pytest-shard==0.1.2
|
| 181 |
+
pytest-xdist==3.5.0
|
| 182 |
+
pytest==8.0.0
|
| 183 |
+
python-dateutil==2.8.2
|
| 184 |
+
python-dotenv==1.0.0
|
| 185 |
+
python-hostlist==1.23.0
|
| 186 |
+
pytorch-quantization==2.1.2
|
| 187 |
+
pytz==2023.3.post1
|
| 188 |
+
pyyaml==6.0.1
|
| 189 |
+
pyzmq==25.1.2
|
| 190 |
+
raft-dask==23.12.0
|
| 191 |
+
rapids-dask-dependency==23.12.1
|
| 192 |
+
referencing==0.33.0
|
| 193 |
+
regex==2023.12.25
|
| 194 |
+
requests-oauthlib==1.3.1
|
| 195 |
+
requests==2.31.0
|
| 196 |
+
rich==13.7.0
|
| 197 |
+
rmm==23.12.0
|
| 198 |
+
rpds-py==0.17.1
|
| 199 |
+
rsa==4.9
|
| 200 |
+
sacrebleu==2.4.0
|
| 201 |
+
safetensors==0.4.3
|
| 202 |
+
scikit-learn==1.2.0
|
| 203 |
+
scipy==1.12.0
|
| 204 |
+
send2trash==1.8.2
|
| 205 |
+
sentencepiece==0.1.99
|
| 206 |
+
sentry-sdk==2.12.0
|
| 207 |
+
setproctitle==1.3.3
|
| 208 |
+
setuptools==68.2.2
|
| 209 |
+
six==1.16.0
|
| 210 |
+
smart-open==6.4.0
|
| 211 |
+
smmap==5.0.1
|
| 212 |
+
sortedcontainers==2.4.0
|
| 213 |
+
soundfile==0.12.1
|
| 214 |
+
soupsieve==2.5
|
| 215 |
+
soxr==0.3.7
|
| 216 |
+
spacy-legacy==3.0.12
|
| 217 |
+
spacy-loggers==1.0.5
|
| 218 |
+
spacy==3.7.2
|
| 219 |
+
sphinx-glpi-theme==0.6
|
| 220 |
+
srsly==2.4.8
|
| 221 |
+
stack-data==0.6.3
|
| 222 |
+
sympy==1.12
|
| 223 |
+
tabulate==0.9.0
|
| 224 |
+
tbb==2021.11.0
|
| 225 |
+
tblib==3.0.0
|
| 226 |
+
tensorboard-data-server==0.6.1
|
| 227 |
+
tensorboard-plugin-wit==1.8.1
|
| 228 |
+
tensorboard==2.9.0
|
| 229 |
+
tensorrt==8.6.3
|
| 230 |
+
terminado==0.18.0
|
| 231 |
+
termplotlib==0.3.9
|
| 232 |
+
thinc==8.2.3
|
| 233 |
+
threadpoolctl==3.2.0
|
| 234 |
+
thriftpy2==0.4.17
|
| 235 |
+
tinycss2==1.2.1
|
| 236 |
+
tokenizers==0.19.1
|
| 237 |
+
toml==0.10.2
|
| 238 |
+
tomli==2.0.1
|
| 239 |
+
toolz==0.12.1
|
| 240 |
+
torch-tensorrt==2.3.0a0
|
| 241 |
+
torch==2.3.0a0+ebedce2
|
| 242 |
+
torchdata==0.7.1a0
|
| 243 |
+
torchtext==0.17.0a0
|
| 244 |
+
torchvision==0.18.0a0
|
| 245 |
+
tornado==6.4
|
| 246 |
+
tqdm==4.66.1
|
| 247 |
+
traitlets==5.9.0
|
| 248 |
+
transformer-engine==1.3.0+5b90b7f
|
| 249 |
+
transformers==4.43.3
|
| 250 |
+
treelite-runtime==3.9.1
|
| 251 |
+
treelite==3.9.1
|
| 252 |
+
triton==2.2.0+e28a256
|
| 253 |
+
typer==0.9.0
|
| 254 |
+
types-dataclasses==0.6.6
|
| 255 |
+
typing-extensions==4.9.0
|
| 256 |
+
ucx-py==0.35.0
|
| 257 |
+
uff==0.6.9
|
| 258 |
+
ujson==5.8.0
|
| 259 |
+
urllib3==1.26.18
|
| 260 |
+
wandb==0.16.3
|
| 261 |
+
wasabi==1.1.2
|
| 262 |
+
wcwidth==0.2.13
|
| 263 |
+
weasel==0.3.4
|
| 264 |
+
webencodings==0.5.1
|
| 265 |
+
werkzeug==3.0.1
|
| 266 |
+
wheel==0.42.0
|
| 267 |
+
xdoctest==1.0.2
|
| 268 |
+
xgboost==1.7.6
|
| 269 |
+
yarl==1.9.4
|
| 270 |
+
zict==3.0.0
|
| 271 |
+
zipp==3.17.0
|
wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "3.10.12",
|
| 4 |
+
"heartbeatAt": "2024-08-04T13:11:32.902217",
|
| 5 |
+
"startedAt": "2024-08-04T13:11:32.253120",
|
| 6 |
+
"docker": null,
|
| 7 |
+
"cuda": null,
|
| 8 |
+
"args": [
|
| 9 |
+
"--seq-length",
|
| 10 |
+
"4096",
|
| 11 |
+
"--sliding-window-size",
|
| 12 |
+
"4096",
|
| 13 |
+
"--micro-batch-size",
|
| 14 |
+
"2",
|
| 15 |
+
"--global-batch-size",
|
| 16 |
+
"320",
|
| 17 |
+
"--train-iters",
|
| 18 |
+
"20000",
|
| 19 |
+
"--tokenizer-type",
|
| 20 |
+
"HFPreTrainedTokenizer",
|
| 21 |
+
"--tokenizer-model",
|
| 22 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
| 23 |
+
"--train-data-path",
|
| 24 |
+
"235289369",
|
| 25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
| 26 |
+
"--valid-data-path",
|
| 27 |
+
"235289369",
|
| 28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
| 29 |
+
"--test-data-path",
|
| 30 |
+
"235289369",
|
| 31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
|
| 32 |
+
"--lr",
|
| 33 |
+
"2e-5",
|
| 34 |
+
"--min-lr",
|
| 35 |
+
"1e-6",
|
| 36 |
+
"--lr-decay-style",
|
| 37 |
+
"cosine",
|
| 38 |
+
"--lr-warmup-iters",
|
| 39 |
+
"500",
|
| 40 |
+
"--lr-decay-iters",
|
| 41 |
+
"20000",
|
| 42 |
+
"--weight-decay",
|
| 43 |
+
"0.1",
|
| 44 |
+
"--grad-clip-norm",
|
| 45 |
+
"1.0",
|
| 46 |
+
"--optimizer",
|
| 47 |
+
"anyprecision",
|
| 48 |
+
"--adam-beta1",
|
| 49 |
+
"0.9",
|
| 50 |
+
"--adam-beta2",
|
| 51 |
+
"0.95",
|
| 52 |
+
"--adam-eps",
|
| 53 |
+
"1e-6",
|
| 54 |
+
"--save-interval",
|
| 55 |
+
"200",
|
| 56 |
+
"--eval-interval",
|
| 57 |
+
"200",
|
| 58 |
+
"--eval-iters",
|
| 59 |
+
"10",
|
| 60 |
+
"--bf16",
|
| 61 |
+
"--mixed-precision",
|
| 62 |
+
"--base-model",
|
| 63 |
+
"/share/pretrained_lm/google/gemma-2-2b",
|
| 64 |
+
"--save",
|
| 65 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
| 66 |
+
"--load",
|
| 67 |
+
"/work/llm_recipes/models/yans-sample-gemma-2-2b",
|
| 68 |
+
"--fsdp-activation-checkpointing",
|
| 69 |
+
"--sharding-strategy",
|
| 70 |
+
"FULL_SHARD",
|
| 71 |
+
"--checkpoint-type",
|
| 72 |
+
"LOCAL_STATE_DICT",
|
| 73 |
+
"--save-n-checkpoints",
|
| 74 |
+
"10",
|
| 75 |
+
"--hf-upload-retry-limit",
|
| 76 |
+
"2",
|
| 77 |
+
"--hf-repo-id",
|
| 78 |
+
"koichi12/yans-sample-gemma-2-2b",
|
| 79 |
+
"--wandb-entity",
|
| 80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
| 81 |
+
"--wandb-project",
|
| 82 |
+
"llm_tutorial",
|
| 83 |
+
"--wandb-name",
|
| 84 |
+
"yans-sample-gemma-2-2b_train_2024-08-04-22:11:21"
|
| 85 |
+
],
|
| 86 |
+
"state": "running",
|
| 87 |
+
"program": "/project/examples/finetuning.py",
|
| 88 |
+
"codePathLocal": "examples/finetuning.py",
|
| 89 |
+
"codePath": "examples/finetuning.py",
|
| 90 |
+
"git": {
|
| 91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
| 92 |
+
"commit": "0336bd6c20fe25d78eda1d14afa66c1ae2e6d687"
|
| 93 |
+
},
|
| 94 |
+
"email": null,
|
| 95 |
+
"root": "/project",
|
| 96 |
+
"host": "gpu-koiwa-00",
|
| 97 |
+
"username": "koiwa",
|
| 98 |
+
"executable": "/usr/bin/python",
|
| 99 |
+
"cpu_count": 18,
|
| 100 |
+
"cpu_count_logical": 18,
|
| 101 |
+
"cpu_freq": {
|
| 102 |
+
"current": 2400.044999999999,
|
| 103 |
+
"min": 0.0,
|
| 104 |
+
"max": 0.0
|
| 105 |
+
},
|
| 106 |
+
"cpu_freq_per_core": [
|
| 107 |
+
{
|
| 108 |
+
"current": 2400.045,
|
| 109 |
+
"min": 0.0,
|
| 110 |
+
"max": 0.0
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"current": 2400.045,
|
| 114 |
+
"min": 0.0,
|
| 115 |
+
"max": 0.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"current": 2400.045,
|
| 119 |
+
"min": 0.0,
|
| 120 |
+
"max": 0.0
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"current": 2400.045,
|
| 124 |
+
"min": 0.0,
|
| 125 |
+
"max": 0.0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"current": 2400.045,
|
| 129 |
+
"min": 0.0,
|
| 130 |
+
"max": 0.0
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"current": 2400.045,
|
| 134 |
+
"min": 0.0,
|
| 135 |
+
"max": 0.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"current": 2400.045,
|
| 139 |
+
"min": 0.0,
|
| 140 |
+
"max": 0.0
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"current": 2400.045,
|
| 144 |
+
"min": 0.0,
|
| 145 |
+
"max": 0.0
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"current": 2400.045,
|
| 149 |
+
"min": 0.0,
|
| 150 |
+
"max": 0.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"current": 2400.045,
|
| 154 |
+
"min": 0.0,
|
| 155 |
+
"max": 0.0
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"current": 2400.045,
|
| 159 |
+
"min": 0.0,
|
| 160 |
+
"max": 0.0
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"current": 2400.045,
|
| 164 |
+
"min": 0.0,
|
| 165 |
+
"max": 0.0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"current": 2400.045,
|
| 169 |
+
"min": 0.0,
|
| 170 |
+
"max": 0.0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"current": 2400.045,
|
| 174 |
+
"min": 0.0,
|
| 175 |
+
"max": 0.0
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"current": 2400.045,
|
| 179 |
+
"min": 0.0,
|
| 180 |
+
"max": 0.0
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"current": 2400.045,
|
| 184 |
+
"min": 0.0,
|
| 185 |
+
"max": 0.0
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"current": 2400.045,
|
| 189 |
+
"min": 0.0,
|
| 190 |
+
"max": 0.0
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"current": 2400.045,
|
| 194 |
+
"min": 0.0,
|
| 195 |
+
"max": 0.0
|
| 196 |
+
}
|
| 197 |
+
],
|
| 198 |
+
"disk": {
|
| 199 |
+
"/": {
|
| 200 |
+
"total": 0.0625,
|
| 201 |
+
"used": 1.1444091796875e-05
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
| 205 |
+
"gpu_count": 1,
|
| 206 |
+
"gpu_devices": [
|
| 207 |
+
{
|
| 208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
| 209 |
+
"memory_total": 42949672960
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"memory": {
|
| 213 |
+
"total": 56.48782730102539
|
| 214 |
+
}
|
| 215 |
+
}
|
wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb": {"runtime": 166}}
|
wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 22:11:32,267 INFO StreamThr :12237 [internal.py:wandb_internal():86] W&B internal server running at pid: 12237, started at: 2024-08-04 22:11:32.266168
|
| 2 |
+
2024-08-04 22:11:32,268 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status
|
| 3 |
+
2024-08-04 22:11:32,270 INFO WriterThread:12237 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
|
| 4 |
+
2024-08-04 22:11:32,271 DEBUG SenderThread:12237 [sender.py:send():382] send: header
|
| 5 |
+
2024-08-04 22:11:32,285 DEBUG SenderThread:12237 [sender.py:send():382] send: run
|
| 6 |
+
2024-08-04 22:11:32,779 INFO SenderThread:12237 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_221132-o8ieoj9i/files
|
| 7 |
+
2024-08-04 22:11:32,779 INFO SenderThread:12237 [sender.py:_start_run_threads():1136] run started: o8ieoj9i with start time 1722777092.265577
|
| 8 |
+
2024-08-04 22:11:32,784 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: check_version
|
| 9 |
+
2024-08-04 22:11:32,784 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: check_version
|
| 10 |
+
2024-08-04 22:11:32,884 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: run_start
|
| 11 |
+
2024-08-04 22:11:32,890 DEBUG HandlerThread:12237 [system_info.py:__init__():27] System info init
|
| 12 |
+
2024-08-04 22:11:32,890 DEBUG HandlerThread:12237 [system_info.py:__init__():42] System info init done
|
| 13 |
+
2024-08-04 22:11:32,890 INFO HandlerThread:12237 [system_monitor.py:start():194] Starting system monitor
|
| 14 |
+
2024-08-04 22:11:32,890 INFO SystemMonitor:12237 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
| 15 |
+
2024-08-04 22:11:32,890 INFO HandlerThread:12237 [system_monitor.py:probe():214] Collecting system info
|
| 16 |
+
2024-08-04 22:11:32,891 INFO SystemMonitor:12237 [interfaces.py:start():190] Started cpu monitoring
|
| 17 |
+
2024-08-04 22:11:32,891 INFO SystemMonitor:12237 [interfaces.py:start():190] Started disk monitoring
|
| 18 |
+
2024-08-04 22:11:32,892 INFO SystemMonitor:12237 [interfaces.py:start():190] Started gpu monitoring
|
| 19 |
+
2024-08-04 22:11:32,893 INFO SystemMonitor:12237 [interfaces.py:start():190] Started memory monitoring
|
| 20 |
+
2024-08-04 22:11:32,893 INFO SystemMonitor:12237 [interfaces.py:start():190] Started network monitoring
|
| 21 |
+
2024-08-04 22:11:32,902 DEBUG HandlerThread:12237 [system_info.py:probe():151] Probing system
|
| 22 |
+
2024-08-04 22:11:32,904 DEBUG HandlerThread:12237 [system_info.py:_probe_git():136] Probing git
|
| 23 |
+
2024-08-04 22:11:32,916 DEBUG HandlerThread:12237 [system_info.py:_probe_git():144] Probing git done
|
| 24 |
+
2024-08-04 22:11:32,916 DEBUG HandlerThread:12237 [system_info.py:probe():199] Probing system done
|
| 25 |
+
2024-08-04 22:11:32,916 DEBUG HandlerThread:12237 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T13:11:32.902217', 'startedAt': '2024-08-04T13:11:32.253120', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '2', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-04-22:11:21'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '0336bd6c20fe25d78eda1d14afa66c1ae2e6d687'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.044999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
|
| 26 |
+
2024-08-04 22:11:32,916 INFO HandlerThread:12237 [system_monitor.py:probe():224] Finished collecting system info
|
| 27 |
+
2024-08-04 22:11:32,916 INFO HandlerThread:12237 [system_monitor.py:probe():227] Publishing system info
|
| 28 |
+
2024-08-04 22:11:32,917 INFO HandlerThread:12237 [system_monitor.py:probe():229] Finished publishing system info
|
| 29 |
+
2024-08-04 22:11:32,923 DEBUG SenderThread:12237 [sender.py:send():382] send: files
|
| 30 |
+
2024-08-04 22:11:32,923 INFO SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
| 31 |
+
2024-08-04 22:11:32,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: python_packages
|
| 32 |
+
2024-08-04 22:11:32,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 33 |
+
2024-08-04 22:11:32,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 34 |
+
2024-08-04 22:11:32,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: python_packages
|
| 35 |
+
2024-08-04 22:11:32,935 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 36 |
+
2024-08-04 22:11:33,202 DEBUG SenderThread:12237 [sender.py:send():382] send: telemetry
|
| 37 |
+
2024-08-04 22:11:33,617 INFO wandb-upload_0:12237 [upload_job.py:push():131] Uploaded file /tmp/tmpntsoky67wandb/ybme98wl-wandb-metadata.json
|
| 38 |
+
2024-08-04 22:11:33,780 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
|
| 39 |
+
2024-08-04 22:11:33,781 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json
|
| 40 |
+
2024-08-04 22:11:33,781 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 41 |
+
2024-08-04 22:11:35,781 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 42 |
+
2024-08-04 22:11:37,800 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 43 |
+
2024-08-04 22:11:42,801 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 44 |
+
2024-08-04 22:11:47,802 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 45 |
+
2024-08-04 22:11:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 46 |
+
2024-08-04 22:11:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 47 |
+
2024-08-04 22:11:47,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 48 |
+
2024-08-04 22:11:53,184 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 49 |
+
2024-08-04 22:11:58,184 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 50 |
+
2024-08-04 22:12:02,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 51 |
+
2024-08-04 22:12:02,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 52 |
+
2024-08-04 22:12:02,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 53 |
+
2024-08-04 22:12:04,128 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 54 |
+
2024-08-04 22:12:04,797 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
|
| 55 |
+
2024-08-04 22:12:09,335 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 56 |
+
2024-08-04 22:12:14,336 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 57 |
+
2024-08-04 22:12:17,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 58 |
+
2024-08-04 22:12:17,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 59 |
+
2024-08-04 22:12:17,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 60 |
+
2024-08-04 22:12:20,198 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 61 |
+
2024-08-04 22:12:25,199 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 62 |
+
2024-08-04 22:12:30,199 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 63 |
+
2024-08-04 22:12:32,894 DEBUG SystemMonitor:12237 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
| 64 |
+
2024-08-04 22:12:32,895 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
|
| 65 |
+
2024-08-04 22:12:32,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 66 |
+
2024-08-04 22:12:32,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 67 |
+
2024-08-04 22:12:32,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 68 |
+
2024-08-04 22:12:36,110 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 69 |
+
2024-08-04 22:12:41,111 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 70 |
+
2024-08-04 22:12:45,820 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 71 |
+
2024-08-04 22:12:46,558 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 72 |
+
2024-08-04 22:12:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 73 |
+
2024-08-04 22:12:47,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 74 |
+
2024-08-04 22:12:47,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 75 |
+
2024-08-04 22:12:52,156 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 76 |
+
2024-08-04 22:12:57,157 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 77 |
+
2024-08-04 22:13:02,157 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 78 |
+
2024-08-04 22:13:02,897 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
|
| 79 |
+
2024-08-04 22:13:02,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 80 |
+
2024-08-04 22:13:02,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 81 |
+
2024-08-04 22:13:02,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 82 |
+
2024-08-04 22:13:08,124 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 83 |
+
2024-08-04 22:13:13,125 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 84 |
+
2024-08-04 22:13:17,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 85 |
+
2024-08-04 22:13:17,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 86 |
+
2024-08-04 22:13:17,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 87 |
+
2024-08-04 22:13:18,132 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 88 |
+
2024-08-04 22:13:23,133 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 89 |
+
2024-08-04 22:13:28,134 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 90 |
+
2024-08-04 22:13:32,898 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
|
| 91 |
+
2024-08-04 22:13:32,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 92 |
+
2024-08-04 22:13:32,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 93 |
+
2024-08-04 22:13:32,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 94 |
+
2024-08-04 22:13:33,205 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 95 |
+
2024-08-04 22:13:38,206 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 96 |
+
2024-08-04 22:13:43,207 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 97 |
+
2024-08-04 22:13:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 98 |
+
2024-08-04 22:13:47,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 99 |
+
2024-08-04 22:13:47,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 100 |
+
2024-08-04 22:13:49,120 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 101 |
+
2024-08-04 22:13:54,121 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 102 |
+
2024-08-04 22:13:59,122 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 103 |
+
2024-08-04 22:14:02,898 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
|
| 104 |
+
2024-08-04 22:14:02,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 105 |
+
2024-08-04 22:14:02,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 106 |
+
2024-08-04 22:14:02,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 107 |
+
2024-08-04 22:14:04,197 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 108 |
+
2024-08-04 22:14:04,864 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 109 |
+
2024-08-04 22:14:09,198 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 110 |
+
2024-08-04 22:14:13,453 DEBUG SenderThread:12237 [sender.py:send():382] send: config
|
| 111 |
+
2024-08-04 22:14:13,453 DEBUG SenderThread:12237 [sender.py:send():382] send: config
|
| 112 |
+
2024-08-04 22:14:13,869 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 113 |
+
2024-08-04 22:14:14,550 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 114 |
+
2024-08-04 22:14:15,870 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 115 |
+
2024-08-04 22:14:17,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
|
| 116 |
+
2024-08-04 22:14:17,934 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 117 |
+
2024-08-04 22:14:17,934 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
|
| 118 |
+
2024-08-04 22:14:19,437 DEBUG SenderThread:12237 [sender.py:send():382] send: exit
|
| 119 |
+
2024-08-04 22:14:19,437 INFO SenderThread:12237 [sender.py:send_exit():589] handling exit code: 1
|
| 120 |
+
2024-08-04 22:14:19,437 INFO SenderThread:12237 [sender.py:send_exit():591] handling runtime: 166
|
| 121 |
+
2024-08-04 22:14:19,438 INFO SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 122 |
+
2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:send_exit():597] send defer
|
| 123 |
+
2024-08-04 22:14:19,439 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 124 |
+
2024-08-04 22:14:19,439 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 0
|
| 125 |
+
2024-08-04 22:14:19,439 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 126 |
+
2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 0
|
| 127 |
+
2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 1
|
| 128 |
+
2024-08-04 22:14:19,439 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 129 |
+
2024-08-04 22:14:19,439 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 1
|
| 130 |
+
2024-08-04 22:14:19,439 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 131 |
+
2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 1
|
| 132 |
+
2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 2
|
| 133 |
+
2024-08-04 22:14:19,439 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 134 |
+
2024-08-04 22:14:19,440 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 2
|
| 135 |
+
2024-08-04 22:14:19,440 INFO HandlerThread:12237 [system_monitor.py:finish():203] Stopping system monitor
|
| 136 |
+
2024-08-04 22:14:19,440 DEBUG SystemMonitor:12237 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
| 137 |
+
2024-08-04 22:14:19,440 DEBUG SystemMonitor:12237 [system_monitor.py:_start():183] Publishing last batch of metrics
|
| 138 |
+
2024-08-04 22:14:19,440 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined cpu monitor
|
| 139 |
+
2024-08-04 22:14:19,441 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined disk monitor
|
| 140 |
+
2024-08-04 22:14:19,474 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined gpu monitor
|
| 141 |
+
2024-08-04 22:14:19,474 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined memory monitor
|
| 142 |
+
2024-08-04 22:14:19,474 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined network monitor
|
| 143 |
+
2024-08-04 22:14:19,475 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 144 |
+
2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 2
|
| 145 |
+
2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 3
|
| 146 |
+
2024-08-04 22:14:19,475 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
|
| 147 |
+
2024-08-04 22:14:19,475 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 148 |
+
2024-08-04 22:14:19,475 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 3
|
| 149 |
+
2024-08-04 22:14:19,475 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 150 |
+
2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 3
|
| 151 |
+
2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 4
|
| 152 |
+
2024-08-04 22:14:19,475 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 153 |
+
2024-08-04 22:14:19,475 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 4
|
| 154 |
+
2024-08-04 22:14:19,476 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 155 |
+
2024-08-04 22:14:19,476 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 4
|
| 156 |
+
2024-08-04 22:14:19,476 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 5
|
| 157 |
+
2024-08-04 22:14:19,476 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 158 |
+
2024-08-04 22:14:19,476 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 5
|
| 159 |
+
2024-08-04 22:14:19,476 DEBUG SenderThread:12237 [sender.py:send():382] send: summary
|
| 160 |
+
2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 161 |
+
2024-08-04 22:14:19,477 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 162 |
+
2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 5
|
| 163 |
+
2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 6
|
| 164 |
+
2024-08-04 22:14:19,477 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 165 |
+
2024-08-04 22:14:19,477 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 6
|
| 166 |
+
2024-08-04 22:14:19,477 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 167 |
+
2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 6
|
| 168 |
+
2024-08-04 22:14:19,480 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 169 |
+
2024-08-04 22:14:19,712 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 7
|
| 170 |
+
2024-08-04 22:14:19,712 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 171 |
+
2024-08-04 22:14:19,712 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 7
|
| 172 |
+
2024-08-04 22:14:19,712 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 173 |
+
2024-08-04 22:14:19,712 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 7
|
| 174 |
+
2024-08-04 22:14:19,873 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
|
| 175 |
+
2024-08-04 22:14:19,874 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
|
| 176 |
+
2024-08-04 22:14:20,437 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
|
| 177 |
+
2024-08-04 22:14:20,874 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 178 |
+
2024-08-04 22:14:21,905 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 8
|
| 179 |
+
2024-08-04 22:14:21,905 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
|
| 180 |
+
2024-08-04 22:14:21,905 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 181 |
+
2024-08-04 22:14:21,906 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 8
|
| 182 |
+
2024-08-04 22:14:21,906 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 183 |
+
2024-08-04 22:14:21,906 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 8
|
| 184 |
+
2024-08-04 22:14:21,906 INFO SenderThread:12237 [job_builder.py:build():296] Attempting to build job artifact
|
| 185 |
+
2024-08-04 22:14:21,907 INFO SenderThread:12237 [job_builder.py:_get_source_type():426] is repo sourced job
|
| 186 |
+
2024-08-04 22:14:21,921 INFO SenderThread:12237 [job_builder.py:build():402] adding wandb-job metadata file
|
| 187 |
+
2024-08-04 22:14:21,929 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 9
|
| 188 |
+
2024-08-04 22:14:21,929 DEBUG SenderThread:12237 [sender.py:send():382] send: artifact
|
| 189 |
+
2024-08-04 22:14:21,929 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 190 |
+
2024-08-04 22:14:21,931 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 9
|
| 191 |
+
2024-08-04 22:14:22,437 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
|
| 192 |
+
2024-08-04 22:14:22,875 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 193 |
+
2024-08-04 22:14:23,127 INFO wandb-upload_0:12237 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpaydno9il
|
| 194 |
+
2024-08-04 22:14:23,543 INFO wandb-upload_1:12237 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpaetcwljm
|
| 195 |
+
2024-08-04 22:14:24,702 INFO SenderThread:12237 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5ODUzNDkwNw==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
|
| 196 |
+
2024-08-04 22:14:24,702 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 197 |
+
2024-08-04 22:14:24,702 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 9
|
| 198 |
+
2024-08-04 22:14:24,702 INFO SenderThread:12237 [dir_watcher.py:finish():358] shutting down directory watcher
|
| 199 |
+
2024-08-04 22:14:24,876 INFO SenderThread:12237 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_221132-o8ieoj9i/files
|
| 200 |
+
2024-08-04 22:14:24,876 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt requirements.txt
|
| 201 |
+
2024-08-04 22:14:24,876 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml config.yaml
|
| 202 |
+
2024-08-04 22:14:24,878 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json wandb-metadata.json
|
| 203 |
+
2024-08-04 22:14:24,878 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json wandb-summary.json
|
| 204 |
+
2024-08-04 22:14:24,879 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log output.log
|
| 205 |
+
2024-08-04 22:14:24,881 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 10
|
| 206 |
+
2024-08-04 22:14:24,881 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
|
| 207 |
+
2024-08-04 22:14:24,881 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 208 |
+
2024-08-04 22:14:24,882 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 10
|
| 209 |
+
2024-08-04 22:14:24,882 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
|
| 210 |
+
2024-08-04 22:14:24,883 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 211 |
+
2024-08-04 22:14:24,883 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 10
|
| 212 |
+
2024-08-04 22:14:24,883 INFO SenderThread:12237 [file_pusher.py:finish():172] shutting down file pusher
|
| 213 |
+
2024-08-04 22:14:25,282 INFO wandb-upload_0:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
|
| 214 |
+
2024-08-04 22:14:25,375 INFO wandb-upload_1:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
|
| 215 |
+
2024-08-04 22:14:25,438 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
|
| 216 |
+
2024-08-04 22:14:25,438 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
|
| 217 |
+
2024-08-04 22:14:25,461 INFO wandb-upload_2:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
|
| 218 |
+
2024-08-04 22:14:25,480 INFO wandb-upload_3:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
|
| 219 |
+
2024-08-04 22:14:25,680 INFO Thread-11 (_thread_body):12237 [sender.py:transition_state():617] send defer: 11
|
| 220 |
+
2024-08-04 22:14:25,681 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 221 |
+
2024-08-04 22:14:25,681 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 11
|
| 222 |
+
2024-08-04 22:14:25,681 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 223 |
+
2024-08-04 22:14:25,681 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 11
|
| 224 |
+
2024-08-04 22:14:25,681 INFO SenderThread:12237 [file_pusher.py:join():178] waiting for file pusher
|
| 225 |
+
2024-08-04 22:14:25,681 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 12
|
| 226 |
+
2024-08-04 22:14:25,681 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 227 |
+
2024-08-04 22:14:25,681 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 12
|
| 228 |
+
2024-08-04 22:14:25,681 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 229 |
+
2024-08-04 22:14:25,681 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 12
|
| 230 |
+
2024-08-04 22:14:25,681 INFO SenderThread:12237 [file_stream.py:finish():595] file stream finish called
|
| 231 |
+
2024-08-04 22:14:25,848 INFO SenderThread:12237 [file_stream.py:finish():599] file stream finish is done
|
| 232 |
+
2024-08-04 22:14:25,848 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 13
|
| 233 |
+
2024-08-04 22:14:25,849 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 234 |
+
2024-08-04 22:14:25,849 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 13
|
| 235 |
+
2024-08-04 22:14:25,849 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 236 |
+
2024-08-04 22:14:25,849 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 13
|
| 237 |
+
2024-08-04 22:14:25,849 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 14
|
| 238 |
+
2024-08-04 22:14:25,849 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
|
| 239 |
+
2024-08-04 22:14:25,849 DEBUG SenderThread:12237 [sender.py:send():382] send: final
|
| 240 |
+
2024-08-04 22:14:25,849 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 14
|
| 241 |
+
2024-08-04 22:14:25,849 DEBUG SenderThread:12237 [sender.py:send():382] send: footer
|
| 242 |
+
2024-08-04 22:14:25,850 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
|
| 243 |
+
2024-08-04 22:14:25,850 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 14
|
| 244 |
+
2024-08-04 22:14:25,850 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
|
| 245 |
+
2024-08-04 22:14:25,850 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
|
| 246 |
+
2024-08-04 22:14:25,850 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
|
| 247 |
+
2024-08-04 22:14:25,851 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
|
| 248 |
+
2024-08-04 22:14:25,851 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: server_info
|
| 249 |
+
2024-08-04 22:14:25,851 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: get_summary
|
| 250 |
+
2024-08-04 22:14:25,851 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: sampled_history
|
| 251 |
+
2024-08-04 22:14:25,852 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: server_info
|
| 252 |
+
2024-08-04 22:14:25,852 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
|
| 253 |
+
2024-08-04 22:14:25,853 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: job_info
|
| 254 |
+
2024-08-04 22:14:26,030 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: job_info
|
| 255 |
+
2024-08-04 22:14:26,030 INFO MainThread:12237 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
| 256 |
+
2024-08-04 22:14:26,030 INFO MainThread:12237 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
| 257 |
+
2024-08-04 22:14:26,030 INFO MainThread:12237 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
| 258 |
+
2024-08-04 22:14:26,031 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: shutdown
|
| 259 |
+
2024-08-04 22:14:26,031 INFO HandlerThread:12237 [handler.py:finish():869] shutting down handler
|
| 260 |
+
2024-08-04 22:14:26,853 INFO WriterThread:12237 [datastore.py:close():296] close: /project/wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
|
| 261 |
+
2024-08-04 22:14:27,030 INFO SenderThread:12237 [sender.py:finish():1572] shutting down sender
|
| 262 |
+
2024-08-04 22:14:27,030 INFO SenderThread:12237 [file_pusher.py:finish():172] shutting down file pusher
|
| 263 |
+
2024-08-04 22:14:27,030 INFO SenderThread:12237 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240804_221132-o8ieoj9i/logs/debug.log
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
| 2 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Configure stats pid to 12166
|
| 3 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
| 4 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
| 5 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
| 6 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
| 7 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
| 8 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_221132-o8ieoj9i/logs/debug.log
|
| 9 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log
|
| 10 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:init():566] calling init triggers
|
| 11 |
+
2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
| 12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-04-22:11:21', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 2, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 160}
|
| 13 |
+
2024-08-04 22:11:32,260 INFO MainThread:12166 [wandb_init.py:init():616] starting backend
|
| 14 |
+
2024-08-04 22:11:32,260 INFO MainThread:12166 [wandb_init.py:init():620] setting up manager
|
| 15 |
+
2024-08-04 22:11:32,264 INFO MainThread:12166 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 16 |
+
2024-08-04 22:11:32,265 INFO MainThread:12166 [wandb_init.py:init():628] backend started and connected
|
| 17 |
+
2024-08-04 22:11:32,270 INFO MainThread:12166 [wandb_init.py:init():720] updated telemetry
|
| 18 |
+
2024-08-04 22:11:32,281 INFO MainThread:12166 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
| 19 |
+
2024-08-04 22:11:32,783 INFO MainThread:12166 [wandb_run.py:_on_init():2262] communicating current version
|
| 20 |
+
2024-08-04 22:11:32,877 INFO MainThread:12166 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
| 21 |
+
|
| 22 |
+
2024-08-04 22:11:32,877 INFO MainThread:12166 [wandb_init.py:init():804] starting run threads in backend
|
| 23 |
+
2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_console_start():2241] atexit reg
|
| 24 |
+
2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
| 25 |
+
2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
| 26 |
+
2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_redirect():2186] Redirects installed.
|
| 27 |
+
2024-08-04 22:11:32,933 INFO MainThread:12166 [wandb_init.py:init():847] run started, returning control to user process
|
| 28 |
+
2024-08-04 22:14:13,452 INFO MainThread:12166 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26, 'model_architecture': 'Gemma2ForCausalLM'}
|
| 29 |
+
2024-08-04 22:14:13,453 INFO MainThread:12166 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
| 30 |
+
2024-08-04 22:14:27,031 WARNING MsgRouterThr:12166 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
ADDED
|
Binary file (27.3 kB). View file
|
|
|
wandb/run-20240812_052853-n84i0o06/files/config.yaml
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
sharding_strategy:
|
| 4 |
+
desc: null
|
| 5 |
+
value: FULL_SHARD
|
| 6 |
+
checkpoint_type:
|
| 7 |
+
desc: null
|
| 8 |
+
value: LOCAL_STATE_DICT
|
| 9 |
+
fsdp_activation_checkpointing:
|
| 10 |
+
desc: null
|
| 11 |
+
value: true
|
| 12 |
+
fsdp_cpu_offload:
|
| 13 |
+
desc: null
|
| 14 |
+
value: false
|
| 15 |
+
low_cpu_fsdp:
|
| 16 |
+
desc: null
|
| 17 |
+
value: false
|
| 18 |
+
no_meta_device:
|
| 19 |
+
desc: null
|
| 20 |
+
value: false
|
| 21 |
+
data_path:
|
| 22 |
+
desc: null
|
| 23 |
+
value: null
|
| 24 |
+
split:
|
| 25 |
+
desc: null
|
| 26 |
+
value: 969, 30, 1
|
| 27 |
+
train_data_path:
|
| 28 |
+
desc: null
|
| 29 |
+
value:
|
| 30 |
+
- '304771887'
|
| 31 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
| 32 |
+
valid_data_path:
|
| 33 |
+
desc: null
|
| 34 |
+
value:
|
| 35 |
+
- '304771887'
|
| 36 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
| 37 |
+
test_data_path:
|
| 38 |
+
desc: null
|
| 39 |
+
value:
|
| 40 |
+
- '304771887'
|
| 41 |
+
- /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
|
| 42 |
+
data_cache_path:
|
| 43 |
+
desc: null
|
| 44 |
+
value: null
|
| 45 |
+
vocab_size:
|
| 46 |
+
desc: null
|
| 47 |
+
value: null
|
| 48 |
+
vocab_file:
|
| 49 |
+
desc: null
|
| 50 |
+
value: null
|
| 51 |
+
merge_file:
|
| 52 |
+
desc: null
|
| 53 |
+
value: null
|
| 54 |
+
seq_length:
|
| 55 |
+
desc: null
|
| 56 |
+
value: 4096
|
| 57 |
+
num_workers:
|
| 58 |
+
desc: null
|
| 59 |
+
value: 2
|
| 60 |
+
tokenizer_type:
|
| 61 |
+
desc: null
|
| 62 |
+
value: HFPreTrainedTokenizer
|
| 63 |
+
tokenizer_model:
|
| 64 |
+
desc: null
|
| 65 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
| 66 |
+
reset_position_ids:
|
| 67 |
+
desc: null
|
| 68 |
+
value: false
|
| 69 |
+
reset_attention_mask:
|
| 70 |
+
desc: null
|
| 71 |
+
value: false
|
| 72 |
+
eod_mask_loss:
|
| 73 |
+
desc: null
|
| 74 |
+
value: false
|
| 75 |
+
retro_return_doc_ids:
|
| 76 |
+
desc: null
|
| 77 |
+
value: false
|
| 78 |
+
short_seq_prob:
|
| 79 |
+
desc: null
|
| 80 |
+
value: 0.1
|
| 81 |
+
vocab_extra_ids:
|
| 82 |
+
desc: null
|
| 83 |
+
value: 0
|
| 84 |
+
seed:
|
| 85 |
+
desc: null
|
| 86 |
+
value: 1234
|
| 87 |
+
use_mpi:
|
| 88 |
+
desc: null
|
| 89 |
+
value: false
|
| 90 |
+
wandb_entity:
|
| 91 |
+
desc: null
|
| 92 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
| 93 |
+
wandb_name:
|
| 94 |
+
desc: null
|
| 95 |
+
value: yans-qwen2-0.5B_train_2024-08-12-05:28:42
|
| 96 |
+
wandb_project:
|
| 97 |
+
desc: null
|
| 98 |
+
value: llm_tutorial
|
| 99 |
+
quantization:
|
| 100 |
+
desc: null
|
| 101 |
+
value: false
|
| 102 |
+
use_freeze_layers:
|
| 103 |
+
desc: null
|
| 104 |
+
value: false
|
| 105 |
+
freeze_layers:
|
| 106 |
+
desc: null
|
| 107 |
+
value: null
|
| 108 |
+
bf16:
|
| 109 |
+
desc: null
|
| 110 |
+
value: true
|
| 111 |
+
fp16:
|
| 112 |
+
desc: null
|
| 113 |
+
value: false
|
| 114 |
+
mixed_precision:
|
| 115 |
+
desc: null
|
| 116 |
+
value: true
|
| 117 |
+
param_dtype:
|
| 118 |
+
desc: null
|
| 119 |
+
value: null
|
| 120 |
+
load:
|
| 121 |
+
desc: null
|
| 122 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
| 123 |
+
save:
|
| 124 |
+
desc: null
|
| 125 |
+
value: /work/llm_recipes/models/yans-qwen2-0.5B
|
| 126 |
+
base_model:
|
| 127 |
+
desc: null
|
| 128 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
| 129 |
+
use_better_transformer:
|
| 130 |
+
desc: null
|
| 131 |
+
value: false
|
| 132 |
+
grad_clip_norm:
|
| 133 |
+
desc: null
|
| 134 |
+
value: 1.0
|
| 135 |
+
eval_interval:
|
| 136 |
+
desc: null
|
| 137 |
+
value: 200
|
| 138 |
+
save_interval:
|
| 139 |
+
desc: null
|
| 140 |
+
value: 5
|
| 141 |
+
eval_iters:
|
| 142 |
+
desc: null
|
| 143 |
+
value: 10
|
| 144 |
+
optimizer:
|
| 145 |
+
desc: null
|
| 146 |
+
value: adam
|
| 147 |
+
lr:
|
| 148 |
+
desc: null
|
| 149 |
+
value: 2.0e-05
|
| 150 |
+
lr_decay_style:
|
| 151 |
+
desc: null
|
| 152 |
+
value: cosine
|
| 153 |
+
lr_decay_iters:
|
| 154 |
+
desc: null
|
| 155 |
+
value: 20000
|
| 156 |
+
lr_warmup_iters:
|
| 157 |
+
desc: null
|
| 158 |
+
value: 500
|
| 159 |
+
min_lr:
|
| 160 |
+
desc: null
|
| 161 |
+
value: 1.0e-06
|
| 162 |
+
train_iters:
|
| 163 |
+
desc: null
|
| 164 |
+
value: 20000
|
| 165 |
+
train_samples:
|
| 166 |
+
desc: null
|
| 167 |
+
value: null
|
| 168 |
+
global_batch_size:
|
| 169 |
+
desc: null
|
| 170 |
+
value: 320
|
| 171 |
+
micro_batch_size:
|
| 172 |
+
desc: null
|
| 173 |
+
value: 1
|
| 174 |
+
make_vocab_size_divisible_by:
|
| 175 |
+
desc: null
|
| 176 |
+
value: 128
|
| 177 |
+
sliding_window_size:
|
| 178 |
+
desc: null
|
| 179 |
+
value: 4096
|
| 180 |
+
skip_batch:
|
| 181 |
+
desc: null
|
| 182 |
+
value: null
|
| 183 |
+
no_save_optimizer_state:
|
| 184 |
+
desc: null
|
| 185 |
+
value: false
|
| 186 |
+
continual_pretraining:
|
| 187 |
+
desc: null
|
| 188 |
+
value: false
|
| 189 |
+
instruction_tuning:
|
| 190 |
+
desc: null
|
| 191 |
+
value: false
|
| 192 |
+
direct_preference_optimization:
|
| 193 |
+
desc: null
|
| 194 |
+
value: false
|
| 195 |
+
attention_dropout:
|
| 196 |
+
desc: null
|
| 197 |
+
value: 0.1
|
| 198 |
+
hidden_dropout:
|
| 199 |
+
desc: null
|
| 200 |
+
value: 0.1
|
| 201 |
+
weight_decay:
|
| 202 |
+
desc: null
|
| 203 |
+
value: 0.1
|
| 204 |
+
adam_beta1:
|
| 205 |
+
desc: null
|
| 206 |
+
value: 0.9
|
| 207 |
+
adam_beta2:
|
| 208 |
+
desc: null
|
| 209 |
+
value: 0.95
|
| 210 |
+
adam_eps:
|
| 211 |
+
desc: null
|
| 212 |
+
value: 1.0e-06
|
| 213 |
+
hf_transformer_model_dir:
|
| 214 |
+
desc: null
|
| 215 |
+
value: null
|
| 216 |
+
instruction_train_data_path:
|
| 217 |
+
desc: null
|
| 218 |
+
value: null
|
| 219 |
+
instruction_valid_data_path:
|
| 220 |
+
desc: null
|
| 221 |
+
value: null
|
| 222 |
+
epoch:
|
| 223 |
+
desc: null
|
| 224 |
+
value: null
|
| 225 |
+
instruction_dataset_size:
|
| 226 |
+
desc: null
|
| 227 |
+
value: null
|
| 228 |
+
save_sampler_state:
|
| 229 |
+
desc: null
|
| 230 |
+
value: false
|
| 231 |
+
label_smoothing:
|
| 232 |
+
desc: null
|
| 233 |
+
value: 0.0
|
| 234 |
+
save_n_checkpoints:
|
| 235 |
+
desc: null
|
| 236 |
+
value: 10
|
| 237 |
+
hf_repo_id:
|
| 238 |
+
desc: null
|
| 239 |
+
value: koichi12/yans-qwen2-0.5B
|
| 240 |
+
create_public_hf_repo:
|
| 241 |
+
desc: null
|
| 242 |
+
value: false
|
| 243 |
+
upload_all_checkpoints_to_hf:
|
| 244 |
+
desc: null
|
| 245 |
+
value: false
|
| 246 |
+
hf_upload_retry_limit:
|
| 247 |
+
desc: null
|
| 248 |
+
value: 2
|
| 249 |
+
exit_duration_in_mins:
|
| 250 |
+
desc: null
|
| 251 |
+
value: null
|
| 252 |
+
source_key:
|
| 253 |
+
desc: null
|
| 254 |
+
value: null
|
| 255 |
+
target_key:
|
| 256 |
+
desc: null
|
| 257 |
+
value: null
|
| 258 |
+
attn_implementation:
|
| 259 |
+
desc: null
|
| 260 |
+
value: flash_attention_2
|
| 261 |
+
efficient_instruction_tuning:
|
| 262 |
+
desc: null
|
| 263 |
+
value: false
|
| 264 |
+
remove_padding_masking:
|
| 265 |
+
desc: null
|
| 266 |
+
value: false
|
| 267 |
+
save_start_iter:
|
| 268 |
+
desc: null
|
| 269 |
+
value: null
|
| 270 |
+
rank:
|
| 271 |
+
desc: null
|
| 272 |
+
value: 0
|
| 273 |
+
world_size:
|
| 274 |
+
desc: null
|
| 275 |
+
value: 1
|
| 276 |
+
padded_vocab_size:
|
| 277 |
+
desc: null
|
| 278 |
+
value: 151680
|
| 279 |
+
gradient_accumulation_steps:
|
| 280 |
+
desc: null
|
| 281 |
+
value: 320
|
| 282 |
+
_wandb:
|
| 283 |
+
desc: null
|
| 284 |
+
value:
|
| 285 |
+
python_version: 3.10.12
|
| 286 |
+
cli_version: 0.16.3
|
| 287 |
+
framework: huggingface
|
| 288 |
+
huggingface_version: 4.43.3
|
| 289 |
+
is_jupyter_run: false
|
| 290 |
+
is_kaggle_kernel: false
|
| 291 |
+
start_time: 1723408133.524123
|
| 292 |
+
t:
|
| 293 |
+
1:
|
| 294 |
+
- 1
|
| 295 |
+
- 11
|
| 296 |
+
- 49
|
| 297 |
+
- 55
|
| 298 |
+
- 71
|
| 299 |
+
2:
|
| 300 |
+
- 1
|
| 301 |
+
- 11
|
| 302 |
+
- 49
|
| 303 |
+
- 55
|
| 304 |
+
- 71
|
| 305 |
+
3:
|
| 306 |
+
- 13
|
| 307 |
+
- 16
|
| 308 |
+
- 23
|
| 309 |
+
4: 3.10.12
|
| 310 |
+
5: 0.16.3
|
| 311 |
+
6: 4.43.3
|
| 312 |
+
8:
|
| 313 |
+
- 5
|
| 314 |
+
13: linux-x86_64
|
| 315 |
+
model_architecture:
|
| 316 |
+
desc: null
|
| 317 |
+
value: Qwen2ForCausalLM
|
| 318 |
+
activation_function:
|
| 319 |
+
desc: null
|
| 320 |
+
value: silu
|
| 321 |
+
hidden_size:
|
| 322 |
+
desc: null
|
| 323 |
+
value: 896
|
| 324 |
+
model_type:
|
| 325 |
+
desc: null
|
| 326 |
+
value: qwen2
|
| 327 |
+
max_position_embeddings:
|
| 328 |
+
desc: null
|
| 329 |
+
value: 4096
|
| 330 |
+
num_attention_heads:
|
| 331 |
+
desc: null
|
| 332 |
+
value: 14
|
| 333 |
+
num_hidden_layers:
|
| 334 |
+
desc: null
|
| 335 |
+
value: 24
|
wandb/run-20240812_052853-n84i0o06/files/output.log
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 8 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 9 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 10 |
+
No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
|
| 11 |
+
--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
|
| 12 |
+
--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
|
| 13 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
| 14 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
| 15 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
| 16 |
+
warnings.warn(
|
| 17 |
+
--> applying fsdp activation checkpointing...
|
| 18 |
+
> datasets target sizes (minimum size):
|
| 19 |
+
train: 6400000
|
| 20 |
+
validation: 323200
|
| 21 |
+
test: 3200
|
| 22 |
+
> building train, validation, and test datasets for GPT ...
|
| 23 |
+
Let split = None
|
| 24 |
+
> finished creating GPT datasets ...
|
| 25 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 26 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 27 |
+
No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
|
| 28 |
+
File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 29 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
|
| 30 |
+
model info: FullyShardedDataParallel(
|
| 31 |
+
(_fsdp_wrapped_module): Qwen2ForCausalLM(
|
| 32 |
+
(model): Qwen2Model(
|
| 33 |
+
(embed_tokens): Embedding(151936, 896)
|
| 34 |
+
(layers): ModuleList(
|
| 35 |
+
(0-23): 24 x FullyShardedDataParallel(
|
| 36 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
| 37 |
+
(_checkpoint_wrapped_module): Qwen2DecoderLayer(
|
| 38 |
+
(self_attn): Qwen2FlashAttention2(
|
| 39 |
+
(q_proj): Linear(in_features=896, out_features=896, bias=True)
|
| 40 |
+
(k_proj): Linear(in_features=896, out_features=128, bias=True)
|
| 41 |
+
(v_proj): Linear(in_features=896, out_features=128, bias=True)
|
| 42 |
+
(o_proj): Linear(in_features=896, out_features=896, bias=False)
|
| 43 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
| 44 |
+
)
|
| 45 |
+
(mlp): Qwen2MLP(
|
| 46 |
+
(gate_proj): Linear(in_features=896, out_features=4864, bias=False)
|
| 47 |
+
(up_proj): Linear(in_features=896, out_features=4864, bias=False)
|
| 48 |
+
(down_proj): Linear(in_features=4864, out_features=896, bias=False)
|
| 49 |
+
(act_fn): SiLU()
|
| 50 |
+
)
|
| 51 |
+
(input_layernorm): Qwen2RMSNorm()
|
| 52 |
+
(post_attention_layernorm): Qwen2RMSNorm()
|
| 53 |
+
)
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
(norm): Qwen2RMSNorm()
|
| 58 |
+
)
|
| 59 |
+
(lm_head): Linear(in_features=896, out_features=151936, bias=False)
|
| 60 |
+
)
|
| 61 |
+
)
|
| 62 |
+
model config: Qwen2Config {
|
| 63 |
+
"_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
| 64 |
+
"architectures": [
|
| 65 |
+
"Qwen2ForCausalLM"
|
| 66 |
+
],
|
| 67 |
+
"attention_dropout": 0.0,
|
| 68 |
+
"bos_token_id": 151643,
|
| 69 |
+
"eos_token_id": 151643,
|
| 70 |
+
"hidden_act": "silu",
|
| 71 |
+
"hidden_size": 896,
|
| 72 |
+
"initializer_range": 0.02,
|
| 73 |
+
"intermediate_size": 4864,
|
| 74 |
+
"label_smoothing": 0.0,
|
| 75 |
+
"max_position_embeddings": 4096,
|
| 76 |
+
"max_window_layers": 24,
|
| 77 |
+
"model_type": "qwen2",
|
| 78 |
+
"num_attention_heads": 14,
|
| 79 |
+
"num_hidden_layers": 24,
|
| 80 |
+
"num_key_value_heads": 2,
|
| 81 |
+
"rms_norm_eps": 1e-06,
|
| 82 |
+
"rope_theta": 1000000.0,
|
| 83 |
+
"sliding_window": null,
|
| 84 |
+
"tie_word_embeddings": true,
|
| 85 |
+
"torch_dtype": "bfloat16",
|
| 86 |
+
"transformers_version": "4.43.3",
|
| 87 |
+
"use_cache": false,
|
| 88 |
+
"use_sliding_window": false,
|
| 89 |
+
"vocab_size": 151936
|
| 90 |
+
}
|
| 91 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 92 |
+
Unable to save the indexes because path_to_cache is None
|
| 93 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 94 |
+
Unable to save the indexes because path_to_cache is None
|
| 95 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 96 |
+
Unable to save the indexes because path_to_cache is None
|
| 97 |
+
------------------------------------------------------------------
|
| 98 |
+
iteration: 1 , TFLOPS: 67.05501421617748, Tokens per sec: 16676.24515769431, Loss: 4.1814446449279785
|
| 99 |
+
------------------------------------------------------------------
|
| 100 |
+
------------------------------------------------------------------
|
| 101 |
+
iteration: 2 , TFLOPS: 70.71126656778048, Tokens per sec: 17585.5367488818, Loss: 4.19144344329834
|
| 102 |
+
------------------------------------------------------------------
|
| 103 |
+
------------------------------------------------------------------
|
| 104 |
+
iteration: 3 , TFLOPS: 70.545913767934, Tokens per sec: 17544.41433827636, Loss: 4.197675704956055
|
| 105 |
+
------------------------------------------------------------------
|
| 106 |
+
------------------------------------------------------------------
|
| 107 |
+
iteration: 4 , TFLOPS: 70.68479486678217, Tokens per sec: 17578.953369834773, Loss: 4.183629989624023
|
| 108 |
+
------------------------------------------------------------------
|
| 109 |
+
------------------------------------------------------------------
|
| 110 |
+
iteration: 5 , TFLOPS: 70.61673302016509, Tokens per sec: 17562.0267305172, Loss: 4.198177337646484
|
| 111 |
+
------------------------------------------------------------------
|
| 112 |
+
Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005
|
| 113 |
+
Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
|
| 114 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
| 115 |
+
warnings.warn(
|
| 116 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
|
| 117 |
+
warnings.warn(
|
| 118 |
+
Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
|
| 119 |
+
Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
|
| 120 |
+
[rank0]:[2024-08-12 05:35:23,399] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.00647389400000975, 'preprocessing_with_comm': 0.0007460029999037943, 'state_converting': 0.9694889820000299, <Type.ALL: 'all'>: 0.9780955020000874})
|
| 121 |
+
Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
|
| 122 |
+
Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
|
| 123 |
+
Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
|
| 124 |
+
Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
|
| 125 |
+
Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
|
| 126 |
+
Traceback (most recent call last):
|
| 127 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 128 |
+
main()
|
| 129 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
| 130 |
+
train(
|
| 131 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 175, in train
|
| 132 |
+
save_checkpoint(
|
| 133 |
+
File "/project/src/llama_recipes/utils/checkpoint.py", line 168, in save_checkpoint
|
| 134 |
+
tokenizer.tokenizer.save_pretrained(tokenizer_path)
|
| 135 |
+
File "/project/lib/transformers/src/transformers/tokenization_utils_base.py", line 2622, in save_pretrained
|
| 136 |
+
if os.path.isfile(save_directory):
|
| 137 |
+
File "/usr/lib/python3.10/genericpath.py", line 30, in isfile
|
| 138 |
+
st = os.stat(path)
|
| 139 |
+
TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType
|
wandb/run-20240812_052853-n84i0o06/files/requirements.txt
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
accelerate==0.33.0
|
| 3 |
+
aiohttp==3.9.1
|
| 4 |
+
aiosignal==1.3.1
|
| 5 |
+
annotated-types==0.6.0
|
| 6 |
+
apex==0.1
|
| 7 |
+
appdirs==1.4.4
|
| 8 |
+
argon2-cffi-bindings==21.2.0
|
| 9 |
+
argon2-cffi==23.1.0
|
| 10 |
+
asttokens==2.4.1
|
| 11 |
+
astunparse==1.6.3
|
| 12 |
+
async-timeout==4.0.3
|
| 13 |
+
attrs==23.2.0
|
| 14 |
+
audioread==3.0.1
|
| 15 |
+
beautifulsoup4==4.12.3
|
| 16 |
+
bleach==6.1.0
|
| 17 |
+
blis==0.7.11
|
| 18 |
+
cachetools==5.3.2
|
| 19 |
+
catalogue==2.0.10
|
| 20 |
+
certifi==2024.2.2
|
| 21 |
+
cffi==1.16.0
|
| 22 |
+
charset-normalizer==3.3.2
|
| 23 |
+
click==8.1.7
|
| 24 |
+
cloudpathlib==0.16.0
|
| 25 |
+
cloudpickle==3.0.0
|
| 26 |
+
cmake==3.28.1
|
| 27 |
+
colorama==0.4.6
|
| 28 |
+
comm==0.2.1
|
| 29 |
+
confection==0.1.4
|
| 30 |
+
contourpy==1.2.0
|
| 31 |
+
cubinlinker==0.3.0+2.g405ac64
|
| 32 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
| 33 |
+
cudf==23.12.0
|
| 34 |
+
cugraph-dgl==23.12.0
|
| 35 |
+
cugraph-service-client==23.12.0
|
| 36 |
+
cugraph-service-server==23.12.0
|
| 37 |
+
cugraph==23.12.0
|
| 38 |
+
cuml==23.12.0
|
| 39 |
+
cupy-cuda12x==12.3.0
|
| 40 |
+
cycler==0.12.1
|
| 41 |
+
cymem==2.0.8
|
| 42 |
+
cython==3.0.8
|
| 43 |
+
dask-cuda==23.12.0
|
| 44 |
+
dask-cudf==23.12.0
|
| 45 |
+
dask==2023.11.0
|
| 46 |
+
debugpy==1.8.1
|
| 47 |
+
decorator==5.1.1
|
| 48 |
+
defusedxml==0.7.1
|
| 49 |
+
distributed==2023.11.0
|
| 50 |
+
dm-tree==0.1.8
|
| 51 |
+
docker-pycreds==0.4.0
|
| 52 |
+
einops==0.7.0
|
| 53 |
+
exceptiongroup==1.2.0
|
| 54 |
+
execnet==2.0.2
|
| 55 |
+
executing==2.0.1
|
| 56 |
+
expecttest==0.1.3
|
| 57 |
+
fastjsonschema==2.19.1
|
| 58 |
+
fastrlock==0.8.2
|
| 59 |
+
filelock==3.13.1
|
| 60 |
+
flash-attn==2.4.2
|
| 61 |
+
fonttools==4.48.1
|
| 62 |
+
frozenlist==1.4.1
|
| 63 |
+
fsspec==2023.12.2
|
| 64 |
+
gast==0.5.4
|
| 65 |
+
gitdb==4.0.11
|
| 66 |
+
gitpython==3.1.43
|
| 67 |
+
google-auth-oauthlib==0.4.6
|
| 68 |
+
google-auth==2.27.0
|
| 69 |
+
graphsurgeon==0.4.6
|
| 70 |
+
grpcio==1.60.1
|
| 71 |
+
huggingface-hub==0.24.5
|
| 72 |
+
hypothesis==5.35.1
|
| 73 |
+
idna==3.6
|
| 74 |
+
importlib-metadata==7.0.1
|
| 75 |
+
iniconfig==2.0.0
|
| 76 |
+
intel-openmp==2021.4.0
|
| 77 |
+
ipadic==1.0.0
|
| 78 |
+
ipykernel==6.29.2
|
| 79 |
+
ipython-genutils==0.2.0
|
| 80 |
+
ipython==8.21.0
|
| 81 |
+
jedi==0.19.1
|
| 82 |
+
jinja2==3.1.3
|
| 83 |
+
joblib==1.3.2
|
| 84 |
+
json5==0.9.14
|
| 85 |
+
jsonnet==0.19.1
|
| 86 |
+
jsonschema-specifications==2023.12.1
|
| 87 |
+
jsonschema==4.21.1
|
| 88 |
+
jupyter-client==8.6.0
|
| 89 |
+
jupyter-core==5.7.1
|
| 90 |
+
jupyter-tensorboard==0.2.0
|
| 91 |
+
jupyterlab-pygments==0.3.0
|
| 92 |
+
jupyterlab-server==1.2.0
|
| 93 |
+
jupyterlab==2.3.2
|
| 94 |
+
jupytext==1.16.1
|
| 95 |
+
kiwisolver==1.4.5
|
| 96 |
+
langcodes==3.3.0
|
| 97 |
+
lazy-loader==0.3
|
| 98 |
+
librosa==0.10.1
|
| 99 |
+
llvmlite==0.40.1
|
| 100 |
+
locket==1.0.0
|
| 101 |
+
logzero==1.7.0
|
| 102 |
+
lxml==5.2.2
|
| 103 |
+
markdown-it-py==3.0.0
|
| 104 |
+
markdown==3.5.2
|
| 105 |
+
markupsafe==2.1.4
|
| 106 |
+
matplotlib-inline==0.1.6
|
| 107 |
+
matplotlib==3.8.2
|
| 108 |
+
mdit-py-plugins==0.4.0
|
| 109 |
+
mdurl==0.1.2
|
| 110 |
+
mecab-python3==1.0.6
|
| 111 |
+
mistune==3.0.2
|
| 112 |
+
mkl-devel==2021.1.1
|
| 113 |
+
mkl-include==2021.1.1
|
| 114 |
+
mkl==2021.1.1
|
| 115 |
+
mock==5.1.0
|
| 116 |
+
more-itertools==9.1.0
|
| 117 |
+
mpmath==1.3.0
|
| 118 |
+
msgpack==1.0.7
|
| 119 |
+
multidict==6.0.4
|
| 120 |
+
murmurhash==1.0.10
|
| 121 |
+
nbclient==0.9.0
|
| 122 |
+
nbconvert==7.16.0
|
| 123 |
+
nbformat==5.9.2
|
| 124 |
+
nest-asyncio==1.6.0
|
| 125 |
+
networkx==2.6.3
|
| 126 |
+
ninja==1.11.1.1
|
| 127 |
+
nltk==3.8.1
|
| 128 |
+
notebook==6.4.10
|
| 129 |
+
numba==0.57.1+1.g1ff679645
|
| 130 |
+
numpy==1.24.4
|
| 131 |
+
nvfuser==0.1.4a0+d0bb811
|
| 132 |
+
nvidia-dali-cuda120==1.34.0
|
| 133 |
+
nvidia-pyindex==1.0.9
|
| 134 |
+
nvtx==0.2.5
|
| 135 |
+
oauthlib==3.2.2
|
| 136 |
+
onnx==1.15.0rc2
|
| 137 |
+
opencv==4.7.0
|
| 138 |
+
optree==0.10.0
|
| 139 |
+
packaging==23.2
|
| 140 |
+
pandas==1.5.3
|
| 141 |
+
pandocfilters==1.5.1
|
| 142 |
+
parso==0.8.3
|
| 143 |
+
partd==1.4.1
|
| 144 |
+
peft==0.11.1
|
| 145 |
+
pexpect==4.9.0
|
| 146 |
+
pillow==10.2.0
|
| 147 |
+
pip==24.0
|
| 148 |
+
platformdirs==4.2.0
|
| 149 |
+
pluggy==1.4.0
|
| 150 |
+
ply==3.11
|
| 151 |
+
polygraphy==0.49.4
|
| 152 |
+
pooch==1.8.0
|
| 153 |
+
portalocker==2.10.1
|
| 154 |
+
preshed==3.0.9
|
| 155 |
+
prettytable==3.9.0
|
| 156 |
+
prometheus-client==0.19.0
|
| 157 |
+
prompt-toolkit==3.0.43
|
| 158 |
+
protobuf==4.24.4
|
| 159 |
+
psutil==5.9.4
|
| 160 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
| 161 |
+
ptyprocess==0.7.0
|
| 162 |
+
pure-eval==0.2.2
|
| 163 |
+
pyarrow==14.0.1.dev0+gba5374836.d20240125
|
| 164 |
+
pyasn1-modules==0.3.0
|
| 165 |
+
pyasn1==0.5.1
|
| 166 |
+
pybind11-global==2.11.1
|
| 167 |
+
pybind11==2.11.1
|
| 168 |
+
pycocotools==2.0+nv0.8.0
|
| 169 |
+
pycparser==2.21
|
| 170 |
+
pydantic-core==2.16.2
|
| 171 |
+
pydantic==2.6.1
|
| 172 |
+
pygments==2.17.2
|
| 173 |
+
pylibcugraph==23.12.0
|
| 174 |
+
pylibcugraphops==23.12.0
|
| 175 |
+
pylibraft==23.12.0
|
| 176 |
+
pynvml==11.4.1
|
| 177 |
+
pyparsing==3.1.1
|
| 178 |
+
pytest-flakefinder==1.1.0
|
| 179 |
+
pytest-rerunfailures==13.0
|
| 180 |
+
pytest-shard==0.1.2
|
| 181 |
+
pytest-xdist==3.5.0
|
| 182 |
+
pytest==8.0.0
|
| 183 |
+
python-dateutil==2.8.2
|
| 184 |
+
python-dotenv==1.0.0
|
| 185 |
+
python-hostlist==1.23.0
|
| 186 |
+
pytorch-quantization==2.1.2
|
| 187 |
+
pytz==2023.3.post1
|
| 188 |
+
pyyaml==6.0.1
|
| 189 |
+
pyzmq==25.1.2
|
| 190 |
+
raft-dask==23.12.0
|
| 191 |
+
rapids-dask-dependency==23.12.1
|
| 192 |
+
referencing==0.33.0
|
| 193 |
+
regex==2023.12.25
|
| 194 |
+
requests-oauthlib==1.3.1
|
| 195 |
+
requests==2.31.0
|
| 196 |
+
rich==13.7.0
|
| 197 |
+
rmm==23.12.0
|
| 198 |
+
rpds-py==0.17.1
|
| 199 |
+
rsa==4.9
|
| 200 |
+
sacrebleu==2.4.0
|
| 201 |
+
safetensors==0.4.3
|
| 202 |
+
scikit-learn==1.2.0
|
| 203 |
+
scipy==1.12.0
|
| 204 |
+
send2trash==1.8.2
|
| 205 |
+
sentencepiece==0.1.99
|
| 206 |
+
sentry-sdk==2.12.0
|
| 207 |
+
setproctitle==1.3.3
|
| 208 |
+
setuptools==68.2.2
|
| 209 |
+
six==1.16.0
|
| 210 |
+
smart-open==6.4.0
|
| 211 |
+
smmap==5.0.1
|
| 212 |
+
sortedcontainers==2.4.0
|
| 213 |
+
soundfile==0.12.1
|
| 214 |
+
soupsieve==2.5
|
| 215 |
+
soxr==0.3.7
|
| 216 |
+
spacy-legacy==3.0.12
|
| 217 |
+
spacy-loggers==1.0.5
|
| 218 |
+
spacy==3.7.2
|
| 219 |
+
sphinx-glpi-theme==0.6
|
| 220 |
+
srsly==2.4.8
|
| 221 |
+
stack-data==0.6.3
|
| 222 |
+
sympy==1.12
|
| 223 |
+
tabulate==0.9.0
|
| 224 |
+
tbb==2021.11.0
|
| 225 |
+
tblib==3.0.0
|
| 226 |
+
tensorboard-data-server==0.6.1
|
| 227 |
+
tensorboard-plugin-wit==1.8.1
|
| 228 |
+
tensorboard==2.9.0
|
| 229 |
+
tensorrt==8.6.3
|
| 230 |
+
terminado==0.18.0
|
| 231 |
+
termplotlib==0.3.9
|
| 232 |
+
thinc==8.2.3
|
| 233 |
+
threadpoolctl==3.2.0
|
| 234 |
+
thriftpy2==0.4.17
|
| 235 |
+
tinycss2==1.2.1
|
| 236 |
+
tokenizers==0.19.1
|
| 237 |
+
toml==0.10.2
|
| 238 |
+
tomli==2.0.1
|
| 239 |
+
toolz==0.12.1
|
| 240 |
+
torch-tensorrt==2.3.0a0
|
| 241 |
+
torch==2.3.0a0+ebedce2
|
| 242 |
+
torchdata==0.7.1a0
|
| 243 |
+
torchtext==0.17.0a0
|
| 244 |
+
torchvision==0.18.0a0
|
| 245 |
+
tornado==6.4
|
| 246 |
+
tqdm==4.66.1
|
| 247 |
+
traitlets==5.9.0
|
| 248 |
+
transformer-engine==1.3.0+5b90b7f
|
| 249 |
+
transformers==4.43.3
|
| 250 |
+
treelite-runtime==3.9.1
|
| 251 |
+
treelite==3.9.1
|
| 252 |
+
triton==2.2.0+e28a256
|
| 253 |
+
typer==0.9.0
|
| 254 |
+
types-dataclasses==0.6.6
|
| 255 |
+
typing-extensions==4.9.0
|
| 256 |
+
ucx-py==0.35.0
|
| 257 |
+
uff==0.6.9
|
| 258 |
+
ujson==5.8.0
|
| 259 |
+
urllib3==1.26.18
|
| 260 |
+
wandb==0.16.3
|
| 261 |
+
wasabi==1.1.2
|
| 262 |
+
wcwidth==0.2.13
|
| 263 |
+
weasel==0.3.4
|
| 264 |
+
webencodings==0.5.1
|
| 265 |
+
werkzeug==3.0.1
|
| 266 |
+
wheel==0.42.0
|
| 267 |
+
xdoctest==1.0.2
|
| 268 |
+
xgboost==1.7.6
|
| 269 |
+
yarl==1.9.4
|
| 270 |
+
zict==3.0.0
|
| 271 |
+
zipp==3.17.0
|
wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "3.10.12",
|
| 4 |
+
"heartbeatAt": "2024-08-11T20:28:54.148690",
|
| 5 |
+
"startedAt": "2024-08-11T20:28:53.511276",
|
| 6 |
+
"docker": null,
|
| 7 |
+
"cuda": null,
|
| 8 |
+
"args": [
|
| 9 |
+
"--seq-length",
|
| 10 |
+
"4096",
|
| 11 |
+
"--sliding-window-size",
|
| 12 |
+
"4096",
|
| 13 |
+
"--micro-batch-size",
|
| 14 |
+
"1",
|
| 15 |
+
"--global-batch-size",
|
| 16 |
+
"320",
|
| 17 |
+
"--train-iters",
|
| 18 |
+
"20000",
|
| 19 |
+
"--tokenizer-type",
|
| 20 |
+
"HFPreTrainedTokenizer",
|
| 21 |
+
"--tokenizer-model",
|
| 22 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
| 23 |
+
"--train-data-path",
|
| 24 |
+
"304771887",
|
| 25 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
| 26 |
+
"--valid-data-path",
|
| 27 |
+
"304771887",
|
| 28 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
| 29 |
+
"--test-data-path",
|
| 30 |
+
"304771887",
|
| 31 |
+
"/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
|
| 32 |
+
"--lr",
|
| 33 |
+
"2e-5",
|
| 34 |
+
"--min-lr",
|
| 35 |
+
"1e-6",
|
| 36 |
+
"--lr-decay-style",
|
| 37 |
+
"cosine",
|
| 38 |
+
"--lr-warmup-iters",
|
| 39 |
+
"500",
|
| 40 |
+
"--lr-decay-iters",
|
| 41 |
+
"20000",
|
| 42 |
+
"--weight-decay",
|
| 43 |
+
"0.1",
|
| 44 |
+
"--grad-clip-norm",
|
| 45 |
+
"1.0",
|
| 46 |
+
"--optimizer",
|
| 47 |
+
"adam",
|
| 48 |
+
"--adam-beta1",
|
| 49 |
+
"0.9",
|
| 50 |
+
"--adam-beta2",
|
| 51 |
+
"0.95",
|
| 52 |
+
"--adam-eps",
|
| 53 |
+
"1e-6",
|
| 54 |
+
"--save-interval",
|
| 55 |
+
"5",
|
| 56 |
+
"--eval-interval",
|
| 57 |
+
"200",
|
| 58 |
+
"--eval-iters",
|
| 59 |
+
"10",
|
| 60 |
+
"--bf16",
|
| 61 |
+
"--mixed-precision",
|
| 62 |
+
"--base-model",
|
| 63 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
| 64 |
+
"--save",
|
| 65 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
| 66 |
+
"--load",
|
| 67 |
+
"/work/llm_recipes/models/yans-qwen2-0.5B",
|
| 68 |
+
"--fsdp-activation-checkpointing",
|
| 69 |
+
"--sharding-strategy",
|
| 70 |
+
"FULL_SHARD",
|
| 71 |
+
"--checkpoint-type",
|
| 72 |
+
"LOCAL_STATE_DICT",
|
| 73 |
+
"--save-n-checkpoints",
|
| 74 |
+
"10",
|
| 75 |
+
"--hf-upload-retry-limit",
|
| 76 |
+
"2",
|
| 77 |
+
"--hf-repo-id",
|
| 78 |
+
"koichi12/yans-qwen2-0.5B",
|
| 79 |
+
"--wandb-entity",
|
| 80 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
| 81 |
+
"--wandb-project",
|
| 82 |
+
"llm_tutorial",
|
| 83 |
+
"--wandb-name",
|
| 84 |
+
"yans-qwen2-0.5B_train_2024-08-12-05:28:42"
|
| 85 |
+
],
|
| 86 |
+
"state": "running",
|
| 87 |
+
"program": "/project/examples/finetuning.py",
|
| 88 |
+
"codePathLocal": "examples/finetuning.py",
|
| 89 |
+
"codePath": "examples/finetuning.py",
|
| 90 |
+
"git": {
|
| 91 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
| 92 |
+
"commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
|
| 93 |
+
},
|
| 94 |
+
"email": null,
|
| 95 |
+
"root": "/project",
|
| 96 |
+
"host": "gpu-koiwa-00",
|
| 97 |
+
"username": "koiwa",
|
| 98 |
+
"executable": "/usr/bin/python",
|
| 99 |
+
"cpu_count": 18,
|
| 100 |
+
"cpu_count_logical": 18,
|
| 101 |
+
"cpu_freq": {
|
| 102 |
+
"current": 2400.0429999999997,
|
| 103 |
+
"min": 0.0,
|
| 104 |
+
"max": 0.0
|
| 105 |
+
},
|
| 106 |
+
"cpu_freq_per_core": [
|
| 107 |
+
{
|
| 108 |
+
"current": 2400.043,
|
| 109 |
+
"min": 0.0,
|
| 110 |
+
"max": 0.0
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"current": 2400.043,
|
| 114 |
+
"min": 0.0,
|
| 115 |
+
"max": 0.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"current": 2400.043,
|
| 119 |
+
"min": 0.0,
|
| 120 |
+
"max": 0.0
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"current": 2400.043,
|
| 124 |
+
"min": 0.0,
|
| 125 |
+
"max": 0.0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"current": 2400.043,
|
| 129 |
+
"min": 0.0,
|
| 130 |
+
"max": 0.0
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"current": 2400.043,
|
| 134 |
+
"min": 0.0,
|
| 135 |
+
"max": 0.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"current": 2400.043,
|
| 139 |
+
"min": 0.0,
|
| 140 |
+
"max": 0.0
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"current": 2400.043,
|
| 144 |
+
"min": 0.0,
|
| 145 |
+
"max": 0.0
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"current": 2400.043,
|
| 149 |
+
"min": 0.0,
|
| 150 |
+
"max": 0.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"current": 2400.043,
|
| 154 |
+
"min": 0.0,
|
| 155 |
+
"max": 0.0
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"current": 2400.043,
|
| 159 |
+
"min": 0.0,
|
| 160 |
+
"max": 0.0
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"current": 2400.043,
|
| 164 |
+
"min": 0.0,
|
| 165 |
+
"max": 0.0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"current": 2400.043,
|
| 169 |
+
"min": 0.0,
|
| 170 |
+
"max": 0.0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"current": 2400.043,
|
| 174 |
+
"min": 0.0,
|
| 175 |
+
"max": 0.0
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"current": 2400.043,
|
| 179 |
+
"min": 0.0,
|
| 180 |
+
"max": 0.0
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"current": 2400.043,
|
| 184 |
+
"min": 0.0,
|
| 185 |
+
"max": 0.0
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"current": 2400.043,
|
| 189 |
+
"min": 0.0,
|
| 190 |
+
"max": 0.0
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"current": 2400.043,
|
| 194 |
+
"min": 0.0,
|
| 195 |
+
"max": 0.0
|
| 196 |
+
}
|
| 197 |
+
],
|
| 198 |
+
"disk": {
|
| 199 |
+
"/": {
|
| 200 |
+
"total": 0.0625,
|
| 201 |
+
"used": 1.1444091796875e-05
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
| 205 |
+
"gpu_count": 1,
|
| 206 |
+
"gpu_devices": [
|
| 207 |
+
{
|
| 208 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
| 209 |
+
"memory_total": 42949672960
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"memory": {
|
| 213 |
+
"total": 56.487823486328125
|
| 214 |
+
}
|
| 215 |
+
}
|
wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"training/loss": 4.198177337646484, "training/perplexity": 66.56489507784042, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 5, "optimizer/lr": 1.19e-06, "optimizer/variance_l2": 0.00650817005037245, "optimizer/variance_sqrt_l2": 0.4753125323283669, "optimizer/momentum_l2": 0.4059003829432183, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.22650909423828125, "optimizer/variance_sqrt_l1": 1979.75, "optimizer/momentum_l1": 1591.375, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.004669189453125, "optimizer/variance_sqrt_abs_max": 0.068359375, "optimizer/momentum_abs_max": 0.058837890625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.65197611400004, "stats/tokens_per_sec": 17562.0267305172, "stats/tokens_per_sec_per_gpu": 17562.0267305172, "stats/tflops": 70.61673302016509, "_timestamp": 1723408520.9273944, "_runtime": 387.4032714366913, "_step": 5, "_wandb": {"runtime": 391}}
|
wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-12 05:28:53,525 INFO StreamThr :10531 [internal.py:wandb_internal():86] W&B internal server running at pid: 10531, started at: 2024-08-12 05:28:53.524894
|
| 2 |
+
2024-08-12 05:28:53,527 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status
|
| 3 |
+
2024-08-12 05:28:53,529 INFO WriterThread:10531 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
|
| 4 |
+
2024-08-12 05:28:53,530 DEBUG SenderThread:10531 [sender.py:send():382] send: header
|
| 5 |
+
2024-08-12 05:28:53,544 DEBUG SenderThread:10531 [sender.py:send():382] send: run
|
| 6 |
+
2024-08-12 05:28:54,033 INFO SenderThread:10531 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_052853-n84i0o06/files
|
| 7 |
+
2024-08-12 05:28:54,033 INFO SenderThread:10531 [sender.py:_start_run_threads():1136] run started: n84i0o06 with start time 1723408133.524123
|
| 8 |
+
2024-08-12 05:28:54,038 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: check_version
|
| 9 |
+
2024-08-12 05:28:54,038 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: check_version
|
| 10 |
+
2024-08-12 05:28:54,128 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: run_start
|
| 11 |
+
2024-08-12 05:28:54,135 DEBUG HandlerThread:10531 [system_info.py:__init__():27] System info init
|
| 12 |
+
2024-08-12 05:28:54,135 DEBUG HandlerThread:10531 [system_info.py:__init__():42] System info init done
|
| 13 |
+
2024-08-12 05:28:54,135 INFO HandlerThread:10531 [system_monitor.py:start():194] Starting system monitor
|
| 14 |
+
2024-08-12 05:28:54,135 INFO SystemMonitor:10531 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
| 15 |
+
2024-08-12 05:28:54,135 INFO HandlerThread:10531 [system_monitor.py:probe():214] Collecting system info
|
| 16 |
+
2024-08-12 05:28:54,136 INFO SystemMonitor:10531 [interfaces.py:start():190] Started cpu monitoring
|
| 17 |
+
2024-08-12 05:28:54,136 INFO SystemMonitor:10531 [interfaces.py:start():190] Started disk monitoring
|
| 18 |
+
2024-08-12 05:28:54,137 INFO SystemMonitor:10531 [interfaces.py:start():190] Started gpu monitoring
|
| 19 |
+
2024-08-12 05:28:54,138 INFO SystemMonitor:10531 [interfaces.py:start():190] Started memory monitoring
|
| 20 |
+
2024-08-12 05:28:54,139 INFO SystemMonitor:10531 [interfaces.py:start():190] Started network monitoring
|
| 21 |
+
2024-08-12 05:28:54,148 DEBUG HandlerThread:10531 [system_info.py:probe():151] Probing system
|
| 22 |
+
2024-08-12 05:28:54,150 DEBUG HandlerThread:10531 [system_info.py:_probe_git():136] Probing git
|
| 23 |
+
2024-08-12 05:28:54,163 DEBUG HandlerThread:10531 [system_info.py:_probe_git():144] Probing git done
|
| 24 |
+
2024-08-12 05:28:54,163 DEBUG HandlerThread:10531 [system_info.py:probe():199] Probing system done
|
| 25 |
+
2024-08-12 05:28:54,163 DEBUG HandlerThread:10531 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T20:28:54.148690', 'startedAt': '2024-08-11T20:28:53.511276', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-05:28:42'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
|
| 26 |
+
2024-08-12 05:28:54,163 INFO HandlerThread:10531 [system_monitor.py:probe():224] Finished collecting system info
|
| 27 |
+
2024-08-12 05:28:54,163 INFO HandlerThread:10531 [system_monitor.py:probe():227] Publishing system info
|
| 28 |
+
2024-08-12 05:28:54,164 INFO HandlerThread:10531 [system_monitor.py:probe():229] Finished publishing system info
|
| 29 |
+
2024-08-12 05:28:54,170 DEBUG SenderThread:10531 [sender.py:send():382] send: files
|
| 30 |
+
2024-08-12 05:28:54,170 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
| 31 |
+
2024-08-12 05:28:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: python_packages
|
| 32 |
+
2024-08-12 05:28:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 33 |
+
2024-08-12 05:28:54,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: python_packages
|
| 34 |
+
2024-08-12 05:28:54,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 35 |
+
2024-08-12 05:28:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 36 |
+
2024-08-12 05:28:54,475 DEBUG SenderThread:10531 [sender.py:send():382] send: telemetry
|
| 37 |
+
2024-08-12 05:28:54,885 INFO wandb-upload_0:10531 [upload_job.py:push():131] Uploaded file /tmp/tmp0u7r0fs3wandb/exuilam8-wandb-metadata.json
|
| 38 |
+
2024-08-12 05:28:55,035 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json
|
| 39 |
+
2024-08-12 05:28:55,035 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt
|
| 40 |
+
2024-08-12 05:28:56,035 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 41 |
+
2024-08-12 05:28:58,036 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 42 |
+
2024-08-12 05:28:59,328 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 43 |
+
2024-08-12 05:29:00,038 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 44 |
+
2024-08-12 05:29:01,878 DEBUG SenderThread:10531 [sender.py:send():382] send: config
|
| 45 |
+
2024-08-12 05:29:01,879 DEBUG SenderThread:10531 [sender.py:send():382] send: config
|
| 46 |
+
2024-08-12 05:29:02,039 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 47 |
+
2024-08-12 05:29:04,040 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 48 |
+
2024-08-12 05:29:04,879 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 49 |
+
2024-08-12 05:29:09,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 50 |
+
2024-08-12 05:29:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 51 |
+
2024-08-12 05:29:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 52 |
+
2024-08-12 05:29:10,368 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 53 |
+
2024-08-12 05:29:15,369 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 54 |
+
2024-08-12 05:29:20,370 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 55 |
+
2024-08-12 05:29:24,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 56 |
+
2024-08-12 05:29:24,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 57 |
+
2024-08-12 05:29:24,220 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 58 |
+
2024-08-12 05:29:26,367 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 59 |
+
2024-08-12 05:29:27,058 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml
|
| 60 |
+
2024-08-12 05:29:31,577 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 61 |
+
2024-08-12 05:29:36,578 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 62 |
+
2024-08-12 05:29:39,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 63 |
+
2024-08-12 05:29:39,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 64 |
+
2024-08-12 05:29:39,220 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 65 |
+
2024-08-12 05:29:42,448 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 66 |
+
2024-08-12 05:29:47,449 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 67 |
+
2024-08-12 05:29:52,450 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 68 |
+
2024-08-12 05:29:54,139 DEBUG SystemMonitor:10531 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
| 69 |
+
2024-08-12 05:29:54,141 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 70 |
+
2024-08-12 05:29:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 71 |
+
2024-08-12 05:29:54,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 72 |
+
2024-08-12 05:29:54,220 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 73 |
+
2024-08-12 05:29:58,446 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 74 |
+
2024-08-12 05:30:03,447 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 75 |
+
2024-08-12 05:30:08,448 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 76 |
+
2024-08-12 05:30:09,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 77 |
+
2024-08-12 05:30:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 78 |
+
2024-08-12 05:30:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 79 |
+
2024-08-12 05:30:13,456 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 80 |
+
2024-08-12 05:30:18,457 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 81 |
+
2024-08-12 05:30:22,408 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
|
| 82 |
+
2024-08-12 05:30:24,097 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 83 |
+
2024-08-12 05:30:24,142 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 84 |
+
2024-08-12 05:30:24,142 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 85 |
+
2024-08-12 05:30:24,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 86 |
+
2024-08-12 05:30:24,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 87 |
+
2024-08-12 05:30:24,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 88 |
+
2024-08-12 05:30:29,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 89 |
+
2024-08-12 05:30:34,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 90 |
+
2024-08-12 05:30:39,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 91 |
+
2024-08-12 05:30:39,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 92 |
+
2024-08-12 05:30:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 93 |
+
2024-08-12 05:30:40,419 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 94 |
+
2024-08-12 05:30:45,420 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 95 |
+
2024-08-12 05:30:50,421 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 96 |
+
2024-08-12 05:30:54,143 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 97 |
+
2024-08-12 05:30:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 98 |
+
2024-08-12 05:30:54,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 99 |
+
2024-08-12 05:30:54,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 100 |
+
2024-08-12 05:30:56,414 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 101 |
+
2024-08-12 05:31:01,416 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 102 |
+
2024-08-12 05:31:06,417 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 103 |
+
2024-08-12 05:31:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 104 |
+
2024-08-12 05:31:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 105 |
+
2024-08-12 05:31:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 106 |
+
2024-08-12 05:31:12,373 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 107 |
+
2024-08-12 05:31:17,375 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 108 |
+
2024-08-12 05:31:22,376 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 109 |
+
2024-08-12 05:31:24,144 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 110 |
+
2024-08-12 05:31:24,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 111 |
+
2024-08-12 05:31:24,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 112 |
+
2024-08-12 05:31:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 113 |
+
2024-08-12 05:31:28,366 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 114 |
+
2024-08-12 05:31:33,367 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 115 |
+
2024-08-12 05:31:36,963 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
|
| 116 |
+
2024-08-12 05:31:36,966 DEBUG SenderThread:10531 [sender.py:send():382] send: history
|
| 117 |
+
2024-08-12 05:31:36,966 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
|
| 118 |
+
2024-08-12 05:31:36,968 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 119 |
+
2024-08-12 05:31:37,152 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
|
| 120 |
+
2024-08-12 05:31:39,006 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 121 |
+
2024-08-12 05:31:39,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 122 |
+
2024-08-12 05:31:39,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 123 |
+
2024-08-12 05:31:39,183 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 124 |
+
2024-08-12 05:31:40,154 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 125 |
+
2024-08-12 05:31:44,409 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 126 |
+
2024-08-12 05:31:49,410 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 127 |
+
2024-08-12 05:31:54,145 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 128 |
+
2024-08-12 05:31:54,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 129 |
+
2024-08-12 05:31:54,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 130 |
+
2024-08-12 05:31:54,228 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 131 |
+
2024-08-12 05:31:55,354 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 132 |
+
2024-08-12 05:32:00,355 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 133 |
+
2024-08-12 05:32:05,356 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 134 |
+
2024-08-12 05:32:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 135 |
+
2024-08-12 05:32:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 136 |
+
2024-08-12 05:32:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 137 |
+
2024-08-12 05:32:10,376 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 138 |
+
2024-08-12 05:32:15,377 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 139 |
+
2024-08-12 05:32:20,378 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 140 |
+
2024-08-12 05:32:24,146 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 141 |
+
2024-08-12 05:32:24,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 142 |
+
2024-08-12 05:32:24,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 143 |
+
2024-08-12 05:32:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 144 |
+
2024-08-12 05:32:25,450 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 145 |
+
2024-08-12 05:32:30,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 146 |
+
2024-08-12 05:32:35,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 147 |
+
2024-08-12 05:32:39,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 148 |
+
2024-08-12 05:32:39,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 149 |
+
2024-08-12 05:32:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 150 |
+
2024-08-12 05:32:41,437 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 151 |
+
2024-08-12 05:32:46,438 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 152 |
+
2024-08-12 05:32:51,438 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 153 |
+
2024-08-12 05:32:51,692 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
|
| 154 |
+
2024-08-12 05:32:51,694 DEBUG SenderThread:10531 [sender.py:send():382] send: history
|
| 155 |
+
2024-08-12 05:32:51,694 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
|
| 156 |
+
2024-08-12 05:32:51,696 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 157 |
+
2024-08-12 05:32:52,204 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
|
| 158 |
+
2024-08-12 05:32:54,147 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 159 |
+
2024-08-12 05:32:54,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 160 |
+
2024-08-12 05:32:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 161 |
+
2024-08-12 05:32:54,183 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 162 |
+
2024-08-12 05:32:54,205 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 163 |
+
2024-08-12 05:32:56,453 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 164 |
+
2024-08-12 05:33:01,453 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 165 |
+
2024-08-12 05:33:06,454 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 166 |
+
2024-08-12 05:33:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 167 |
+
2024-08-12 05:33:09,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 168 |
+
2024-08-12 05:33:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 169 |
+
2024-08-12 05:33:12,386 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 170 |
+
2024-08-12 05:33:17,386 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 171 |
+
2024-08-12 05:33:22,387 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 172 |
+
2024-08-12 05:33:24,148 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 173 |
+
2024-08-12 05:33:24,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 174 |
+
2024-08-12 05:33:24,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 175 |
+
2024-08-12 05:33:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 176 |
+
2024-08-12 05:33:28,379 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 177 |
+
2024-08-12 05:33:33,380 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 178 |
+
2024-08-12 05:33:38,380 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 179 |
+
2024-08-12 05:33:39,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 180 |
+
2024-08-12 05:33:39,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 181 |
+
2024-08-12 05:33:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 182 |
+
2024-08-12 05:33:43,420 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 183 |
+
2024-08-12 05:33:48,421 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 184 |
+
2024-08-12 05:33:53,421 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 185 |
+
2024-08-12 05:33:54,149 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 186 |
+
2024-08-12 05:33:54,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 187 |
+
2024-08-12 05:33:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 188 |
+
2024-08-12 05:33:54,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 189 |
+
2024-08-12 05:33:59,378 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 190 |
+
2024-08-12 05:34:04,379 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 191 |
+
2024-08-12 05:34:06,274 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
|
| 192 |
+
2024-08-12 05:34:06,276 DEBUG SenderThread:10531 [sender.py:send():382] send: history
|
| 193 |
+
2024-08-12 05:34:06,277 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
|
| 194 |
+
2024-08-12 05:34:06,278 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 195 |
+
2024-08-12 05:34:07,249 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
|
| 196 |
+
2024-08-12 05:34:08,250 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 197 |
+
2024-08-12 05:34:09,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 198 |
+
2024-08-12 05:34:09,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 199 |
+
2024-08-12 05:34:09,184 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 200 |
+
2024-08-12 05:34:09,395 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 201 |
+
2024-08-12 05:34:14,395 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 202 |
+
2024-08-12 05:34:19,396 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 203 |
+
2024-08-12 05:34:24,150 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 204 |
+
2024-08-12 05:34:24,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 205 |
+
2024-08-12 05:34:24,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 206 |
+
2024-08-12 05:34:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 207 |
+
2024-08-12 05:34:25,394 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 208 |
+
2024-08-12 05:34:30,395 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 209 |
+
2024-08-12 05:34:35,396 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 210 |
+
2024-08-12 05:34:39,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 211 |
+
2024-08-12 05:34:39,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 212 |
+
2024-08-12 05:34:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 213 |
+
2024-08-12 05:34:40,439 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 214 |
+
2024-08-12 05:34:45,439 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 215 |
+
2024-08-12 05:34:50,440 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 216 |
+
2024-08-12 05:34:54,152 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 217 |
+
2024-08-12 05:34:54,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 218 |
+
2024-08-12 05:34:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 219 |
+
2024-08-12 05:34:54,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 220 |
+
2024-08-12 05:34:55,454 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 221 |
+
2024-08-12 05:35:00,455 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 222 |
+
2024-08-12 05:35:05,455 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 223 |
+
2024-08-12 05:35:09,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 224 |
+
2024-08-12 05:35:09,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 225 |
+
2024-08-12 05:35:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 226 |
+
2024-08-12 05:35:11,407 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 227 |
+
2024-08-12 05:35:16,407 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 228 |
+
2024-08-12 05:35:20,928 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
|
| 229 |
+
2024-08-12 05:35:20,930 DEBUG SenderThread:10531 [sender.py:send():382] send: history
|
| 230 |
+
2024-08-12 05:35:20,931 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
|
| 231 |
+
2024-08-12 05:35:20,932 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 232 |
+
2024-08-12 05:35:21,295 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
|
| 233 |
+
2024-08-12 05:35:21,970 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 234 |
+
2024-08-12 05:35:22,296 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 235 |
+
2024-08-12 05:35:24,152 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 236 |
+
2024-08-12 05:35:24,232 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 237 |
+
2024-08-12 05:35:24,255 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
|
| 238 |
+
2024-08-12 05:35:24,256 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
|
| 239 |
+
2024-08-12 05:35:24,297 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 240 |
+
2024-08-12 05:35:25,212 DEBUG SenderThread:10531 [sender.py:send():382] send: exit
|
| 241 |
+
2024-08-12 05:35:25,213 INFO SenderThread:10531 [sender.py:send_exit():589] handling exit code: 1
|
| 242 |
+
2024-08-12 05:35:25,213 INFO SenderThread:10531 [sender.py:send_exit():591] handling runtime: 391
|
| 243 |
+
2024-08-12 05:35:25,214 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 244 |
+
2024-08-12 05:35:25,214 INFO SenderThread:10531 [sender.py:send_exit():597] send defer
|
| 245 |
+
2024-08-12 05:35:25,214 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 246 |
+
2024-08-12 05:35:25,214 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 0
|
| 247 |
+
2024-08-12 05:35:25,215 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 248 |
+
2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 0
|
| 249 |
+
2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 1
|
| 250 |
+
2024-08-12 05:35:25,215 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 251 |
+
2024-08-12 05:35:25,215 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 1
|
| 252 |
+
2024-08-12 05:35:25,215 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 253 |
+
2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 1
|
| 254 |
+
2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 2
|
| 255 |
+
2024-08-12 05:35:25,215 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 256 |
+
2024-08-12 05:35:25,215 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 2
|
| 257 |
+
2024-08-12 05:35:25,215 INFO HandlerThread:10531 [system_monitor.py:finish():203] Stopping system monitor
|
| 258 |
+
2024-08-12 05:35:25,215 DEBUG SystemMonitor:10531 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
| 259 |
+
2024-08-12 05:35:25,215 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined cpu monitor
|
| 260 |
+
2024-08-12 05:35:25,216 DEBUG SystemMonitor:10531 [system_monitor.py:_start():183] Publishing last batch of metrics
|
| 261 |
+
2024-08-12 05:35:25,216 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined disk monitor
|
| 262 |
+
2024-08-12 05:35:25,249 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined gpu monitor
|
| 263 |
+
2024-08-12 05:35:25,249 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined memory monitor
|
| 264 |
+
2024-08-12 05:35:25,249 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined network monitor
|
| 265 |
+
2024-08-12 05:35:25,249 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 266 |
+
2024-08-12 05:35:25,249 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 2
|
| 267 |
+
2024-08-12 05:35:25,249 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 3
|
| 268 |
+
2024-08-12 05:35:25,249 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
|
| 269 |
+
2024-08-12 05:35:25,250 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 270 |
+
2024-08-12 05:35:25,250 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 3
|
| 271 |
+
2024-08-12 05:35:25,251 DEBUG SenderThread:10531 [sender.py:send():382] send: history
|
| 272 |
+
2024-08-12 05:35:25,252 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
|
| 273 |
+
2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 274 |
+
2024-08-12 05:35:25,253 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 275 |
+
2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 3
|
| 276 |
+
2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 4
|
| 277 |
+
2024-08-12 05:35:25,253 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 278 |
+
2024-08-12 05:35:25,253 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 4
|
| 279 |
+
2024-08-12 05:35:25,253 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 280 |
+
2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 4
|
| 281 |
+
2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 5
|
| 282 |
+
2024-08-12 05:35:25,253 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 283 |
+
2024-08-12 05:35:25,253 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 5
|
| 284 |
+
2024-08-12 05:35:25,254 DEBUG SenderThread:10531 [sender.py:send():382] send: summary
|
| 285 |
+
2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 286 |
+
2024-08-12 05:35:25,255 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 287 |
+
2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 5
|
| 288 |
+
2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 6
|
| 289 |
+
2024-08-12 05:35:25,255 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 290 |
+
2024-08-12 05:35:25,255 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 6
|
| 291 |
+
2024-08-12 05:35:25,255 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 292 |
+
2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 6
|
| 293 |
+
2024-08-12 05:35:25,256 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 7
|
| 294 |
+
2024-08-12 05:35:25,256 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
|
| 295 |
+
2024-08-12 05:35:25,256 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 296 |
+
2024-08-12 05:35:25,256 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 7
|
| 297 |
+
2024-08-12 05:35:25,256 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 298 |
+
2024-08-12 05:35:25,256 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 7
|
| 299 |
+
2024-08-12 05:35:25,298 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
|
| 300 |
+
2024-08-12 05:35:26,141 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 8
|
| 301 |
+
2024-08-12 05:35:26,142 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 302 |
+
2024-08-12 05:35:26,142 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 8
|
| 303 |
+
2024-08-12 05:35:26,142 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 304 |
+
2024-08-12 05:35:26,142 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 8
|
| 305 |
+
2024-08-12 05:35:26,142 INFO SenderThread:10531 [job_builder.py:build():296] Attempting to build job artifact
|
| 306 |
+
2024-08-12 05:35:26,143 INFO SenderThread:10531 [job_builder.py:_get_source_type():426] is repo sourced job
|
| 307 |
+
2024-08-12 05:35:26,157 INFO SenderThread:10531 [job_builder.py:build():402] adding wandb-job metadata file
|
| 308 |
+
2024-08-12 05:35:26,166 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 9
|
| 309 |
+
2024-08-12 05:35:26,166 DEBUG SenderThread:10531 [sender.py:send():382] send: artifact
|
| 310 |
+
2024-08-12 05:35:26,166 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 311 |
+
2024-08-12 05:35:26,167 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 9
|
| 312 |
+
2024-08-12 05:35:26,213 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
|
| 313 |
+
2024-08-12 05:35:26,299 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 314 |
+
2024-08-12 05:35:27,302 INFO wandb-upload_1:10531 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpyfws5ko3
|
| 315 |
+
2024-08-12 05:35:27,738 INFO wandb-upload_0:10531 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpypuucsag
|
| 316 |
+
2024-08-12 05:35:29,357 INFO SenderThread:10531 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'versionIndex': 6}}}
|
| 317 |
+
2024-08-12 05:35:29,357 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 318 |
+
2024-08-12 05:35:29,357 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 9
|
| 319 |
+
2024-08-12 05:35:29,358 INFO SenderThread:10531 [dir_watcher.py:finish():358] shutting down directory watcher
|
| 320 |
+
2024-08-12 05:35:30,300 INFO SenderThread:10531 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_052853-n84i0o06/files
|
| 321 |
+
2024-08-12 05:35:30,301 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt requirements.txt
|
| 322 |
+
2024-08-12 05:35:30,301 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml config.yaml
|
| 323 |
+
2024-08-12 05:35:30,301 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json wandb-metadata.json
|
| 324 |
+
2024-08-12 05:35:30,302 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json wandb-summary.json
|
| 325 |
+
2024-08-12 05:35:30,304 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/output.log output.log
|
| 326 |
+
2024-08-12 05:35:30,306 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 10
|
| 327 |
+
2024-08-12 05:35:30,306 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
|
| 328 |
+
2024-08-12 05:35:30,306 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 329 |
+
2024-08-12 05:35:30,307 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 10
|
| 330 |
+
2024-08-12 05:35:30,308 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 331 |
+
2024-08-12 05:35:30,308 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 10
|
| 332 |
+
2024-08-12 05:35:30,308 INFO SenderThread:10531 [file_pusher.py:finish():172] shutting down file pusher
|
| 333 |
+
2024-08-12 05:35:30,718 INFO wandb-upload_0:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml
|
| 334 |
+
2024-08-12 05:35:30,895 INFO wandb-upload_3:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/output.log
|
| 335 |
+
2024-08-12 05:35:31,214 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: keepalive
|
| 336 |
+
2024-08-12 05:35:31,214 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
|
| 337 |
+
2024-08-12 05:35:31,214 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
|
| 338 |
+
2024-08-12 05:35:31,248 INFO wandb-upload_1:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt
|
| 339 |
+
2024-08-12 05:35:31,299 INFO wandb-upload_2:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
|
| 340 |
+
2024-08-12 05:35:31,499 INFO Thread-11 (_thread_body):10531 [sender.py:transition_state():617] send defer: 11
|
| 341 |
+
2024-08-12 05:35:31,499 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 342 |
+
2024-08-12 05:35:31,500 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 11
|
| 343 |
+
2024-08-12 05:35:31,500 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 344 |
+
2024-08-12 05:35:31,500 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 11
|
| 345 |
+
2024-08-12 05:35:31,500 INFO SenderThread:10531 [file_pusher.py:join():178] waiting for file pusher
|
| 346 |
+
2024-08-12 05:35:31,500 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 12
|
| 347 |
+
2024-08-12 05:35:31,500 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 348 |
+
2024-08-12 05:35:31,500 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 12
|
| 349 |
+
2024-08-12 05:35:31,500 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 350 |
+
2024-08-12 05:35:31,500 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 12
|
| 351 |
+
2024-08-12 05:35:31,500 INFO SenderThread:10531 [file_stream.py:finish():595] file stream finish called
|
| 352 |
+
2024-08-12 05:35:32,061 INFO SenderThread:10531 [file_stream.py:finish():599] file stream finish is done
|
| 353 |
+
2024-08-12 05:35:32,061 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 13
|
| 354 |
+
2024-08-12 05:35:32,061 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 355 |
+
2024-08-12 05:35:32,061 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 13
|
| 356 |
+
2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 357 |
+
2024-08-12 05:35:32,062 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 13
|
| 358 |
+
2024-08-12 05:35:32,062 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 14
|
| 359 |
+
2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send():382] send: final
|
| 360 |
+
2024-08-12 05:35:32,062 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
|
| 361 |
+
2024-08-12 05:35:32,062 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 14
|
| 362 |
+
2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send():382] send: footer
|
| 363 |
+
2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
|
| 364 |
+
2024-08-12 05:35:32,062 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 14
|
| 365 |
+
2024-08-12 05:35:32,063 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
|
| 366 |
+
2024-08-12 05:35:32,063 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
|
| 367 |
+
2024-08-12 05:35:32,063 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
|
| 368 |
+
2024-08-12 05:35:32,064 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
|
| 369 |
+
2024-08-12 05:35:32,064 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: server_info
|
| 370 |
+
2024-08-12 05:35:32,064 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: server_info
|
| 371 |
+
2024-08-12 05:35:32,065 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: get_summary
|
| 372 |
+
2024-08-12 05:35:32,066 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: sampled_history
|
| 373 |
+
2024-08-12 05:35:32,067 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
|
| 374 |
+
2024-08-12 05:35:32,067 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: job_info
|
| 375 |
+
2024-08-12 05:35:32,238 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: job_info
|
| 376 |
+
2024-08-12 05:35:32,238 INFO MainThread:10531 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
| 377 |
+
2024-08-12 05:35:32,239 INFO MainThread:10531 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
| 378 |
+
2024-08-12 05:35:32,239 INFO MainThread:10531 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
| 379 |
+
2024-08-12 05:35:32,240 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: shutdown
|
| 380 |
+
2024-08-12 05:35:32,240 INFO HandlerThread:10531 [handler.py:finish():869] shutting down handler
|
| 381 |
+
2024-08-12 05:35:33,068 INFO WriterThread:10531 [datastore.py:close():296] close: /project/wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
|
| 382 |
+
2024-08-12 05:35:33,239 INFO SenderThread:10531 [sender.py:finish():1572] shutting down sender
|
| 383 |
+
2024-08-12 05:35:33,239 INFO SenderThread:10531 [file_pusher.py:finish():172] shutting down file pusher
|
| 384 |
+
2024-08-12 05:35:33,239 INFO SenderThread:10531 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240812_052853-n84i0o06/logs/debug.log
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
| 2 |
+
2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Configure stats pid to 10460
|
| 3 |
+
2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
| 4 |
+
2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
| 5 |
+
2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
|
| 6 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
| 7 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
| 8 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_052853-n84i0o06/logs/debug.log
|
| 9 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log
|
| 10 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():566] calling init triggers
|
| 11 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
| 12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-05:28:42', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
|
| 13 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():616] starting backend
|
| 14 |
+
2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():620] setting up manager
|
| 15 |
+
2024-08-12 05:28:53,523 INFO MainThread:10460 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 16 |
+
2024-08-12 05:28:53,523 INFO MainThread:10460 [wandb_init.py:init():628] backend started and connected
|
| 17 |
+
2024-08-12 05:28:53,528 INFO MainThread:10460 [wandb_init.py:init():720] updated telemetry
|
| 18 |
+
2024-08-12 05:28:53,540 INFO MainThread:10460 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
| 19 |
+
2024-08-12 05:28:54,037 INFO MainThread:10460 [wandb_run.py:_on_init():2262] communicating current version
|
| 20 |
+
2024-08-12 05:28:54,121 INFO MainThread:10460 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
| 21 |
+
|
| 22 |
+
2024-08-12 05:28:54,121 INFO MainThread:10460 [wandb_init.py:init():804] starting run threads in backend
|
| 23 |
+
2024-08-12 05:28:54,179 INFO MainThread:10460 [wandb_run.py:_console_start():2241] atexit reg
|
| 24 |
+
2024-08-12 05:28:54,180 INFO MainThread:10460 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
| 25 |
+
2024-08-12 05:28:54,180 INFO MainThread:10460 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
| 26 |
+
2024-08-12 05:28:54,180 INFO MainThread:10460 [wandb_run.py:_redirect():2186] Redirects installed.
|
| 27 |
+
2024-08-12 05:28:54,181 INFO MainThread:10460 [wandb_init.py:init():847] run started, returning control to user process
|
| 28 |
+
2024-08-12 05:29:01,877 INFO MainThread:10460 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
|
| 29 |
+
2024-08-12 05:29:01,878 INFO MainThread:10460 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
|
| 30 |
+
2024-08-12 05:35:33,240 WARNING MsgRouterThr:10460 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
ADDED
|
Binary file (45.7 kB). View file
|
|
|
wandb/run-20240812_063027-j1htzx7q/files/output.log
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 8 |
+
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [02:31<01:16, 76.44s/it]
|
| 12 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 13 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 14 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
|
| 15 |
+
--> Model /share/pretrained_lm/google/gemma-2-2b
|
| 16 |
+
--> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
|
| 17 |
+
BFloat16 enabled for mixed precision - using bfSixteen policy
|
| 18 |
+
--> applying fsdp activation checkpointing...
|
| 19 |
+
> datasets target sizes (minimum size):
|
| 20 |
+
train: 6400000
|
| 21 |
+
validation: 21334400
|
| 22 |
+
test: 3200
|
| 23 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [02:40<00:00, 53.37s/it]
|
| 24 |
+
/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
|
| 25 |
+
warnings.warn(
|
| 26 |
+
Let split = None
|
| 27 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 28 |
+
Unable to save the indexes because path_to_cache is None
|
| 29 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 30 |
+
Unable to save the indexes because path_to_cache is None
|
| 31 |
+
Building a BlendedDataset for a single MegatronDataset
|
| 32 |
+
Unable to save the indexes because path_to_cache is None
|
| 33 |
+
> finished creating GPT datasets ...
|
| 34 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 35 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 36 |
+
No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
|
| 37 |
+
File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 38 |
+
Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
|
| 39 |
+
model info: FullyShardedDataParallel(
|
| 40 |
+
(_fsdp_wrapped_module): Gemma2ForCausalLM(
|
| 41 |
+
(model): Gemma2Model(
|
| 42 |
+
(embed_tokens): Embedding(256000, 2304, padding_idx=0)
|
| 43 |
+
(layers): ModuleList(
|
| 44 |
+
(0-25): 26 x FullyShardedDataParallel(
|
| 45 |
+
(_fsdp_wrapped_module): CheckpointWrapper(
|
| 46 |
+
(_checkpoint_wrapped_module): Gemma2DecoderLayer(
|
| 47 |
+
(self_attn): Gemma2FlashAttention2(
|
| 48 |
+
(q_proj): Linear(in_features=2304, out_features=2048, bias=False)
|
| 49 |
+
(k_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
| 50 |
+
(v_proj): Linear(in_features=2304, out_features=1024, bias=False)
|
| 51 |
+
(o_proj): Linear(in_features=2048, out_features=2304, bias=False)
|
| 52 |
+
(rotary_emb): Gemma2RotaryEmbedding()
|
| 53 |
+
)
|
| 54 |
+
(mlp): Gemma2MLP(
|
| 55 |
+
(gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
| 56 |
+
(up_proj): Linear(in_features=2304, out_features=9216, bias=False)
|
| 57 |
+
(down_proj): Linear(in_features=9216, out_features=2304, bias=False)
|
| 58 |
+
(act_fn): PytorchGELUTanh()
|
| 59 |
+
)
|
| 60 |
+
(input_layernorm): Gemma2RMSNorm()
|
| 61 |
+
(post_attention_layernorm): Gemma2RMSNorm()
|
| 62 |
+
(pre_feedforward_layernorm): Gemma2RMSNorm()
|
| 63 |
+
(post_feedforward_layernorm): Gemma2RMSNorm()
|
| 64 |
+
)
|
| 65 |
+
)
|
| 66 |
+
)
|
| 67 |
+
)
|
| 68 |
+
(norm): Gemma2RMSNorm()
|
| 69 |
+
)
|
| 70 |
+
(lm_head): Linear(in_features=2304, out_features=256000, bias=False)
|
| 71 |
+
)
|
| 72 |
+
)
|
| 73 |
+
model config: Gemma2Config {
|
| 74 |
+
"_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
|
| 75 |
+
"architectures": [
|
| 76 |
+
"Gemma2ForCausalLM"
|
| 77 |
+
],
|
| 78 |
+
"attention_bias": false,
|
| 79 |
+
"attention_dropout": 0.0,
|
| 80 |
+
"attn_logit_softcapping": 50.0,
|
| 81 |
+
"bos_token_id": 2,
|
| 82 |
+
"cache_implementation": "hybrid",
|
| 83 |
+
"eos_token_id": 1,
|
| 84 |
+
"final_logit_softcapping": 30.0,
|
| 85 |
+
"head_dim": 256,
|
| 86 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 87 |
+
"hidden_activation": "gelu_pytorch_tanh",
|
| 88 |
+
"hidden_size": 2304,
|
| 89 |
+
"initializer_range": 0.02,
|
| 90 |
+
"intermediate_size": 9216,
|
| 91 |
+
"label_smoothing": 0.0,
|
| 92 |
+
"max_position_embeddings": 4096,
|
| 93 |
+
"model_type": "gemma2",
|
| 94 |
+
"num_attention_heads": 8,
|
| 95 |
+
"num_hidden_layers": 26,
|
| 96 |
+
"num_key_value_heads": 4,
|
| 97 |
+
"pad_token_id": 0,
|
| 98 |
+
"query_pre_attn_scalar": 256,
|
| 99 |
+
"rms_norm_eps": 1e-06,
|
| 100 |
+
"rope_theta": 10000.0,
|
| 101 |
+
"sliding_window": 4096,
|
| 102 |
+
"torch_dtype": "float32",
|
| 103 |
+
"transformers_version": "4.43.3",
|
| 104 |
+
"use_cache": false,
|
| 105 |
+
"vocab_size": 256000
|
| 106 |
+
}
|
| 107 |
+
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
|
| 108 |
+
Traceback (most recent call last):
|
| 109 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 110 |
+
main()
|
| 111 |
+
File "/project/src/llama_recipes/finetuning.py", line 281, in main
|
| 112 |
+
train(
|
| 113 |
+
File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
|
| 114 |
+
loss.backward()
|
| 115 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
|
| 116 |
+
torch.autograd.backward(
|
| 117 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
|
| 118 |
+
_engine_run_backward(
|
| 119 |
+
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
|
| 120 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 121 |
+
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.70 GiB. GPU 0 has a total capacity of 39.39 GiB of which 3.86 GiB is free. Including non-PyTorch memory, this process has 35.52 GiB memory in use. Of the allocated memory 32.71 GiB is allocated by PyTorch, and 1.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb": {"runtime": 167}}
|
wandb/run-20240823_154448-v9m85jnt/files/config.yaml
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
sharding_strategy:
|
| 4 |
+
desc: null
|
| 5 |
+
value: FULL_SHARD
|
| 6 |
+
checkpoint_type:
|
| 7 |
+
desc: null
|
| 8 |
+
value: LOCAL_STATE_DICT
|
| 9 |
+
fsdp_activation_checkpointing:
|
| 10 |
+
desc: null
|
| 11 |
+
value: true
|
| 12 |
+
fsdp_cpu_offload:
|
| 13 |
+
desc: null
|
| 14 |
+
value: false
|
| 15 |
+
low_cpu_fsdp:
|
| 16 |
+
desc: null
|
| 17 |
+
value: false
|
| 18 |
+
no_meta_device:
|
| 19 |
+
desc: null
|
| 20 |
+
value: false
|
| 21 |
+
data_path:
|
| 22 |
+
desc: null
|
| 23 |
+
value: null
|
| 24 |
+
split:
|
| 25 |
+
desc: null
|
| 26 |
+
value: 969, 30, 1
|
| 27 |
+
train_data_path:
|
| 28 |
+
desc: null
|
| 29 |
+
value:
|
| 30 |
+
- '1754785366'
|
| 31 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
| 32 |
+
- '28623823675'
|
| 33 |
+
- /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
|
| 34 |
+
valid_data_path:
|
| 35 |
+
desc: null
|
| 36 |
+
value:
|
| 37 |
+
- '1754785366'
|
| 38 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
| 39 |
+
test_data_path:
|
| 40 |
+
desc: null
|
| 41 |
+
value:
|
| 42 |
+
- '1754785366'
|
| 43 |
+
- /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
|
| 44 |
+
data_cache_path:
|
| 45 |
+
desc: null
|
| 46 |
+
value: null
|
| 47 |
+
vocab_size:
|
| 48 |
+
desc: null
|
| 49 |
+
value: null
|
| 50 |
+
vocab_file:
|
| 51 |
+
desc: null
|
| 52 |
+
value: null
|
| 53 |
+
merge_file:
|
| 54 |
+
desc: null
|
| 55 |
+
value: null
|
| 56 |
+
seq_length:
|
| 57 |
+
desc: null
|
| 58 |
+
value: 2048
|
| 59 |
+
num_workers:
|
| 60 |
+
desc: null
|
| 61 |
+
value: 2
|
| 62 |
+
tokenizer_type:
|
| 63 |
+
desc: null
|
| 64 |
+
value: HFPreTrainedTokenizer
|
| 65 |
+
tokenizer_model:
|
| 66 |
+
desc: null
|
| 67 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
| 68 |
+
reset_position_ids:
|
| 69 |
+
desc: null
|
| 70 |
+
value: false
|
| 71 |
+
reset_attention_mask:
|
| 72 |
+
desc: null
|
| 73 |
+
value: false
|
| 74 |
+
eod_mask_loss:
|
| 75 |
+
desc: null
|
| 76 |
+
value: false
|
| 77 |
+
retro_return_doc_ids:
|
| 78 |
+
desc: null
|
| 79 |
+
value: false
|
| 80 |
+
short_seq_prob:
|
| 81 |
+
desc: null
|
| 82 |
+
value: 0.1
|
| 83 |
+
vocab_extra_ids:
|
| 84 |
+
desc: null
|
| 85 |
+
value: 0
|
| 86 |
+
seed:
|
| 87 |
+
desc: null
|
| 88 |
+
value: 1234
|
| 89 |
+
use_mpi:
|
| 90 |
+
desc: null
|
| 91 |
+
value: false
|
| 92 |
+
wandb_entity:
|
| 93 |
+
desc: null
|
| 94 |
+
value: iwakawa-koichi-q5-tohoku-nlp6723
|
| 95 |
+
wandb_name:
|
| 96 |
+
desc: null
|
| 97 |
+
value: Qwen2-0.5b-0.2_train_2024-08-23-15:44:18
|
| 98 |
+
wandb_project:
|
| 99 |
+
desc: null
|
| 100 |
+
value: llm_tutorial-0.2
|
| 101 |
+
quantization:
|
| 102 |
+
desc: null
|
| 103 |
+
value: false
|
| 104 |
+
use_freeze_layers:
|
| 105 |
+
desc: null
|
| 106 |
+
value: false
|
| 107 |
+
freeze_layers:
|
| 108 |
+
desc: null
|
| 109 |
+
value: null
|
| 110 |
+
bf16:
|
| 111 |
+
desc: null
|
| 112 |
+
value: true
|
| 113 |
+
fp16:
|
| 114 |
+
desc: null
|
| 115 |
+
value: false
|
| 116 |
+
mixed_precision:
|
| 117 |
+
desc: null
|
| 118 |
+
value: true
|
| 119 |
+
param_dtype:
|
| 120 |
+
desc: null
|
| 121 |
+
value: null
|
| 122 |
+
load:
|
| 123 |
+
desc: null
|
| 124 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
| 125 |
+
save:
|
| 126 |
+
desc: null
|
| 127 |
+
value: /work/llm_recipes/models/Qwen2-0.5b-0.2
|
| 128 |
+
base_model:
|
| 129 |
+
desc: null
|
| 130 |
+
value: /share/pretrained_lm/Qwen/Qwen2-0.5B
|
| 131 |
+
use_better_transformer:
|
| 132 |
+
desc: null
|
| 133 |
+
value: false
|
| 134 |
+
grad_clip_norm:
|
| 135 |
+
desc: null
|
| 136 |
+
value: 1.0
|
| 137 |
+
eval_interval:
|
| 138 |
+
desc: null
|
| 139 |
+
value: 10
|
| 140 |
+
save_interval:
|
| 141 |
+
desc: null
|
| 142 |
+
value: 10
|
| 143 |
+
eval_iters:
|
| 144 |
+
desc: null
|
| 145 |
+
value: 10
|
| 146 |
+
optimizer:
|
| 147 |
+
desc: null
|
| 148 |
+
value: anyprecision
|
| 149 |
+
lr:
|
| 150 |
+
desc: null
|
| 151 |
+
value: 2.0e-05
|
| 152 |
+
lr_decay_style:
|
| 153 |
+
desc: null
|
| 154 |
+
value: cosine
|
| 155 |
+
lr_decay_iters:
|
| 156 |
+
desc: null
|
| 157 |
+
value: 7500
|
| 158 |
+
lr_warmup_iters:
|
| 159 |
+
desc: null
|
| 160 |
+
value: 500
|
| 161 |
+
min_lr:
|
| 162 |
+
desc: null
|
| 163 |
+
value: 1.0e-06
|
| 164 |
+
train_iters:
|
| 165 |
+
desc: null
|
| 166 |
+
value: 7500
|
| 167 |
+
train_samples:
|
| 168 |
+
desc: null
|
| 169 |
+
value: null
|
| 170 |
+
global_batch_size:
|
| 171 |
+
desc: null
|
| 172 |
+
value: 320
|
| 173 |
+
micro_batch_size:
|
| 174 |
+
desc: null
|
| 175 |
+
value: 1
|
| 176 |
+
make_vocab_size_divisible_by:
|
| 177 |
+
desc: null
|
| 178 |
+
value: 128
|
| 179 |
+
sliding_window_size:
|
| 180 |
+
desc: null
|
| 181 |
+
value: 4096
|
| 182 |
+
skip_batch:
|
| 183 |
+
desc: null
|
| 184 |
+
value: null
|
| 185 |
+
no_save_optimizer_state:
|
| 186 |
+
desc: null
|
| 187 |
+
value: false
|
| 188 |
+
continual_pretraining:
|
| 189 |
+
desc: null
|
| 190 |
+
value: false
|
| 191 |
+
instruction_tuning:
|
| 192 |
+
desc: null
|
| 193 |
+
value: false
|
| 194 |
+
direct_preference_optimization:
|
| 195 |
+
desc: null
|
| 196 |
+
value: false
|
| 197 |
+
attention_dropout:
|
| 198 |
+
desc: null
|
| 199 |
+
value: 0.1
|
| 200 |
+
hidden_dropout:
|
| 201 |
+
desc: null
|
| 202 |
+
value: 0.1
|
| 203 |
+
weight_decay:
|
| 204 |
+
desc: null
|
| 205 |
+
value: 0.1
|
| 206 |
+
adam_beta1:
|
| 207 |
+
desc: null
|
| 208 |
+
value: 0.9
|
| 209 |
+
adam_beta2:
|
| 210 |
+
desc: null
|
| 211 |
+
value: 0.95
|
| 212 |
+
adam_eps:
|
| 213 |
+
desc: null
|
| 214 |
+
value: 1.0e-06
|
| 215 |
+
hf_transformer_model_dir:
|
| 216 |
+
desc: null
|
| 217 |
+
value: null
|
| 218 |
+
instruction_train_data_path:
|
| 219 |
+
desc: null
|
| 220 |
+
value: null
|
| 221 |
+
instruction_valid_data_path:
|
| 222 |
+
desc: null
|
| 223 |
+
value: null
|
| 224 |
+
epoch:
|
| 225 |
+
desc: null
|
| 226 |
+
value: null
|
| 227 |
+
instruction_dataset_size:
|
| 228 |
+
desc: null
|
| 229 |
+
value: null
|
| 230 |
+
save_sampler_state:
|
| 231 |
+
desc: null
|
| 232 |
+
value: false
|
| 233 |
+
label_smoothing:
|
| 234 |
+
desc: null
|
| 235 |
+
value: 0.0
|
| 236 |
+
save_n_checkpoints:
|
| 237 |
+
desc: null
|
| 238 |
+
value: 10
|
| 239 |
+
hf_repo_id:
|
| 240 |
+
desc: null
|
| 241 |
+
value: koichi12/Qwen2-0.5b-0.2
|
| 242 |
+
create_public_hf_repo:
|
| 243 |
+
desc: null
|
| 244 |
+
value: false
|
| 245 |
+
upload_all_checkpoints_to_hf:
|
| 246 |
+
desc: null
|
| 247 |
+
value: true
|
| 248 |
+
hf_upload_retry_limit:
|
| 249 |
+
desc: null
|
| 250 |
+
value: 2
|
| 251 |
+
exit_duration_in_mins:
|
| 252 |
+
desc: null
|
| 253 |
+
value: null
|
| 254 |
+
source_key:
|
| 255 |
+
desc: null
|
| 256 |
+
value: null
|
| 257 |
+
target_key:
|
| 258 |
+
desc: null
|
| 259 |
+
value: null
|
| 260 |
+
attn_implementation:
|
| 261 |
+
desc: null
|
| 262 |
+
value: flash_attention_2
|
| 263 |
+
efficient_instruction_tuning:
|
| 264 |
+
desc: null
|
| 265 |
+
value: false
|
| 266 |
+
remove_padding_masking:
|
| 267 |
+
desc: null
|
| 268 |
+
value: false
|
| 269 |
+
save_start_iter:
|
| 270 |
+
desc: null
|
| 271 |
+
value: null
|
| 272 |
+
valid_micro_batch_size:
|
| 273 |
+
desc: null
|
| 274 |
+
value: 1
|
| 275 |
+
rank:
|
| 276 |
+
desc: null
|
| 277 |
+
value: 0
|
| 278 |
+
world_size:
|
| 279 |
+
desc: null
|
| 280 |
+
value: 1
|
| 281 |
+
padded_vocab_size:
|
| 282 |
+
desc: null
|
| 283 |
+
value: 151680
|
| 284 |
+
gradient_accumulation_steps:
|
| 285 |
+
desc: null
|
| 286 |
+
value: 320
|
| 287 |
+
_wandb:
|
| 288 |
+
desc: null
|
| 289 |
+
value:
|
| 290 |
+
python_version: 3.10.12
|
| 291 |
+
cli_version: 0.16.3
|
| 292 |
+
framework: huggingface
|
| 293 |
+
huggingface_version: 4.43.3
|
| 294 |
+
is_jupyter_run: false
|
| 295 |
+
is_kaggle_kernel: false
|
| 296 |
+
start_time: 1724395488.891619
|
| 297 |
+
t:
|
| 298 |
+
1:
|
| 299 |
+
- 1
|
| 300 |
+
- 11
|
| 301 |
+
- 49
|
| 302 |
+
- 55
|
| 303 |
+
- 71
|
| 304 |
+
- 105
|
| 305 |
+
2:
|
| 306 |
+
- 1
|
| 307 |
+
- 11
|
| 308 |
+
- 49
|
| 309 |
+
- 55
|
| 310 |
+
- 71
|
| 311 |
+
- 105
|
| 312 |
+
3:
|
| 313 |
+
- 13
|
| 314 |
+
- 16
|
| 315 |
+
- 23
|
| 316 |
+
4: 3.10.12
|
| 317 |
+
5: 0.16.3
|
| 318 |
+
6: 4.43.3
|
| 319 |
+
8:
|
| 320 |
+
- 5
|
| 321 |
+
13: linux-x86_64
|
wandb/run-20240823_154448-v9m85jnt/files/output.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
|
| 2 |
+
Clearing GPU cache for all ranks
|
| 3 |
+
--> Running with torch torch_distributed debug set to detail
|
| 4 |
+
File not found: /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
|
| 5 |
+
Unable to read latest iteration from /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
|
| 6 |
+
File not found: /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
|
| 7 |
+
Unable to read latest iteration from /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
|
| 8 |
+
Traceback (most recent call last):
|
| 9 |
+
File "/project/examples/finetuning.py", line 13, in <module>
|
| 10 |
+
main()
|
| 11 |
+
File "/project/src/llama_recipes/finetuning.py", line 103, in main
|
| 12 |
+
model = get_model(
|
| 13 |
+
File "/project/src/llama_recipes/get_models.py", line 106, in get_model
|
| 14 |
+
assert sliding_window == 131072
|
| 15 |
+
AssertionError
|
wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
accelerate==0.23.0
|
| 3 |
+
aiohttp==3.9.1
|
| 4 |
+
aiosignal==1.3.1
|
| 5 |
+
annotated-types==0.6.0
|
| 6 |
+
antlr4-python3-runtime==4.9.3
|
| 7 |
+
anyio==4.4.0
|
| 8 |
+
apex==0.1
|
| 9 |
+
appdirs==1.4.4
|
| 10 |
+
argon2-cffi-bindings==21.2.0
|
| 11 |
+
argon2-cffi==23.1.0
|
| 12 |
+
astroid==3.2.4
|
| 13 |
+
asttokens==2.4.1
|
| 14 |
+
astunparse==1.6.3
|
| 15 |
+
async-timeout==4.0.3
|
| 16 |
+
attrs==23.2.0
|
| 17 |
+
audioread==3.0.1
|
| 18 |
+
beautifulsoup4==4.12.3
|
| 19 |
+
bert-score==0.3.13
|
| 20 |
+
bleach==6.1.0
|
| 21 |
+
blis==0.7.11
|
| 22 |
+
build==1.2.1
|
| 23 |
+
cachecontrol==0.14.0
|
| 24 |
+
cachetools==5.3.2
|
| 25 |
+
catalogue==2.0.10
|
| 26 |
+
certifi==2024.2.2
|
| 27 |
+
cffi==1.16.0
|
| 28 |
+
chardet==5.2.0
|
| 29 |
+
charset-normalizer==3.3.2
|
| 30 |
+
cleo==2.1.0
|
| 31 |
+
click==8.1.7
|
| 32 |
+
cloudpathlib==0.16.0
|
| 33 |
+
cloudpickle==3.0.0
|
| 34 |
+
cmake==3.28.1
|
| 35 |
+
colorama==0.4.6
|
| 36 |
+
comm==0.2.1
|
| 37 |
+
confection==0.1.4
|
| 38 |
+
contourpy==1.2.0
|
| 39 |
+
cramjam==2.8.3
|
| 40 |
+
crashtest==0.4.1
|
| 41 |
+
cryptography==43.0.0
|
| 42 |
+
cubinlinker==0.3.0+2.g405ac64
|
| 43 |
+
cuda-python==12.3.0rc4+9.gdb8c48a.dirty
|
| 44 |
+
cudf==23.12.0
|
| 45 |
+
cugraph-dgl==23.12.0
|
| 46 |
+
cugraph-service-client==23.12.0
|
| 47 |
+
cugraph-service-server==23.12.0
|
| 48 |
+
cugraph==23.12.0
|
| 49 |
+
cuml==23.12.0
|
| 50 |
+
cupy-cuda12x==12.3.0
|
| 51 |
+
cycler==0.12.1
|
| 52 |
+
cymem==2.0.8
|
| 53 |
+
cython==3.0.8
|
| 54 |
+
dask-cuda==23.12.0
|
| 55 |
+
dask-cudf==23.12.0
|
| 56 |
+
dask==2023.11.0
|
| 57 |
+
dataclasses-json==0.6.7
|
| 58 |
+
dataproperty==1.0.1
|
| 59 |
+
datasets==2.20.0
|
| 60 |
+
debugpy==1.8.1
|
| 61 |
+
decorator==5.1.1
|
| 62 |
+
defusedxml==0.7.1
|
| 63 |
+
dill==0.3.8
|
| 64 |
+
distlib==0.3.8
|
| 65 |
+
distributed==2023.11.0
|
| 66 |
+
distro==1.9.0
|
| 67 |
+
dm-tree==0.1.8
|
| 68 |
+
docker-pycreds==0.4.0
|
| 69 |
+
dulwich==0.21.7
|
| 70 |
+
einops==0.7.0
|
| 71 |
+
emoji==2.12.1
|
| 72 |
+
entmax==1.3
|
| 73 |
+
evaluate==0.4.2
|
| 74 |
+
exceptiongroup==1.2.0
|
| 75 |
+
execnet==2.0.2
|
| 76 |
+
executing==2.0.1
|
| 77 |
+
expecttest==0.1.3
|
| 78 |
+
fastjsonschema==2.19.1
|
| 79 |
+
fastparquet==2023.10.1
|
| 80 |
+
fastrlock==0.8.2
|
| 81 |
+
filelock==3.13.1
|
| 82 |
+
flash-attn==2.4.2
|
| 83 |
+
fonttools==4.48.1
|
| 84 |
+
frozenlist==1.4.1
|
| 85 |
+
fsspec==2023.12.2
|
| 86 |
+
fugashi==1.3.2
|
| 87 |
+
fuzzywuzzy==0.18.0
|
| 88 |
+
gast==0.5.4
|
| 89 |
+
gitdb==4.0.11
|
| 90 |
+
gitpython==3.1.43
|
| 91 |
+
google-auth-oauthlib==0.4.6
|
| 92 |
+
google-auth==2.27.0
|
| 93 |
+
graphsurgeon==0.4.6
|
| 94 |
+
greenlet==3.0.3
|
| 95 |
+
grpcio==1.60.1
|
| 96 |
+
h11==0.14.0
|
| 97 |
+
httpcore==1.0.5
|
| 98 |
+
httpx==0.27.0
|
| 99 |
+
huggingface-hub==0.24.5
|
| 100 |
+
hydra-core==1.3.2
|
| 101 |
+
hypothesis==5.35.1
|
| 102 |
+
idna==3.6
|
| 103 |
+
importlib-metadata==7.0.1
|
| 104 |
+
iniconfig==2.0.0
|
| 105 |
+
installer==0.7.0
|
| 106 |
+
intel-openmp==2021.4.0
|
| 107 |
+
ipadic==1.0.0
|
| 108 |
+
ipykernel==6.29.2
|
| 109 |
+
ipython-genutils==0.2.0
|
| 110 |
+
ipython==8.21.0
|
| 111 |
+
isort==5.13.2
|
| 112 |
+
jaraco.classes==3.4.0
|
| 113 |
+
jedi==0.19.1
|
| 114 |
+
jeepney==0.8.0
|
| 115 |
+
jinja2==3.1.3
|
| 116 |
+
jiter==0.5.0
|
| 117 |
+
joblib==1.3.2
|
| 118 |
+
json5==0.9.14
|
| 119 |
+
jsonargparse==3.13.1
|
| 120 |
+
jsonlines==4.0.0
|
| 121 |
+
jsonnet==0.19.1
|
| 122 |
+
jsonpatch==1.33
|
| 123 |
+
jsonpointer==3.0.0
|
| 124 |
+
jsonschema-specifications==2023.12.1
|
| 125 |
+
jsonschema==4.21.1
|
| 126 |
+
jupyter-client==8.6.0
|
| 127 |
+
jupyter-core==5.7.1
|
| 128 |
+
jupyter-tensorboard==0.2.0
|
| 129 |
+
jupyterlab-pygments==0.3.0
|
| 130 |
+
jupyterlab-server==1.2.0
|
| 131 |
+
jupyterlab==2.3.2
|
| 132 |
+
jupytext==1.16.1
|
| 133 |
+
keyring==24.3.1
|
| 134 |
+
kiwisolver==1.4.5
|
| 135 |
+
langchain-community==0.2.12
|
| 136 |
+
langchain-core==0.2.31
|
| 137 |
+
langchain-huggingface==0.0.2
|
| 138 |
+
langchain-openai==0.1.21
|
| 139 |
+
langchain-text-splitters==0.2.2
|
| 140 |
+
langchain==0.2.13
|
| 141 |
+
langcodes==3.3.0
|
| 142 |
+
langsmith==0.1.99
|
| 143 |
+
lazy-loader==0.3
|
| 144 |
+
levenshtein==0.25.1
|
| 145 |
+
librosa==0.10.1
|
| 146 |
+
lightning-utilities==0.11.6
|
| 147 |
+
llm-jp-eval==1.4.0
|
| 148 |
+
llvmlite==0.40.1
|
| 149 |
+
lm-eval==0.3.0
|
| 150 |
+
locket==1.0.0
|
| 151 |
+
logzero==1.7.0
|
| 152 |
+
lxml==5.2.2
|
| 153 |
+
markdown-it-py==3.0.0
|
| 154 |
+
markdown==3.5.2
|
| 155 |
+
markupsafe==2.1.4
|
| 156 |
+
marshmallow==3.21.3
|
| 157 |
+
matplotlib-inline==0.1.6
|
| 158 |
+
matplotlib==3.8.2
|
| 159 |
+
mbstrdecoder==1.1.3
|
| 160 |
+
mccabe==0.7.0
|
| 161 |
+
mdit-py-plugins==0.4.0
|
| 162 |
+
mdurl==0.1.2
|
| 163 |
+
mecab-python3==1.0.6
|
| 164 |
+
mistune==3.0.2
|
| 165 |
+
mkl-devel==2021.1.1
|
| 166 |
+
mkl-include==2021.1.1
|
| 167 |
+
mkl==2021.1.1
|
| 168 |
+
mock==5.1.0
|
| 169 |
+
mojimoji==0.0.13
|
| 170 |
+
more-itertools==9.1.0
|
| 171 |
+
mpmath==1.3.0
|
| 172 |
+
msgpack==1.0.7
|
| 173 |
+
multidict==6.0.4
|
| 174 |
+
multiprocess==0.70.16
|
| 175 |
+
murmurhash==1.0.10
|
| 176 |
+
mypy-extensions==1.0.0
|
| 177 |
+
nbclient==0.9.0
|
| 178 |
+
nbconvert==7.16.0
|
| 179 |
+
nbformat==5.9.2
|
| 180 |
+
neologdn==0.5.3
|
| 181 |
+
nest-asyncio==1.6.0
|
| 182 |
+
networkx==2.6.3
|
| 183 |
+
ninja==1.11.1.1
|
| 184 |
+
nltk==3.8.1
|
| 185 |
+
notebook==6.4.10
|
| 186 |
+
numba==0.57.1+1.g1ff679645
|
| 187 |
+
numexpr==2.10.1
|
| 188 |
+
numpy==1.24.4
|
| 189 |
+
nvfuser==0.1.4a0+d0bb811
|
| 190 |
+
nvidia-dali-cuda120==1.34.0
|
| 191 |
+
nvidia-pyindex==1.0.9
|
| 192 |
+
nvtx==0.2.5
|
| 193 |
+
oauthlib==3.2.2
|
| 194 |
+
omegaconf==2.3.0
|
| 195 |
+
onnx==1.15.0rc2
|
| 196 |
+
openai==1.40.6
|
| 197 |
+
opencv==4.7.0
|
| 198 |
+
optree==0.10.0
|
| 199 |
+
orjson==3.10.7
|
| 200 |
+
packaging==23.2
|
| 201 |
+
pandas==2.2.2
|
| 202 |
+
pandocfilters==1.5.1
|
| 203 |
+
parso==0.8.3
|
| 204 |
+
partd==1.4.1
|
| 205 |
+
pathvalidate==3.2.0
|
| 206 |
+
peft==0.5.0
|
| 207 |
+
pexpect==4.9.0
|
| 208 |
+
pillow==10.2.0
|
| 209 |
+
pip==24.0
|
| 210 |
+
pkginfo==1.11.1
|
| 211 |
+
plac==1.4.3
|
| 212 |
+
platformdirs==4.2.0
|
| 213 |
+
pluggy==1.4.0
|
| 214 |
+
ply==3.11
|
| 215 |
+
poetry-core==1.9.0
|
| 216 |
+
poetry-plugin-export==1.8.0
|
| 217 |
+
poetry==1.8.3
|
| 218 |
+
polygraphy==0.49.4
|
| 219 |
+
pooch==1.8.0
|
| 220 |
+
portalocker==2.10.1
|
| 221 |
+
preshed==3.0.9
|
| 222 |
+
prettytable==3.9.0
|
| 223 |
+
prometheus-client==0.19.0
|
| 224 |
+
prompt-toolkit==3.0.43
|
| 225 |
+
protobuf==4.24.4
|
| 226 |
+
psutil==5.9.4
|
| 227 |
+
ptxcompiler==0.8.1+2.g0d406d6
|
| 228 |
+
ptyprocess==0.7.0
|
| 229 |
+
pure-eval==0.2.2
|
| 230 |
+
pyarrow-hotfix==0.6
|
| 231 |
+
pyarrow==15.0.2
|
| 232 |
+
pyasn1-modules==0.3.0
|
| 233 |
+
pyasn1==0.5.1
|
| 234 |
+
pybind11-global==2.11.1
|
| 235 |
+
pybind11==2.11.1
|
| 236 |
+
pycocotools==2.0+nv0.8.0
|
| 237 |
+
pycountry==24.6.1
|
| 238 |
+
pycparser==2.21
|
| 239 |
+
pydantic-core==2.16.2
|
| 240 |
+
pydantic==2.6.1
|
| 241 |
+
pygments==2.17.2
|
| 242 |
+
pylibcugraph==23.12.0
|
| 243 |
+
pylibcugraphops==23.12.0
|
| 244 |
+
pylibraft==23.12.0
|
| 245 |
+
pylint==3.2.6
|
| 246 |
+
pynvml==11.4.1
|
| 247 |
+
pyparsing==3.1.1
|
| 248 |
+
pyproject-hooks==1.1.0
|
| 249 |
+
pytablewriter==1.2.0
|
| 250 |
+
pytest-flakefinder==1.1.0
|
| 251 |
+
pytest-rerunfailures==13.0
|
| 252 |
+
pytest-shard==0.1.2
|
| 253 |
+
pytest-xdist==3.5.0
|
| 254 |
+
pytest==8.0.0
|
| 255 |
+
python-dateutil==2.8.2
|
| 256 |
+
python-dotenv==1.0.0
|
| 257 |
+
python-hostlist==1.23.0
|
| 258 |
+
python-levenshtein==0.25.1
|
| 259 |
+
pytorch-lightning==2.4.0
|
| 260 |
+
pytorch-quantization==2.1.2
|
| 261 |
+
pytz==2023.3.post1
|
| 262 |
+
pyyaml==6.0.1
|
| 263 |
+
pyzmq==25.1.2
|
| 264 |
+
raft-dask==23.12.0
|
| 265 |
+
rapidfuzz==3.9.6
|
| 266 |
+
rapids-dask-dependency==23.12.1
|
| 267 |
+
referencing==0.33.0
|
| 268 |
+
regex==2023.12.25
|
| 269 |
+
requests-oauthlib==1.3.1
|
| 270 |
+
requests-toolbelt==1.0.0
|
| 271 |
+
requests==2.32.3
|
| 272 |
+
rhoknp==1.7.0
|
| 273 |
+
rich==13.7.0
|
| 274 |
+
rmm==23.12.0
|
| 275 |
+
rouge-score==0.1.2
|
| 276 |
+
rpds-py==0.17.1
|
| 277 |
+
rsa==4.9
|
| 278 |
+
sacrebleu==2.4.2
|
| 279 |
+
safetensors==0.4.3
|
| 280 |
+
scikit-learn==1.5.1
|
| 281 |
+
scipy==1.12.0
|
| 282 |
+
secretstorage==3.3.3
|
| 283 |
+
send2trash==1.8.2
|
| 284 |
+
sentence-transformers==3.0.1
|
| 285 |
+
sentencepiece==0.1.99
|
| 286 |
+
sentry-sdk==2.12.0
|
| 287 |
+
setproctitle==1.3.3
|
| 288 |
+
setuptools==68.2.2
|
| 289 |
+
shellingham==1.5.4
|
| 290 |
+
six==1.16.0
|
| 291 |
+
smart-open==6.4.0
|
| 292 |
+
smmap==5.0.1
|
| 293 |
+
sniffio==1.3.1
|
| 294 |
+
sortedcontainers==2.4.0
|
| 295 |
+
soundfile==0.12.1
|
| 296 |
+
soupsieve==2.5
|
| 297 |
+
soxr==0.3.7
|
| 298 |
+
spacy-legacy==3.0.12
|
| 299 |
+
spacy-loggers==1.0.5
|
| 300 |
+
spacy==3.7.2
|
| 301 |
+
sphinx-glpi-theme==0.6
|
| 302 |
+
sqlalchemy==2.0.32
|
| 303 |
+
sqlitedict==2.1.0
|
| 304 |
+
srsly==2.4.8
|
| 305 |
+
stack-data==0.6.3
|
| 306 |
+
sumeval==0.2.2
|
| 307 |
+
sympy==1.12
|
| 308 |
+
tabledata==1.3.3
|
| 309 |
+
tabulate==0.9.0
|
| 310 |
+
tbb==2021.11.0
|
| 311 |
+
tblib==3.0.0
|
| 312 |
+
tcolorpy==0.1.6
|
| 313 |
+
tenacity==8.5.0
|
| 314 |
+
tensorboard-data-server==0.6.1
|
| 315 |
+
tensorboard-plugin-wit==1.8.1
|
| 316 |
+
tensorboard==2.9.0
|
| 317 |
+
tensorrt==8.6.3
|
| 318 |
+
terminado==0.18.0
|
| 319 |
+
termplotlib==0.3.9
|
| 320 |
+
text-generation==0.7.0
|
| 321 |
+
thinc==8.2.3
|
| 322 |
+
threadpoolctl==3.2.0
|
| 323 |
+
thriftpy2==0.4.17
|
| 324 |
+
tiktoken==0.7.0
|
| 325 |
+
tinycss2==1.2.1
|
| 326 |
+
tokenizers==0.19.1
|
| 327 |
+
toml==0.10.2
|
| 328 |
+
tomli==2.0.1
|
| 329 |
+
tomlkit==0.13.2
|
| 330 |
+
toolz==0.12.1
|
| 331 |
+
torch-tensorrt==2.3.0a0
|
| 332 |
+
torch==2.3.0a0+ebedce2
|
| 333 |
+
torchdata==0.7.1a0
|
| 334 |
+
torchmetrics==0.10.3
|
| 335 |
+
torchtext==0.17.0a0
|
| 336 |
+
torchvision==0.18.0a0
|
| 337 |
+
tornado==6.4
|
| 338 |
+
tqdm-multiprocess==0.0.11
|
| 339 |
+
tqdm==4.66.5
|
| 340 |
+
traitlets==5.9.0
|
| 341 |
+
transformer-engine==1.3.0+5b90b7f
|
| 342 |
+
transformers==4.43.3
|
| 343 |
+
treelite-runtime==3.9.1
|
| 344 |
+
treelite==3.9.1
|
| 345 |
+
triton==2.2.0+e28a256
|
| 346 |
+
trove-classifiers==2024.7.2
|
| 347 |
+
typepy==1.3.2
|
| 348 |
+
typer==0.9.0
|
| 349 |
+
types-dataclasses==0.6.6
|
| 350 |
+
typing-extensions==4.12.2
|
| 351 |
+
typing-inspect==0.9.0
|
| 352 |
+
tzdata==2024.1
|
| 353 |
+
ucx-py==0.35.0
|
| 354 |
+
uff==0.6.9
|
| 355 |
+
ujson==5.8.0
|
| 356 |
+
unbabel-comet==2.2.2
|
| 357 |
+
unidic-lite==1.0.8
|
| 358 |
+
urllib3==1.26.18
|
| 359 |
+
virtualenv==20.26.3
|
| 360 |
+
wandb==0.16.3
|
| 361 |
+
wasabi==1.1.2
|
| 362 |
+
wcwidth==0.2.13
|
| 363 |
+
weasel==0.3.4
|
| 364 |
+
webencodings==0.5.1
|
| 365 |
+
werkzeug==3.0.1
|
| 366 |
+
wheel==0.42.0
|
| 367 |
+
word2number==1.1
|
| 368 |
+
xdoctest==1.0.2
|
| 369 |
+
xgboost==1.7.6
|
| 370 |
+
xmltodict==0.13.0
|
| 371 |
+
xxhash==3.4.1
|
| 372 |
+
yarl==1.9.4
|
| 373 |
+
zict==3.0.0
|
| 374 |
+
zipp==3.17.0
|
| 375 |
+
zstandard==0.23.0
|
wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "3.10.12",
|
| 4 |
+
"heartbeatAt": "2024-08-23T06:44:49.486428",
|
| 5 |
+
"startedAt": "2024-08-23T06:44:48.878270",
|
| 6 |
+
"docker": null,
|
| 7 |
+
"cuda": null,
|
| 8 |
+
"args": [
|
| 9 |
+
"--seq-length",
|
| 10 |
+
"2048",
|
| 11 |
+
"--sliding-window-size",
|
| 12 |
+
"4096",
|
| 13 |
+
"--micro-batch-size",
|
| 14 |
+
"1",
|
| 15 |
+
"--valid_micro_batch_size",
|
| 16 |
+
"1",
|
| 17 |
+
"--global-batch-size",
|
| 18 |
+
"320",
|
| 19 |
+
"--train-iters",
|
| 20 |
+
"7500",
|
| 21 |
+
"--tokenizer-type",
|
| 22 |
+
"HFPreTrainedTokenizer",
|
| 23 |
+
"--tokenizer-model",
|
| 24 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
| 25 |
+
"--train-data-path",
|
| 26 |
+
"1754785366",
|
| 27 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
| 28 |
+
"28623823675",
|
| 29 |
+
"/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
|
| 30 |
+
"--valid-data-path",
|
| 31 |
+
"1754785366",
|
| 32 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
| 33 |
+
"--test-data-path",
|
| 34 |
+
"1754785366",
|
| 35 |
+
"/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
|
| 36 |
+
"--lr",
|
| 37 |
+
"2e-5",
|
| 38 |
+
"--min-lr",
|
| 39 |
+
"1e-6",
|
| 40 |
+
"--lr-decay-style",
|
| 41 |
+
"cosine",
|
| 42 |
+
"--lr-warmup-iters",
|
| 43 |
+
"500",
|
| 44 |
+
"--lr-decay-iters",
|
| 45 |
+
"7500",
|
| 46 |
+
"--weight-decay",
|
| 47 |
+
"0.1",
|
| 48 |
+
"--grad-clip-norm",
|
| 49 |
+
"1.0",
|
| 50 |
+
"--optimizer",
|
| 51 |
+
"anyprecision",
|
| 52 |
+
"--adam-beta1",
|
| 53 |
+
"0.9",
|
| 54 |
+
"--adam-beta2",
|
| 55 |
+
"0.95",
|
| 56 |
+
"--adam-eps",
|
| 57 |
+
"1e-6",
|
| 58 |
+
"--save-interval",
|
| 59 |
+
"10",
|
| 60 |
+
"--eval-interval",
|
| 61 |
+
"10",
|
| 62 |
+
"--eval-iters",
|
| 63 |
+
"10",
|
| 64 |
+
"--bf16",
|
| 65 |
+
"--mixed-precision",
|
| 66 |
+
"--base-model",
|
| 67 |
+
"/share/pretrained_lm/Qwen/Qwen2-0.5B",
|
| 68 |
+
"--save",
|
| 69 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
| 70 |
+
"--load",
|
| 71 |
+
"/work/llm_recipes/models/Qwen2-0.5b-0.2",
|
| 72 |
+
"--fsdp-activation-checkpointing",
|
| 73 |
+
"--sharding-strategy",
|
| 74 |
+
"FULL_SHARD",
|
| 75 |
+
"--checkpoint-type",
|
| 76 |
+
"LOCAL_STATE_DICT",
|
| 77 |
+
"--save-n-checkpoints",
|
| 78 |
+
"10",
|
| 79 |
+
"--upload-all-checkpoints-to-hf",
|
| 80 |
+
"--hf-upload-retry-limit",
|
| 81 |
+
"2",
|
| 82 |
+
"--hf-repo-id",
|
| 83 |
+
"koichi12/Qwen2-0.5b-0.2",
|
| 84 |
+
"--wandb-entity",
|
| 85 |
+
"iwakawa-koichi-q5-tohoku-nlp6723",
|
| 86 |
+
"--wandb-project",
|
| 87 |
+
"llm_tutorial-0.2",
|
| 88 |
+
"--wandb-name",
|
| 89 |
+
"Qwen2-0.5b-0.2_train_2024-08-23-15:44:18"
|
| 90 |
+
],
|
| 91 |
+
"state": "running",
|
| 92 |
+
"program": "/project/examples/finetuning.py",
|
| 93 |
+
"codePathLocal": "examples/finetuning.py",
|
| 94 |
+
"codePath": "examples/finetuning.py",
|
| 95 |
+
"git": {
|
| 96 |
+
"remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
|
| 97 |
+
"commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
|
| 98 |
+
},
|
| 99 |
+
"email": null,
|
| 100 |
+
"root": "/project",
|
| 101 |
+
"host": "gpu-koiwa-00",
|
| 102 |
+
"username": "koiwa",
|
| 103 |
+
"executable": "/usr/bin/python",
|
| 104 |
+
"cpu_count": 18,
|
| 105 |
+
"cpu_count_logical": 18,
|
| 106 |
+
"cpu_freq": {
|
| 107 |
+
"current": 2400.0389999999993,
|
| 108 |
+
"min": 0.0,
|
| 109 |
+
"max": 0.0
|
| 110 |
+
},
|
| 111 |
+
"cpu_freq_per_core": [
|
| 112 |
+
{
|
| 113 |
+
"current": 2400.039,
|
| 114 |
+
"min": 0.0,
|
| 115 |
+
"max": 0.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"current": 2400.039,
|
| 119 |
+
"min": 0.0,
|
| 120 |
+
"max": 0.0
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"current": 2400.039,
|
| 124 |
+
"min": 0.0,
|
| 125 |
+
"max": 0.0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"current": 2400.039,
|
| 129 |
+
"min": 0.0,
|
| 130 |
+
"max": 0.0
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"current": 2400.039,
|
| 134 |
+
"min": 0.0,
|
| 135 |
+
"max": 0.0
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"current": 2400.039,
|
| 139 |
+
"min": 0.0,
|
| 140 |
+
"max": 0.0
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"current": 2400.039,
|
| 144 |
+
"min": 0.0,
|
| 145 |
+
"max": 0.0
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"current": 2400.039,
|
| 149 |
+
"min": 0.0,
|
| 150 |
+
"max": 0.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"current": 2400.039,
|
| 154 |
+
"min": 0.0,
|
| 155 |
+
"max": 0.0
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"current": 2400.039,
|
| 159 |
+
"min": 0.0,
|
| 160 |
+
"max": 0.0
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"current": 2400.039,
|
| 164 |
+
"min": 0.0,
|
| 165 |
+
"max": 0.0
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"current": 2400.039,
|
| 169 |
+
"min": 0.0,
|
| 170 |
+
"max": 0.0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"current": 2400.039,
|
| 174 |
+
"min": 0.0,
|
| 175 |
+
"max": 0.0
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"current": 2400.039,
|
| 179 |
+
"min": 0.0,
|
| 180 |
+
"max": 0.0
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"current": 2400.039,
|
| 184 |
+
"min": 0.0,
|
| 185 |
+
"max": 0.0
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"current": 2400.039,
|
| 189 |
+
"min": 0.0,
|
| 190 |
+
"max": 0.0
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"current": 2400.039,
|
| 194 |
+
"min": 0.0,
|
| 195 |
+
"max": 0.0
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"current": 2400.039,
|
| 199 |
+
"min": 0.0,
|
| 200 |
+
"max": 0.0
|
| 201 |
+
}
|
| 202 |
+
],
|
| 203 |
+
"disk": {
|
| 204 |
+
"/": {
|
| 205 |
+
"total": 0.0625,
|
| 206 |
+
"used": 1.1444091796875e-05
|
| 207 |
+
}
|
| 208 |
+
},
|
| 209 |
+
"gpu": "NVIDIA A100-SXM4-40GB",
|
| 210 |
+
"gpu_count": 1,
|
| 211 |
+
"gpu_devices": [
|
| 212 |
+
{
|
| 213 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
| 214 |
+
"memory_total": 42949672960
|
| 215 |
+
}
|
| 216 |
+
],
|
| 217 |
+
"memory": {
|
| 218 |
+
"total": 56.487831115722656
|
| 219 |
+
}
|
| 220 |
+
}
|
wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb": {"runtime": 1}}
|
wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-23 15:44:48,892 INFO StreamThr :10032 [internal.py:wandb_internal():86] W&B internal server running at pid: 10032, started at: 2024-08-23 15:44:48.891774
|
| 2 |
+
2024-08-23 15:44:48,893 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: status
|
| 3 |
+
2024-08-23 15:44:48,896 INFO WriterThread:10032 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
|
| 4 |
+
2024-08-23 15:44:48,897 DEBUG SenderThread:10032 [sender.py:send():382] send: header
|
| 5 |
+
2024-08-23 15:44:48,913 DEBUG SenderThread:10032 [sender.py:send():382] send: run
|
| 6 |
+
2024-08-23 15:44:49,390 INFO SenderThread:10032 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_154448-v9m85jnt/files
|
| 7 |
+
2024-08-23 15:44:49,390 INFO SenderThread:10032 [sender.py:_start_run_threads():1136] run started: v9m85jnt with start time 1724395488.891619
|
| 8 |
+
2024-08-23 15:44:49,395 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: check_version
|
| 9 |
+
2024-08-23 15:44:49,396 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: check_version
|
| 10 |
+
2024-08-23 15:44:49,467 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: run_start
|
| 11 |
+
2024-08-23 15:44:49,473 DEBUG HandlerThread:10032 [system_info.py:__init__():27] System info init
|
| 12 |
+
2024-08-23 15:44:49,474 DEBUG HandlerThread:10032 [system_info.py:__init__():42] System info init done
|
| 13 |
+
2024-08-23 15:44:49,474 INFO HandlerThread:10032 [system_monitor.py:start():194] Starting system monitor
|
| 14 |
+
2024-08-23 15:44:49,474 INFO SystemMonitor:10032 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
| 15 |
+
2024-08-23 15:44:49,474 INFO HandlerThread:10032 [system_monitor.py:probe():214] Collecting system info
|
| 16 |
+
2024-08-23 15:44:49,474 INFO SystemMonitor:10032 [interfaces.py:start():190] Started cpu monitoring
|
| 17 |
+
2024-08-23 15:44:49,475 INFO SystemMonitor:10032 [interfaces.py:start():190] Started disk monitoring
|
| 18 |
+
2024-08-23 15:44:49,475 INFO SystemMonitor:10032 [interfaces.py:start():190] Started gpu monitoring
|
| 19 |
+
2024-08-23 15:44:49,475 INFO SystemMonitor:10032 [interfaces.py:start():190] Started memory monitoring
|
| 20 |
+
2024-08-23 15:44:49,476 INFO SystemMonitor:10032 [interfaces.py:start():190] Started network monitoring
|
| 21 |
+
2024-08-23 15:44:49,486 DEBUG HandlerThread:10032 [system_info.py:probe():151] Probing system
|
| 22 |
+
2024-08-23 15:44:49,488 DEBUG HandlerThread:10032 [system_info.py:_probe_git():136] Probing git
|
| 23 |
+
2024-08-23 15:44:49,500 DEBUG HandlerThread:10032 [system_info.py:_probe_git():144] Probing git done
|
| 24 |
+
2024-08-23 15:44:49,500 DEBUG HandlerThread:10032 [system_info.py:probe():199] Probing system done
|
| 25 |
+
2024-08-23 15:44:49,500 DEBUG HandlerThread:10032 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T06:44:49.486428', 'startedAt': '2024-08-23T06:44:48.878270', 'docker': None, 'cuda': None, 'args': ('--seq-length', '2048', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--valid_micro_batch_size', '1', '--global-batch-size', '320', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-15:44:18'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
|
| 26 |
+
2024-08-23 15:44:49,500 INFO HandlerThread:10032 [system_monitor.py:probe():224] Finished collecting system info
|
| 27 |
+
2024-08-23 15:44:49,500 INFO HandlerThread:10032 [system_monitor.py:probe():227] Publishing system info
|
| 28 |
+
2024-08-23 15:44:49,502 INFO HandlerThread:10032 [system_monitor.py:probe():229] Finished publishing system info
|
| 29 |
+
2024-08-23 15:44:49,528 DEBUG SenderThread:10032 [sender.py:send():382] send: files
|
| 30 |
+
2024-08-23 15:44:49,529 INFO SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
|
| 31 |
+
2024-08-23 15:44:49,540 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: python_packages
|
| 32 |
+
2024-08-23 15:44:49,540 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: stop_status
|
| 33 |
+
2024-08-23 15:44:49,540 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: internal_messages
|
| 34 |
+
2024-08-23 15:44:49,541 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: python_packages
|
| 35 |
+
2024-08-23 15:44:49,543 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: stop_status
|
| 36 |
+
2024-08-23 15:44:49,740 DEBUG SenderThread:10032 [sender.py:send():382] send: telemetry
|
| 37 |
+
2024-08-23 15:44:50,157 INFO wandb-upload_0:10032 [upload_job.py:push():131] Uploaded file /tmp/tmp_akktvpmwandb/xbudf9th-wandb-metadata.json
|
| 38 |
+
2024-08-23 15:44:50,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json
|
| 39 |
+
2024-08-23 15:44:50,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
|
| 40 |
+
2024-08-23 15:44:50,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
|
| 41 |
+
2024-08-23 15:44:50,729 DEBUG SenderThread:10032 [sender.py:send():382] send: exit
|
| 42 |
+
2024-08-23 15:44:50,729 INFO SenderThread:10032 [sender.py:send_exit():589] handling exit code: 1
|
| 43 |
+
2024-08-23 15:44:50,730 INFO SenderThread:10032 [sender.py:send_exit():591] handling runtime: 1
|
| 44 |
+
2024-08-23 15:44:50,731 INFO SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 45 |
+
2024-08-23 15:44:50,731 INFO SenderThread:10032 [sender.py:send_exit():597] send defer
|
| 46 |
+
2024-08-23 15:44:50,731 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 47 |
+
2024-08-23 15:44:50,731 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 0
|
| 48 |
+
2024-08-23 15:44:50,731 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 49 |
+
2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 0
|
| 50 |
+
2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 1
|
| 51 |
+
2024-08-23 15:44:50,732 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 52 |
+
2024-08-23 15:44:50,732 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 1
|
| 53 |
+
2024-08-23 15:44:50,732 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 54 |
+
2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 1
|
| 55 |
+
2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 2
|
| 56 |
+
2024-08-23 15:44:50,732 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 57 |
+
2024-08-23 15:44:50,732 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 2
|
| 58 |
+
2024-08-23 15:44:50,732 INFO HandlerThread:10032 [system_monitor.py:finish():203] Stopping system monitor
|
| 59 |
+
2024-08-23 15:44:50,732 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined cpu monitor
|
| 60 |
+
2024-08-23 15:44:50,733 DEBUG SystemMonitor:10032 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
| 61 |
+
2024-08-23 15:44:50,733 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined disk monitor
|
| 62 |
+
2024-08-23 15:44:50,733 DEBUG SystemMonitor:10032 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
| 63 |
+
2024-08-23 15:44:50,733 DEBUG SystemMonitor:10032 [system_monitor.py:_start():183] Publishing last batch of metrics
|
| 64 |
+
2024-08-23 15:44:50,765 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined gpu monitor
|
| 65 |
+
2024-08-23 15:44:50,765 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined memory monitor
|
| 66 |
+
2024-08-23 15:44:50,765 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined network monitor
|
| 67 |
+
2024-08-23 15:44:50,766 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 68 |
+
2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 2
|
| 69 |
+
2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 3
|
| 70 |
+
2024-08-23 15:44:50,766 DEBUG SenderThread:10032 [sender.py:send():382] send: stats
|
| 71 |
+
2024-08-23 15:44:50,766 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 72 |
+
2024-08-23 15:44:50,766 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 3
|
| 73 |
+
2024-08-23 15:44:50,766 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 74 |
+
2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 3
|
| 75 |
+
2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 4
|
| 76 |
+
2024-08-23 15:44:50,767 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 77 |
+
2024-08-23 15:44:50,767 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 4
|
| 78 |
+
2024-08-23 15:44:50,767 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 79 |
+
2024-08-23 15:44:50,767 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 4
|
| 80 |
+
2024-08-23 15:44:50,767 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 5
|
| 81 |
+
2024-08-23 15:44:50,767 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 82 |
+
2024-08-23 15:44:50,767 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 5
|
| 83 |
+
2024-08-23 15:44:50,767 DEBUG SenderThread:10032 [sender.py:send():382] send: summary
|
| 84 |
+
2024-08-23 15:44:50,768 INFO SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
|
| 85 |
+
2024-08-23 15:44:50,768 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 86 |
+
2024-08-23 15:44:50,768 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 5
|
| 87 |
+
2024-08-23 15:44:50,768 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 6
|
| 88 |
+
2024-08-23 15:44:50,768 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 89 |
+
2024-08-23 15:44:50,768 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 6
|
| 90 |
+
2024-08-23 15:44:50,768 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 91 |
+
2024-08-23 15:44:50,769 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 6
|
| 92 |
+
2024-08-23 15:44:50,771 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: status_report
|
| 93 |
+
2024-08-23 15:44:50,957 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 7
|
| 94 |
+
2024-08-23 15:44:50,957 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 95 |
+
2024-08-23 15:44:50,957 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 7
|
| 96 |
+
2024-08-23 15:44:50,958 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 97 |
+
2024-08-23 15:44:50,958 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 7
|
| 98 |
+
2024-08-23 15:44:51,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml
|
| 99 |
+
2024-08-23 15:44:51,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
|
| 100 |
+
2024-08-23 15:44:51,729 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
|
| 101 |
+
2024-08-23 15:44:52,393 INFO Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
|
| 102 |
+
2024-08-23 15:44:52,721 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 8
|
| 103 |
+
2024-08-23 15:44:52,721 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
|
| 104 |
+
2024-08-23 15:44:52,721 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 105 |
+
2024-08-23 15:44:52,721 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 8
|
| 106 |
+
2024-08-23 15:44:52,721 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 107 |
+
2024-08-23 15:44:52,721 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 8
|
| 108 |
+
2024-08-23 15:44:52,721 INFO SenderThread:10032 [job_builder.py:build():296] Attempting to build job artifact
|
| 109 |
+
2024-08-23 15:44:52,722 INFO SenderThread:10032 [job_builder.py:_get_source_type():426] is repo sourced job
|
| 110 |
+
2024-08-23 15:44:52,730 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
|
| 111 |
+
2024-08-23 15:44:52,737 INFO SenderThread:10032 [job_builder.py:build():402] adding wandb-job metadata file
|
| 112 |
+
2024-08-23 15:44:52,746 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 9
|
| 113 |
+
2024-08-23 15:44:52,747 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
|
| 114 |
+
2024-08-23 15:44:52,747 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 115 |
+
2024-08-23 15:44:52,747 DEBUG SenderThread:10032 [sender.py:send():382] send: artifact
|
| 116 |
+
2024-08-23 15:44:52,747 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 9
|
| 117 |
+
2024-08-23 15:44:53,393 INFO Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
|
| 118 |
+
2024-08-23 15:44:53,730 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
|
| 119 |
+
2024-08-23 15:44:54,153 INFO wandb-upload_1:10032 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp_o6jbw71
|
| 120 |
+
2024-08-23 15:44:54,878 INFO wandb-upload_0:10032 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpdgbh2byi
|
| 121 |
+
2024-08-23 15:44:55,934 INFO SenderThread:10032 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk3MTc1OA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': None}}
|
| 122 |
+
2024-08-23 15:44:55,934 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 123 |
+
2024-08-23 15:44:55,934 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: status_report
|
| 124 |
+
2024-08-23 15:44:55,934 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 9
|
| 125 |
+
2024-08-23 15:44:55,934 INFO SenderThread:10032 [dir_watcher.py:finish():358] shutting down directory watcher
|
| 126 |
+
2024-08-23 15:44:56,394 INFO SenderThread:10032 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_154448-v9m85jnt/files
|
| 127 |
+
2024-08-23 15:44:56,395 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt requirements.txt
|
| 128 |
+
2024-08-23 15:44:56,395 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml config.yaml
|
| 129 |
+
2024-08-23 15:44:56,396 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json wandb-metadata.json
|
| 130 |
+
2024-08-23 15:44:56,396 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json wandb-summary.json
|
| 131 |
+
2024-08-23 15:44:56,398 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log output.log
|
| 132 |
+
2024-08-23 15:44:56,399 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 10
|
| 133 |
+
2024-08-23 15:44:56,399 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
|
| 134 |
+
2024-08-23 15:44:56,399 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 135 |
+
2024-08-23 15:44:56,401 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 10
|
| 136 |
+
2024-08-23 15:44:56,401 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 137 |
+
2024-08-23 15:44:56,401 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 10
|
| 138 |
+
2024-08-23 15:44:56,401 INFO SenderThread:10032 [file_pusher.py:finish():172] shutting down file pusher
|
| 139 |
+
2024-08-23 15:44:56,731 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
|
| 140 |
+
2024-08-23 15:44:56,731 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
|
| 141 |
+
2024-08-23 15:44:56,790 INFO wandb-upload_1:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
|
| 142 |
+
2024-08-23 15:44:56,818 INFO wandb-upload_0:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml
|
| 143 |
+
2024-08-23 15:44:56,848 INFO wandb-upload_2:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
|
| 144 |
+
2024-08-23 15:44:56,865 INFO wandb-upload_3:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
|
| 145 |
+
2024-08-23 15:44:57,065 INFO Thread-11 (_thread_body):10032 [sender.py:transition_state():617] send defer: 11
|
| 146 |
+
2024-08-23 15:44:57,065 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 147 |
+
2024-08-23 15:44:57,065 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 11
|
| 148 |
+
2024-08-23 15:44:57,065 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 149 |
+
2024-08-23 15:44:57,065 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 11
|
| 150 |
+
2024-08-23 15:44:57,065 INFO SenderThread:10032 [file_pusher.py:join():178] waiting for file pusher
|
| 151 |
+
2024-08-23 15:44:57,066 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 12
|
| 152 |
+
2024-08-23 15:44:57,066 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 153 |
+
2024-08-23 15:44:57,066 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 12
|
| 154 |
+
2024-08-23 15:44:57,066 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 155 |
+
2024-08-23 15:44:57,066 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 12
|
| 156 |
+
2024-08-23 15:44:57,066 INFO SenderThread:10032 [file_stream.py:finish():595] file stream finish called
|
| 157 |
+
2024-08-23 15:44:57,271 INFO SenderThread:10032 [file_stream.py:finish():599] file stream finish is done
|
| 158 |
+
2024-08-23 15:44:57,271 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 13
|
| 159 |
+
2024-08-23 15:44:57,271 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 160 |
+
2024-08-23 15:44:57,271 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 13
|
| 161 |
+
2024-08-23 15:44:57,271 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 162 |
+
2024-08-23 15:44:57,271 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 13
|
| 163 |
+
2024-08-23 15:44:57,271 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 14
|
| 164 |
+
2024-08-23 15:44:57,271 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
|
| 165 |
+
2024-08-23 15:44:57,271 DEBUG SenderThread:10032 [sender.py:send():382] send: final
|
| 166 |
+
2024-08-23 15:44:57,271 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 14
|
| 167 |
+
2024-08-23 15:44:57,271 DEBUG SenderThread:10032 [sender.py:send():382] send: footer
|
| 168 |
+
2024-08-23 15:44:57,272 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
|
| 169 |
+
2024-08-23 15:44:57,272 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 14
|
| 170 |
+
2024-08-23 15:44:57,272 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
|
| 171 |
+
2024-08-23 15:44:57,272 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
|
| 172 |
+
2024-08-23 15:44:57,272 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: server_info
|
| 173 |
+
2024-08-23 15:44:57,273 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: get_summary
|
| 174 |
+
2024-08-23 15:44:57,273 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: sampled_history
|
| 175 |
+
2024-08-23 15:44:57,273 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
|
| 176 |
+
2024-08-23 15:44:57,273 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: internal_messages
|
| 177 |
+
2024-08-23 15:44:57,273 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
|
| 178 |
+
2024-08-23 15:44:57,274 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: server_info
|
| 179 |
+
2024-08-23 15:44:57,275 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: job_info
|
| 180 |
+
2024-08-23 15:44:57,441 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: job_info
|
| 181 |
+
2024-08-23 15:44:57,441 INFO MainThread:10032 [wandb_run.py:_footer_history_summary_info():3866] rendering history
|
| 182 |
+
2024-08-23 15:44:57,441 INFO MainThread:10032 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
|
| 183 |
+
2024-08-23 15:44:57,441 INFO MainThread:10032 [wandb_run.py:_footer_sync_info():3825] logging synced files
|
| 184 |
+
2024-08-23 15:44:57,441 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: shutdown
|
| 185 |
+
2024-08-23 15:44:57,441 INFO HandlerThread:10032 [handler.py:finish():869] shutting down handler
|
| 186 |
+
2024-08-23 15:44:58,275 INFO WriterThread:10032 [datastore.py:close():296] close: /project/wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
|
| 187 |
+
2024-08-23 15:44:58,441 INFO SenderThread:10032 [sender.py:finish():1572] shutting down sender
|
| 188 |
+
2024-08-23 15:44:58,441 INFO SenderThread:10032 [file_pusher.py:finish():172] shutting down file pusher
|
| 189 |
+
2024-08-23 15:44:58,441 INFO SenderThread:10032 [file_pusher.py:join():178] waiting for file pusher
|
wandb/run-20240823_154448-v9m85jnt/logs/debug.log
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
|
| 2 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Configure stats pid to 9961
|
| 3 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
|
| 4 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
|
| 5 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
|
| 6 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
| 7 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
|
| 8 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_154448-v9m85jnt/logs/debug.log
|
| 9 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log
|
| 10 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:init():566] calling init triggers
|
| 11 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
|
| 12 |
+
config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 2048, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-15:44:18', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
|
| 13 |
+
2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:init():616] starting backend
|
| 14 |
+
2024-08-23 15:44:48,885 INFO MainThread:9961 [wandb_init.py:init():620] setting up manager
|
| 15 |
+
2024-08-23 15:44:48,889 INFO MainThread:9961 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 16 |
+
2024-08-23 15:44:48,891 INFO MainThread:9961 [wandb_init.py:init():628] backend started and connected
|
| 17 |
+
2024-08-23 15:44:48,896 INFO MainThread:9961 [wandb_init.py:init():720] updated telemetry
|
| 18 |
+
2024-08-23 15:44:48,909 INFO MainThread:9961 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
| 19 |
+
2024-08-23 15:44:49,395 INFO MainThread:9961 [wandb_run.py:_on_init():2262] communicating current version
|
| 20 |
+
2024-08-23 15:44:49,418 INFO MainThread:9961 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
| 21 |
+
|
| 22 |
+
2024-08-23 15:44:49,418 INFO MainThread:9961 [wandb_init.py:init():804] starting run threads in backend
|
| 23 |
+
2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_console_start():2241] atexit reg
|
| 24 |
+
2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_redirect():2096] redirect: wrap_raw
|
| 25 |
+
2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_redirect():2161] Wrapping output streams.
|
| 26 |
+
2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_redirect():2186] Redirects installed.
|
| 27 |
+
2024-08-23 15:44:49,540 INFO MainThread:9961 [wandb_init.py:init():847] run started, returning control to user process
|
| 28 |
+
2024-08-23 15:44:58,442 WARNING MsgRouterThr:9961 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
ADDED
|
Binary file (8.01 kB). View file
|
|
|