koichi12 commited on
Commit
b38ed3f
·
verified ·
1 Parent(s): 1cf1fd6

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. wandb/run-20240804_140603-q9i5g6sv/files/config.yaml +335 -0
  2. wandb/run-20240804_140603-q9i5g6sv/files/output.log +130 -0
  3. wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt +271 -0
  4. wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json +215 -0
  5. wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json +1 -0
  6. wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log +186 -0
  7. wandb/run-20240804_140603-q9i5g6sv/logs/debug.log +30 -0
  8. wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb +0 -0
  9. wandb/run-20240804_142250-6p58tz1g/files/config.yaml +335 -0
  10. wandb/run-20240804_142250-6p58tz1g/files/output.log +135 -0
  11. wandb/run-20240804_142250-6p58tz1g/files/requirements.txt +271 -0
  12. wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json +215 -0
  13. wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json +1 -0
  14. wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log +186 -0
  15. wandb/run-20240804_142250-6p58tz1g/logs/debug.log +30 -0
  16. wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb +0 -0
  17. wandb/run-20240804_143607-h7fxlkpt/files/config.yaml +335 -0
  18. wandb/run-20240804_143607-h7fxlkpt/files/output.log +135 -0
  19. wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt +271 -0
  20. wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json +215 -0
  21. wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json +1 -0
  22. wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log +186 -0
  23. wandb/run-20240804_143607-h7fxlkpt/logs/debug.log +30 -0
  24. wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb +0 -0
  25. wandb/run-20240804_221132-o8ieoj9i/files/config.yaml +335 -0
  26. wandb/run-20240804_221132-o8ieoj9i/files/output.log +135 -0
  27. wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt +271 -0
  28. wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json +215 -0
  29. wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json +1 -0
  30. wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log +263 -0
  31. wandb/run-20240804_221132-o8ieoj9i/logs/debug.log +30 -0
  32. wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb +0 -0
  33. wandb/run-20240812_052853-n84i0o06/files/config.yaml +335 -0
  34. wandb/run-20240812_052853-n84i0o06/files/output.log +139 -0
  35. wandb/run-20240812_052853-n84i0o06/files/requirements.txt +271 -0
  36. wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json +215 -0
  37. wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json +1 -0
  38. wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log +384 -0
  39. wandb/run-20240812_052853-n84i0o06/logs/debug.log +30 -0
  40. wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb +0 -0
  41. wandb/run-20240812_063027-j1htzx7q/files/output.log +121 -0
  42. wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json +1 -0
  43. wandb/run-20240823_154448-v9m85jnt/files/config.yaml +321 -0
  44. wandb/run-20240823_154448-v9m85jnt/files/output.log +15 -0
  45. wandb/run-20240823_154448-v9m85jnt/files/requirements.txt +375 -0
  46. wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json +220 -0
  47. wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json +1 -0
  48. wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log +189 -0
  49. wandb/run-20240823_154448-v9m85jnt/logs/debug.log +28 -0
  50. wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb +0 -0
wandb/run-20240804_140603-q9i5g6sv/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-14:05:53
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722747963.684337
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_140603-q9i5g6sv/files/output.log ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-llama.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ BFloat16 enabled for mixed precision - using bfSixteen policy
18
+ --> applying fsdp activation checkpointing...
19
+ > datasets target sizes (minimum size):
20
+ train: 640000
21
+ validation: 35200
22
+ test: 3200
23
+ > building train, validation, and test datasets for GPT ...
24
+ > finished creating GPT datasets ...
25
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
26
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
27
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
28
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
29
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
30
+ model info: FullyShardedDataParallel(
31
+ (_fsdp_wrapped_module): LlamaForCausalLM(
32
+ (model): LlamaModel(
33
+ (embed_tokens): Embedding(32000, 2048)
34
+ (layers): ModuleList(
35
+ (0-21): 22 x FullyShardedDataParallel(
36
+ (_fsdp_wrapped_module): CheckpointWrapper(
37
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
38
+ (self_attn): LlamaFlashAttention2(
39
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
40
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
41
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
42
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
43
+ (rotary_emb): LlamaRotaryEmbedding()
44
+ )
45
+ (mlp): LlamaMLP(
46
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
47
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
48
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
49
+ (act_fn): SiLU()
50
+ )
51
+ (input_layernorm): LlamaRMSNorm()
52
+ (post_attention_layernorm): LlamaRMSNorm()
53
+ )
54
+ )
55
+ )
56
+ )
57
+ (norm): LlamaRMSNorm()
58
+ (rotary_emb): LlamaRotaryEmbedding()
59
+ )
60
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
61
+ )
62
+ )
63
+ model config: LlamaConfig {
64
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
65
+ "architectures": [
66
+ "LlamaForCausalLM"
67
+ ],
68
+ "attention_bias": false,
69
+ "attention_dropout": 0.0,
70
+ "bos_token_id": 1,
71
+ "eos_token_id": 2,
72
+ "hidden_act": "silu",
73
+ "hidden_size": 2048,
74
+ "initializer_range": 0.02,
75
+ "intermediate_size": 5632,
76
+ "label_smoothing": 0.0,
77
+ "max_position_embeddings": 2048,
78
+ "mlp_bias": false,
79
+ "model_type": "llama",
80
+ "num_attention_heads": 32,
81
+ "num_hidden_layers": 22,
82
+ "num_key_value_heads": 4,
83
+ "pretraining_tp": 1,
84
+ "rms_norm_eps": 1e-05,
85
+ "rope_scaling": null,
86
+ "rope_theta": 10000.0,
87
+ "tie_word_embeddings": false,
88
+ "torch_dtype": "float32",
89
+ "transformers_version": "4.43.3",
90
+ "use_cache": false,
91
+ "vocab_size": 32000
92
+ }
93
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
94
+ warnings.warn(
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
108
+ loss: torch.Tensor = model(**batch).loss
109
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
110
+ return self._call_impl(*args, **kwargs)
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
112
+ return forward_call(*args, **kwargs)
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
114
+ output = self._fsdp_wrapped_module(*args, **kwargs)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
116
+ return self._call_impl(*args, **kwargs)
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
118
+ return forward_call(*args, **kwargs)
119
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
120
+ outputs = self.model(
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
122
+ return self._call_impl(*args, **kwargs)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
124
+ return forward_call(*args, **kwargs)
125
+ File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
126
+ cache_position = torch.arange(
127
+ RuntimeError: CUDA error: device-side assert triggered
128
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
129
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
130
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T05:06:04.333644",
5
+ "startedAt": "2024-08-04T05:06:03.671763",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama_train_2024-08-04-14:05:53"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 4}}
wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 14:06:03,686 INFO StreamThr :9469 [internal.py:wandb_internal():86] W&B internal server running at pid: 9469, started at: 2024-08-04 14:06:03.685029
2
+ 2024-08-04 14:06:03,687 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 14:06:03,689 INFO WriterThread:9469 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
4
+ 2024-08-04 14:06:03,690 DEBUG SenderThread:9469 [sender.py:send():382] send: header
5
+ 2024-08-04 14:06:03,703 DEBUG SenderThread:9469 [sender.py:send():382] send: run
6
+ 2024-08-04 14:06:04,218 INFO SenderThread:9469 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_140603-q9i5g6sv/files
7
+ 2024-08-04 14:06:04,218 INFO SenderThread:9469 [sender.py:_start_run_threads():1136] run started: q9i5g6sv with start time 1722747963.684337
8
+ 2024-08-04 14:06:04,223 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 14:06:04,223 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 14:06:04,313 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 14:06:04,320 DEBUG HandlerThread:9469 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 14:06:04,320 DEBUG HandlerThread:9469 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 14:06:04,320 INFO HandlerThread:9469 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 14:06:04,320 INFO SystemMonitor:9469 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 14:06:04,320 INFO HandlerThread:9469 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 14:06:04,321 INFO SystemMonitor:9469 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 14:06:04,321 INFO SystemMonitor:9469 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 14:06:04,322 INFO SystemMonitor:9469 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 14:06:04,322 INFO SystemMonitor:9469 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 14:06:04,323 INFO SystemMonitor:9469 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 14:06:04,333 DEBUG HandlerThread:9469 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 14:06:04,335 DEBUG HandlerThread:9469 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 14:06:04,347 DEBUG HandlerThread:9469 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 14:06:04,347 DEBUG HandlerThread:9469 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 14:06:04,347 DEBUG HandlerThread:9469 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:06:04.333644', 'startedAt': '2024-08-04T05:06:03.671763', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:05:53'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
26
+ 2024-08-04 14:06:04,347 INFO HandlerThread:9469 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 14:06:04,347 INFO HandlerThread:9469 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 14:06:04,349 INFO HandlerThread:9469 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 14:06:04,354 DEBUG SenderThread:9469 [sender.py:send():382] send: files
30
+ 2024-08-04 14:06:04,354 INFO SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 14:06:04,364 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 14:06:04,364 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 14:06:04,364 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-04 14:06:04,364 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-04 14:06:04,366 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 14:06:04,605 DEBUG SenderThread:9469 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 14:06:04,996 INFO wandb-upload_0:9469 [upload_job.py:push():131] Uploaded file /tmp/tmpz1emajybwandb/prws540s-wandb-metadata.json
38
+ 2024-08-04 14:06:05,220 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
39
+ 2024-08-04 14:06:05,220 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json
40
+ 2024-08-04 14:06:05,220 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
41
+ 2024-08-04 14:06:07,221 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
42
+ 2024-08-04 14:06:07,604 DEBUG SenderThread:9469 [sender.py:send():382] send: config
43
+ 2024-08-04 14:06:07,605 DEBUG SenderThread:9469 [sender.py:send():382] send: config
44
+ 2024-08-04 14:06:08,222 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
45
+ 2024-08-04 14:06:08,620 DEBUG SenderThread:9469 [sender.py:send():382] send: exit
46
+ 2024-08-04 14:06:08,620 INFO SenderThread:9469 [sender.py:send_exit():589] handling exit code: 1
47
+ 2024-08-04 14:06:08,620 INFO SenderThread:9469 [sender.py:send_exit():591] handling runtime: 4
48
+ 2024-08-04 14:06:08,621 INFO SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
49
+ 2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:send_exit():597] send defer
50
+ 2024-08-04 14:06:08,622 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
51
+ 2024-08-04 14:06:08,622 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 0
52
+ 2024-08-04 14:06:08,622 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
53
+ 2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 0
54
+ 2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 1
55
+ 2024-08-04 14:06:08,622 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
56
+ 2024-08-04 14:06:08,622 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 1
57
+ 2024-08-04 14:06:08,622 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
58
+ 2024-08-04 14:06:08,622 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 1
59
+ 2024-08-04 14:06:08,623 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 2
60
+ 2024-08-04 14:06:08,623 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
61
+ 2024-08-04 14:06:08,623 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 2
62
+ 2024-08-04 14:06:08,623 INFO HandlerThread:9469 [system_monitor.py:finish():203] Stopping system monitor
63
+ 2024-08-04 14:06:08,623 DEBUG SystemMonitor:9469 [system_monitor.py:_start():172] Starting system metrics aggregation loop
64
+ 2024-08-04 14:06:08,623 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined cpu monitor
65
+ 2024-08-04 14:06:08,623 DEBUG SystemMonitor:9469 [system_monitor.py:_start():179] Finished system metrics aggregation loop
66
+ 2024-08-04 14:06:08,623 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined disk monitor
67
+ 2024-08-04 14:06:08,624 DEBUG SystemMonitor:9469 [system_monitor.py:_start():183] Publishing last batch of metrics
68
+ 2024-08-04 14:06:08,656 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined gpu monitor
69
+ 2024-08-04 14:06:08,656 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined memory monitor
70
+ 2024-08-04 14:06:08,656 INFO HandlerThread:9469 [interfaces.py:finish():202] Joined network monitor
71
+ 2024-08-04 14:06:08,657 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
72
+ 2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 2
73
+ 2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 3
74
+ 2024-08-04 14:06:08,657 DEBUG SenderThread:9469 [sender.py:send():382] send: stats
75
+ 2024-08-04 14:06:08,657 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
76
+ 2024-08-04 14:06:08,657 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 3
77
+ 2024-08-04 14:06:08,657 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
78
+ 2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 3
79
+ 2024-08-04 14:06:08,657 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 4
80
+ 2024-08-04 14:06:08,657 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
81
+ 2024-08-04 14:06:08,657 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 4
82
+ 2024-08-04 14:06:08,658 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
83
+ 2024-08-04 14:06:08,658 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 4
84
+ 2024-08-04 14:06:08,658 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 5
85
+ 2024-08-04 14:06:08,658 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
86
+ 2024-08-04 14:06:08,658 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 5
87
+ 2024-08-04 14:06:08,658 DEBUG SenderThread:9469 [sender.py:send():382] send: summary
88
+ 2024-08-04 14:06:08,659 INFO SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
89
+ 2024-08-04 14:06:08,659 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
90
+ 2024-08-04 14:06:08,659 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 5
91
+ 2024-08-04 14:06:08,659 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 6
92
+ 2024-08-04 14:06:08,659 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
93
+ 2024-08-04 14:06:08,659 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 6
94
+ 2024-08-04 14:06:08,659 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
95
+ 2024-08-04 14:06:08,660 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 6
96
+ 2024-08-04 14:06:08,662 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: status_report
97
+ 2024-08-04 14:06:08,848 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 7
98
+ 2024-08-04 14:06:08,849 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
99
+ 2024-08-04 14:06:08,849 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 7
100
+ 2024-08-04 14:06:08,849 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
101
+ 2024-08-04 14:06:08,849 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 7
102
+ 2024-08-04 14:06:09,223 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
103
+ 2024-08-04 14:06:09,223 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
104
+ 2024-08-04 14:06:09,223 INFO Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
105
+ 2024-08-04 14:06:09,360 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 8
106
+ 2024-08-04 14:06:09,361 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 14:06:09,361 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 14:06:09,361 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 14:06:09,361 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 14:06:09,361 INFO SenderThread:9469 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 14:06:09,362 INFO SenderThread:9469 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 14:06:09,376 INFO SenderThread:9469 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 14:06:09,384 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 14:06:09,384 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
115
+ 2024-08-04 14:06:09,384 DEBUG SenderThread:9469 [sender.py:send():382] send: artifact
116
+ 2024-08-04 14:06:09,384 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 14:06:09,620 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
118
+ 2024-08-04 14:06:10,224 INFO Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
119
+ 2024-08-04 14:06:10,240 INFO SenderThread:9469 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 14:06:10,240 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 14:06:10,240 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 14:06:10,240 INFO SenderThread:9469 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 14:06:11,225 INFO SenderThread:9469 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_140603-q9i5g6sv/files
124
+ 2024-08-04 14:06:11,225 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt requirements.txt
125
+ 2024-08-04 14:06:11,225 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml config.yaml
126
+ 2024-08-04 14:06:11,227 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 14:06:11,227 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 14:06:11,228 INFO SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log output.log
129
+ 2024-08-04 14:06:11,230 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 14:06:11,230 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 14:06:11,230 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 14:06:11,232 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 14:06:11,232 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 14:06:11,232 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 14:06:11,232 INFO SenderThread:9469 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 14:06:11,620 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
137
+ 2024-08-04 14:06:11,621 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
138
+ 2024-08-04 14:06:11,713 INFO wandb-upload_0:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
139
+ 2024-08-04 14:06:11,733 INFO wandb-upload_1:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
140
+ 2024-08-04 14:06:11,829 INFO wandb-upload_2:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
141
+ 2024-08-04 14:06:11,833 INFO wandb-upload_3:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
142
+ 2024-08-04 14:06:12,033 INFO Thread-11 (_thread_body):9469 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 14:06:12,034 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 14:06:12,034 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 14:06:12,034 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 14:06:12,034 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 14:06:12,034 INFO SenderThread:9469 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 14:06:12,034 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 14:06:12,034 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 14:06:12,034 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 14:06:12,035 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 14:06:12,035 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 14:06:12,035 INFO SenderThread:9469 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 14:06:12,204 INFO SenderThread:9469 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 14:06:12,204 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 14:06:12,205 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 14:06:12,205 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 14:06:12,205 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 14:06:12,205 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 14:06:12,205 INFO SenderThread:9469 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 14:06:12,205 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 14:06:12,205 DEBUG SenderThread:9469 [sender.py:send():382] send: final
163
+ 2024-08-04 14:06:12,205 INFO HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 14
164
+ 2024-08-04 14:06:12,205 DEBUG SenderThread:9469 [sender.py:send():382] send: footer
165
+ 2024-08-04 14:06:12,206 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 14:06:12,206 INFO SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 14:06:12,206 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 14:06:12,206 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 14:06:12,206 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 14:06:12,207 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
171
+ 2024-08-04 14:06:12,207 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: server_info
172
+ 2024-08-04 14:06:12,207 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: get_summary
173
+ 2024-08-04 14:06:12,207 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: server_info
174
+ 2024-08-04 14:06:12,208 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-04 14:06:12,209 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-04 14:06:12,209 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 14:06:12,360 DEBUG SenderThread:9469 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 14:06:12,360 INFO MainThread:9469 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 14:06:12,360 INFO MainThread:9469 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 14:06:12,360 INFO MainThread:9469 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 14:06:12,360 DEBUG HandlerThread:9469 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 14:06:12,361 INFO HandlerThread:9469 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 14:06:13,210 INFO WriterThread:9469 [datastore.py:close():296] close: /project/wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
184
+ 2024-08-04 14:06:13,360 INFO SenderThread:9469 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 14:06:13,360 INFO SenderThread:9469 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 14:06:13,360 INFO SenderThread:9469 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_140603-q9i5g6sv/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 14:06:03,677 INFO MainThread:9398 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Configure stats pid to 9398
3
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
6
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_140603-q9i5g6sv/logs/debug.log
9
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log
10
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:05:53', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 14:06:03,678 INFO MainThread:9398 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 14:06:03,683 INFO MainThread:9398 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 14:06:03,684 INFO MainThread:9398 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 14:06:03,689 INFO MainThread:9398 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 14:06:03,699 INFO MainThread:9398 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 14:06:04,223 INFO MainThread:9398 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 14:06:04,307 INFO MainThread:9398 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 14:06:04,307 INFO MainThread:9398 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 14:06:04,363 INFO MainThread:9398 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 14:06:04,364 INFO MainThread:9398 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 14:06:07,603 INFO MainThread:9398 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 14:06:07,604 INFO MainThread:9398 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 14:06:13,361 WARNING MsgRouterThr:9398 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb ADDED
Binary file (20.7 kB). View file
 
wandb/run-20240804_142250-6p58tz1g/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-14:22:39
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722748970.443993
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_142250-6p58tz1g/files/output.log ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-llama.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ BFloat16 enabled for mixed precision - using bfSixteen policy
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 640000
23
+ validation: 35200
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): LlamaForCausalLM(
34
+ (model): LlamaModel(
35
+ (embed_tokens): Embedding(32000, 2048)
36
+ (layers): ModuleList(
37
+ (0-21): 22 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
40
+ (self_attn): LlamaFlashAttention2(
41
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
43
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
45
+ (rotary_emb): LlamaRotaryEmbedding()
46
+ )
47
+ (mlp): LlamaMLP(
48
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
49
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
50
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
51
+ (act_fn): SiLU()
52
+ )
53
+ (input_layernorm): LlamaRMSNorm()
54
+ (post_attention_layernorm): LlamaRMSNorm()
55
+ )
56
+ )
57
+ )
58
+ )
59
+ (norm): LlamaRMSNorm()
60
+ (rotary_emb): LlamaRotaryEmbedding()
61
+ )
62
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
63
+ )
64
+ )
65
+ model config: LlamaConfig {
66
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
67
+ "architectures": [
68
+ "LlamaForCausalLM"
69
+ ],
70
+ "attention_bias": false,
71
+ "attention_dropout": 0.0,
72
+ "bos_token_id": 1,
73
+ "eos_token_id": 2,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 2048,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 5632,
78
+ "label_smoothing": 0.0,
79
+ "max_position_embeddings": 2048,
80
+ "mlp_bias": false,
81
+ "model_type": "llama",
82
+ "num_attention_heads": 32,
83
+ "num_hidden_layers": 22,
84
+ "num_key_value_heads": 4,
85
+ "pretraining_tp": 1,
86
+ "rms_norm_eps": 1e-05,
87
+ "rope_scaling": null,
88
+ "rope_theta": 10000.0,
89
+ "tie_word_embeddings": false,
90
+ "torch_dtype": "float32",
91
+ "transformers_version": "4.43.3",
92
+ "use_cache": false,
93
+ "vocab_size": 32000
94
+ }
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
108
+ batch = next(train_dataloader)
109
+ File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
110
+ for x in iter:
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
112
+ data = self._next_data()
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
114
+ return self._process_data(data)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
116
+ data.reraise()
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
118
+ raise exception
119
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
120
+ Original Traceback (most recent call last):
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
122
+ data = fetcher.fetch(index)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
124
+ return self.collate_fn(data)
125
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
126
+ return collate(batch, collate_fn_map=default_collate_fn_map)
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
128
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
130
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
132
+ return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
133
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
134
+ return torch.stack(batch, 0, out=out)
135
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
wandb/run-20240804_142250-6p58tz1g/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T05:22:51.055103",
5
+ "startedAt": "2024-08-04T05:22:50.431050",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama_train_2024-08-04-14:22:39"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 2}}
wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 14:22:50,445 INFO StreamThr :10451 [internal.py:wandb_internal():86] W&B internal server running at pid: 10451, started at: 2024-08-04 14:22:50.444819
2
+ 2024-08-04 14:22:50,447 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 14:22:50,449 INFO WriterThread:10451 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
4
+ 2024-08-04 14:22:50,450 DEBUG SenderThread:10451 [sender.py:send():382] send: header
5
+ 2024-08-04 14:22:50,463 DEBUG SenderThread:10451 [sender.py:send():382] send: run
6
+ 2024-08-04 14:22:50,941 INFO SenderThread:10451 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_142250-6p58tz1g/files
7
+ 2024-08-04 14:22:50,941 INFO SenderThread:10451 [sender.py:_start_run_threads():1136] run started: 6p58tz1g with start time 1722748970.443993
8
+ 2024-08-04 14:22:50,946 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 14:22:50,946 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 14:22:51,034 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 14:22:51,041 DEBUG HandlerThread:10451 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 14:22:51,041 DEBUG HandlerThread:10451 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 14:22:51,041 INFO HandlerThread:10451 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 14:22:51,041 INFO SystemMonitor:10451 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 14:22:51,042 INFO HandlerThread:10451 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 14:22:51,042 INFO SystemMonitor:10451 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 14:22:51,043 INFO SystemMonitor:10451 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 14:22:51,044 INFO SystemMonitor:10451 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 14:22:51,044 INFO SystemMonitor:10451 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 14:22:51,045 INFO SystemMonitor:10451 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 14:22:51,055 DEBUG HandlerThread:10451 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 14:22:51,059 DEBUG HandlerThread:10451 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 14:22:51,071 DEBUG HandlerThread:10451 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 14:22:51,071 DEBUG HandlerThread:10451 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 14:22:51,071 DEBUG HandlerThread:10451 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:22:51.055103', 'startedAt': '2024-08-04T05:22:50.431050', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:22:39'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
26
+ 2024-08-04 14:22:51,072 INFO HandlerThread:10451 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 14:22:51,072 INFO HandlerThread:10451 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 14:22:51,073 INFO HandlerThread:10451 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 14:22:51,079 DEBUG SenderThread:10451 [sender.py:send():382] send: files
30
+ 2024-08-04 14:22:51,079 INFO SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 14:22:51,089 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 14:22:51,089 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 14:22:51,089 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-04 14:22:51,090 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 14:22:51,091 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 14:22:51,412 DEBUG SenderThread:10451 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 14:22:51,834 INFO wandb-upload_0:10451 [upload_job.py:push():131] Uploaded file /tmp/tmpvai5nc9ewandb/lc3l5ghh-wandb-metadata.json
38
+ 2024-08-04 14:22:51,943 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
39
+ 2024-08-04 14:22:51,943 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json
40
+ 2024-08-04 14:22:51,943 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
41
+ 2024-08-04 14:22:53,535 DEBUG SenderThread:10451 [sender.py:send():382] send: config
42
+ 2024-08-04 14:22:53,536 DEBUG SenderThread:10451 [sender.py:send():382] send: config
43
+ 2024-08-04 14:22:53,643 DEBUG SenderThread:10451 [sender.py:send():382] send: exit
44
+ 2024-08-04 14:22:53,643 INFO SenderThread:10451 [sender.py:send_exit():589] handling exit code: 1
45
+ 2024-08-04 14:22:53,643 INFO SenderThread:10451 [sender.py:send_exit():591] handling runtime: 2
46
+ 2024-08-04 14:22:53,644 INFO SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
47
+ 2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:send_exit():597] send defer
48
+ 2024-08-04 14:22:53,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-04 14:22:53,645 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 0
50
+ 2024-08-04 14:22:53,645 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 0
52
+ 2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 1
53
+ 2024-08-04 14:22:53,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-04 14:22:53,645 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 1
55
+ 2024-08-04 14:22:53,645 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 1
57
+ 2024-08-04 14:22:53,645 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 2
58
+ 2024-08-04 14:22:53,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-04 14:22:53,645 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 2
60
+ 2024-08-04 14:22:53,645 INFO HandlerThread:10451 [system_monitor.py:finish():203] Stopping system monitor
61
+ 2024-08-04 14:22:53,646 DEBUG SystemMonitor:10451 [system_monitor.py:_start():172] Starting system metrics aggregation loop
62
+ 2024-08-04 14:22:53,646 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined cpu monitor
63
+ 2024-08-04 14:22:53,646 DEBUG SystemMonitor:10451 [system_monitor.py:_start():179] Finished system metrics aggregation loop
64
+ 2024-08-04 14:22:53,646 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined disk monitor
65
+ 2024-08-04 14:22:53,646 DEBUG SystemMonitor:10451 [system_monitor.py:_start():183] Publishing last batch of metrics
66
+ 2024-08-04 14:22:53,679 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined gpu monitor
67
+ 2024-08-04 14:22:53,679 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined memory monitor
68
+ 2024-08-04 14:22:53,679 INFO HandlerThread:10451 [interfaces.py:finish():202] Joined network monitor
69
+ 2024-08-04 14:22:53,680 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
70
+ 2024-08-04 14:22:53,680 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 2
71
+ 2024-08-04 14:22:53,680 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 3
72
+ 2024-08-04 14:22:53,680 DEBUG SenderThread:10451 [sender.py:send():382] send: stats
73
+ 2024-08-04 14:22:53,680 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
74
+ 2024-08-04 14:22:53,680 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 3
75
+ 2024-08-04 14:22:53,680 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
76
+ 2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 3
77
+ 2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 4
78
+ 2024-08-04 14:22:53,681 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 14:22:53,681 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 4
80
+ 2024-08-04 14:22:53,681 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 4
82
+ 2024-08-04 14:22:53,681 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 5
83
+ 2024-08-04 14:22:53,681 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 14:22:53,681 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 5
85
+ 2024-08-04 14:22:53,681 DEBUG SenderThread:10451 [sender.py:send():382] send: summary
86
+ 2024-08-04 14:22:53,682 INFO SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-04 14:22:53,682 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
88
+ 2024-08-04 14:22:53,682 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 5
89
+ 2024-08-04 14:22:53,682 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 6
90
+ 2024-08-04 14:22:53,683 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
91
+ 2024-08-04 14:22:53,683 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 6
92
+ 2024-08-04 14:22:53,683 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
93
+ 2024-08-04 14:22:53,683 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 6
94
+ 2024-08-04 14:22:53,685 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-04 14:22:53,891 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 7
96
+ 2024-08-04 14:22:53,891 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
97
+ 2024-08-04 14:22:53,891 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 7
98
+ 2024-08-04 14:22:53,892 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 14:22:53,892 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 7
100
+ 2024-08-04 14:22:53,944 INFO Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml
101
+ 2024-08-04 14:22:53,944 INFO Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
102
+ 2024-08-04 14:22:53,944 INFO Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
103
+ 2024-08-04 14:22:54,643 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
104
+ 2024-08-04 14:22:55,782 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 8
105
+ 2024-08-04 14:22:55,783 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
106
+ 2024-08-04 14:22:55,783 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 14:22:55,783 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 14:22:55,783 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 14:22:55,783 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 14:22:55,783 INFO SenderThread:10451 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 14:22:55,784 INFO SenderThread:10451 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 14:22:55,883 INFO SenderThread:10451 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 14:22:55,891 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 14:22:55,892 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
115
+ 2024-08-04 14:22:55,892 DEBUG SenderThread:10451 [sender.py:send():382] send: artifact
116
+ 2024-08-04 14:22:55,892 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 14:22:55,945 INFO Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
118
+ 2024-08-04 14:22:56,644 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-04 14:22:57,777 INFO SenderThread:10451 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 14:22:57,777 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 14:22:57,777 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 14:22:57,777 INFO SenderThread:10451 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 14:22:57,946 INFO SenderThread:10451 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_142250-6p58tz1g/files
124
+ 2024-08-04 14:22:57,946 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt requirements.txt
125
+ 2024-08-04 14:22:57,947 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml config.yaml
126
+ 2024-08-04 14:22:57,948 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 14:22:57,948 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 14:22:57,950 INFO SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log output.log
129
+ 2024-08-04 14:22:57,952 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 14:22:57,952 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 14:22:57,952 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 14:22:57,952 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 14:22:57,954 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 14:22:57,954 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 14:22:57,954 INFO SenderThread:10451 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 14:22:58,363 INFO wandb-upload_1:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml
137
+ 2024-08-04 14:22:58,459 INFO wandb-upload_0:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
138
+ 2024-08-04 14:22:58,506 INFO wandb-upload_2:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
139
+ 2024-08-04 14:22:58,525 INFO wandb-upload_3:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
140
+ 2024-08-04 14:22:58,645 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
141
+ 2024-08-04 14:22:58,645 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
142
+ 2024-08-04 14:22:58,725 INFO Thread-11 (_thread_body):10451 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 14:22:58,725 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 14:22:58,725 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 14:22:58,726 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 14:22:58,726 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 14:22:58,726 INFO SenderThread:10451 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 14:22:58,726 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 14:22:58,726 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 14:22:58,726 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 14:22:58,726 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 14:22:58,726 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 14:22:58,726 INFO SenderThread:10451 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 14:22:58,910 INFO SenderThread:10451 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 14:22:58,911 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 14:22:58,911 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 14:22:58,911 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 14:22:58,911 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 14:22:58,911 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 14:22:58,911 INFO SenderThread:10451 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 14:22:58,911 DEBUG SenderThread:10451 [sender.py:send():382] send: final
162
+ 2024-08-04 14:22:58,911 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
163
+ 2024-08-04 14:22:58,912 DEBUG SenderThread:10451 [sender.py:send():382] send: footer
164
+ 2024-08-04 14:22:58,912 INFO HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 14
165
+ 2024-08-04 14:22:58,912 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 14:22:58,912 INFO SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 14:22:58,912 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 14:22:58,912 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 14:22:58,913 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 14:22:58,913 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
171
+ 2024-08-04 14:22:58,913 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: server_info
172
+ 2024-08-04 14:22:58,913 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: get_summary
173
+ 2024-08-04 14:22:58,914 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: server_info
174
+ 2024-08-04 14:22:58,915 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-04 14:22:58,915 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-04 14:22:58,916 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 14:22:59,080 DEBUG SenderThread:10451 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 14:22:59,081 INFO MainThread:10451 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 14:22:59,081 INFO MainThread:10451 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 14:22:59,081 INFO MainThread:10451 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 14:22:59,081 DEBUG HandlerThread:10451 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 14:22:59,081 INFO HandlerThread:10451 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 14:22:59,916 INFO WriterThread:10451 [datastore.py:close():296] close: /project/wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
184
+ 2024-08-04 14:23:00,081 INFO SenderThread:10451 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 14:23:00,081 INFO SenderThread:10451 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 14:23:00,081 INFO SenderThread:10451 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_142250-6p58tz1g/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Configure stats pid to 10380
3
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
6
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_142250-6p58tz1g/logs/debug.log
9
+ 2024-08-04 14:22:50,437 INFO MainThread:10380 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log
10
+ 2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:22:39', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 14:22:50,438 INFO MainThread:10380 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 14:22:50,443 INFO MainThread:10380 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 14:22:50,443 INFO MainThread:10380 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 14:22:50,448 INFO MainThread:10380 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 14:22:50,459 INFO MainThread:10380 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 14:22:50,946 INFO MainThread:10380 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 14:22:51,027 INFO MainThread:10380 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 14:22:51,027 INFO MainThread:10380 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 14:22:51,088 INFO MainThread:10380 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 14:22:51,090 INFO MainThread:10380 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 14:22:53,535 INFO MainThread:10380 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 14:22:53,535 INFO MainThread:10380 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 14:23:00,082 WARNING MsgRouterThr:10380 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb ADDED
Binary file (20.5 kB). View file
 
wandb/run-20240804_143607-h7fxlkpt/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-14:35:56
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722749767.220741
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
wandb/run-20240804_143607-h7fxlkpt/files/output.log ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/tiny-llama.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ BFloat16 enabled for mixed precision - using bfSixteen policy
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 640000
23
+ validation: 35200
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): LlamaForCausalLM(
34
+ (model): LlamaModel(
35
+ (embed_tokens): Embedding(32000, 2048)
36
+ (layers): ModuleList(
37
+ (0-21): 22 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
40
+ (self_attn): LlamaFlashAttention2(
41
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
43
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
45
+ (rotary_emb): LlamaRotaryEmbedding()
46
+ )
47
+ (mlp): LlamaMLP(
48
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
49
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
50
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
51
+ (act_fn): SiLU()
52
+ )
53
+ (input_layernorm): LlamaRMSNorm()
54
+ (post_attention_layernorm): LlamaRMSNorm()
55
+ )
56
+ )
57
+ )
58
+ )
59
+ (norm): LlamaRMSNorm()
60
+ (rotary_emb): LlamaRotaryEmbedding()
61
+ )
62
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
63
+ )
64
+ )
65
+ model config: LlamaConfig {
66
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
67
+ "architectures": [
68
+ "LlamaForCausalLM"
69
+ ],
70
+ "attention_bias": false,
71
+ "attention_dropout": 0.0,
72
+ "bos_token_id": 1,
73
+ "eos_token_id": 2,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 2048,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 5632,
78
+ "label_smoothing": 0.0,
79
+ "max_position_embeddings": 2048,
80
+ "mlp_bias": false,
81
+ "model_type": "llama",
82
+ "num_attention_heads": 32,
83
+ "num_hidden_layers": 22,
84
+ "num_key_value_heads": 4,
85
+ "pretraining_tp": 1,
86
+ "rms_norm_eps": 1e-05,
87
+ "rope_scaling": null,
88
+ "rope_theta": 10000.0,
89
+ "tie_word_embeddings": false,
90
+ "torch_dtype": "float32",
91
+ "transformers_version": "4.43.3",
92
+ "use_cache": false,
93
+ "vocab_size": 32000
94
+ }
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
108
+ batch = next(train_dataloader)
109
+ File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
110
+ for x in iter:
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
112
+ data = self._next_data()
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
114
+ return self._process_data(data)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
116
+ data.reraise()
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
118
+ raise exception
119
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
120
+ Original Traceback (most recent call last):
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
122
+ data = fetcher.fetch(index)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
124
+ return self.collate_fn(data)
125
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
126
+ return collate(batch, collate_fn_map=default_collate_fn_map)
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
128
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
130
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
132
+ return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
133
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
134
+ return torch.stack(batch, 0, out=out)
135
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T05:36:07.811618",
5
+ "startedAt": "2024-08-04T05:36:07.207201",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama_train_2024-08-04-14:35:56"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 2}}
wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 14:36:07,222 INFO StreamThr :11584 [internal.py:wandb_internal():86] W&B internal server running at pid: 11584, started at: 2024-08-04 14:36:07.221438
2
+ 2024-08-04 14:36:07,223 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 14:36:07,225 INFO WriterThread:11584 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
4
+ 2024-08-04 14:36:07,226 DEBUG SenderThread:11584 [sender.py:send():382] send: header
5
+ 2024-08-04 14:36:07,240 DEBUG SenderThread:11584 [sender.py:send():382] send: run
6
+ 2024-08-04 14:36:07,696 INFO SenderThread:11584 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_143607-h7fxlkpt/files
7
+ 2024-08-04 14:36:07,696 INFO SenderThread:11584 [sender.py:_start_run_threads():1136] run started: h7fxlkpt with start time 1722749767.220741
8
+ 2024-08-04 14:36:07,701 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 14:36:07,701 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 14:36:07,791 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 14:36:07,798 DEBUG HandlerThread:11584 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 14:36:07,798 DEBUG HandlerThread:11584 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 14:36:07,798 INFO HandlerThread:11584 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 14:36:07,798 INFO SystemMonitor:11584 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 14:36:07,799 INFO HandlerThread:11584 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 14:36:07,799 INFO SystemMonitor:11584 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 14:36:07,799 INFO SystemMonitor:11584 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 14:36:07,800 INFO SystemMonitor:11584 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 14:36:07,801 INFO SystemMonitor:11584 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 14:36:07,802 INFO SystemMonitor:11584 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 14:36:07,811 DEBUG HandlerThread:11584 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 14:36:07,813 DEBUG HandlerThread:11584 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 14:36:07,825 DEBUG HandlerThread:11584 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 14:36:07,825 DEBUG HandlerThread:11584 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 14:36:07,825 DEBUG HandlerThread:11584 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:36:07.811618', 'startedAt': '2024-08-04T05:36:07.207201', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:35:56'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
26
+ 2024-08-04 14:36:07,825 INFO HandlerThread:11584 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 14:36:07,825 INFO HandlerThread:11584 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 14:36:07,827 INFO HandlerThread:11584 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 14:36:07,833 DEBUG SenderThread:11584 [sender.py:send():382] send: files
30
+ 2024-08-04 14:36:07,833 INFO SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 14:36:07,842 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 14:36:07,842 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 14:36:07,842 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-04 14:36:07,843 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-04 14:36:07,845 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 14:36:08,168 DEBUG SenderThread:11584 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 14:36:08,499 INFO wandb-upload_0:11584 [upload_job.py:push():131] Uploaded file /tmp/tmp7k_0gn43wandb/ux980mno-wandb-metadata.json
38
+ 2024-08-04 14:36:08,698 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
39
+ 2024-08-04 14:36:08,698 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
40
+ 2024-08-04 14:36:08,698 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json
41
+ 2024-08-04 14:36:10,261 DEBUG SenderThread:11584 [sender.py:send():382] send: config
42
+ 2024-08-04 14:36:10,262 DEBUG SenderThread:11584 [sender.py:send():382] send: config
43
+ 2024-08-04 14:36:10,349 DEBUG SenderThread:11584 [sender.py:send():382] send: exit
44
+ 2024-08-04 14:36:10,349 INFO SenderThread:11584 [sender.py:send_exit():589] handling exit code: 1
45
+ 2024-08-04 14:36:10,349 INFO SenderThread:11584 [sender.py:send_exit():591] handling runtime: 2
46
+ 2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
47
+ 2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:send_exit():597] send defer
48
+ 2024-08-04 14:36:10,351 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-04 14:36:10,351 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 0
50
+ 2024-08-04 14:36:10,351 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 0
52
+ 2024-08-04 14:36:10,351 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 1
53
+ 2024-08-04 14:36:10,352 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-04 14:36:10,352 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 1
55
+ 2024-08-04 14:36:10,352 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-04 14:36:10,352 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 1
57
+ 2024-08-04 14:36:10,352 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 2
58
+ 2024-08-04 14:36:10,352 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-04 14:36:10,352 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 2
60
+ 2024-08-04 14:36:10,352 INFO HandlerThread:11584 [system_monitor.py:finish():203] Stopping system monitor
61
+ 2024-08-04 14:36:10,352 DEBUG SystemMonitor:11584 [system_monitor.py:_start():172] Starting system metrics aggregation loop
62
+ 2024-08-04 14:36:10,352 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined cpu monitor
63
+ 2024-08-04 14:36:10,352 DEBUG SystemMonitor:11584 [system_monitor.py:_start():179] Finished system metrics aggregation loop
64
+ 2024-08-04 14:36:10,353 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined disk monitor
65
+ 2024-08-04 14:36:10,353 DEBUG SystemMonitor:11584 [system_monitor.py:_start():183] Publishing last batch of metrics
66
+ 2024-08-04 14:36:10,385 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined gpu monitor
67
+ 2024-08-04 14:36:10,385 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined memory monitor
68
+ 2024-08-04 14:36:10,386 INFO HandlerThread:11584 [interfaces.py:finish():202] Joined network monitor
69
+ 2024-08-04 14:36:10,386 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
70
+ 2024-08-04 14:36:10,386 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 2
71
+ 2024-08-04 14:36:10,386 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 3
72
+ 2024-08-04 14:36:10,386 DEBUG SenderThread:11584 [sender.py:send():382] send: stats
73
+ 2024-08-04 14:36:10,386 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
74
+ 2024-08-04 14:36:10,386 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 3
75
+ 2024-08-04 14:36:10,387 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
76
+ 2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 3
77
+ 2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 4
78
+ 2024-08-04 14:36:10,387 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 14:36:10,387 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 4
80
+ 2024-08-04 14:36:10,387 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 4
82
+ 2024-08-04 14:36:10,387 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 5
83
+ 2024-08-04 14:36:10,387 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 14:36:10,387 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 5
85
+ 2024-08-04 14:36:10,387 DEBUG SenderThread:11584 [sender.py:send():382] send: summary
86
+ 2024-08-04 14:36:10,388 INFO SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-04 14:36:10,388 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
88
+ 2024-08-04 14:36:10,388 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 5
89
+ 2024-08-04 14:36:10,388 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 6
90
+ 2024-08-04 14:36:10,389 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
91
+ 2024-08-04 14:36:10,389 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 6
92
+ 2024-08-04 14:36:10,389 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
93
+ 2024-08-04 14:36:10,389 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 6
94
+ 2024-08-04 14:36:10,391 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-04 14:36:10,576 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 7
96
+ 2024-08-04 14:36:10,577 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
97
+ 2024-08-04 14:36:10,577 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 7
98
+ 2024-08-04 14:36:10,577 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 14:36:10,577 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 7
100
+ 2024-08-04 14:36:10,699 INFO Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
101
+ 2024-08-04 14:36:10,699 INFO Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
102
+ 2024-08-04 14:36:10,699 INFO Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
103
+ 2024-08-04 14:36:11,349 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
104
+ 2024-08-04 14:36:12,530 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 8
105
+ 2024-08-04 14:36:12,530 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
106
+ 2024-08-04 14:36:12,530 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 14:36:12,531 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 14:36:12,531 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 14:36:12,531 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 14:36:12,531 INFO SenderThread:11584 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 14:36:12,532 INFO SenderThread:11584 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 14:36:12,546 INFO SenderThread:11584 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 14:36:12,554 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 14:36:12,555 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
115
+ 2024-08-04 14:36:12,555 DEBUG SenderThread:11584 [sender.py:send():382] send: artifact
116
+ 2024-08-04 14:36:12,555 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 14:36:12,700 INFO Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
118
+ 2024-08-04 14:36:13,350 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-04 14:36:13,435 INFO SenderThread:11584 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 14:36:13,435 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 14:36:13,435 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 14:36:13,435 INFO SenderThread:11584 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 14:36:13,701 INFO SenderThread:11584 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_143607-h7fxlkpt/files
124
+ 2024-08-04 14:36:13,701 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt requirements.txt
125
+ 2024-08-04 14:36:13,702 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml config.yaml
126
+ 2024-08-04 14:36:13,703 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 14:36:13,703 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 14:36:13,705 INFO SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log output.log
129
+ 2024-08-04 14:36:13,706 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 14:36:13,707 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 14:36:13,707 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 14:36:13,707 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 14:36:13,708 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 14:36:13,708 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 14:36:13,709 INFO SenderThread:11584 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 14:36:14,120 INFO wandb-upload_0:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
137
+ 2024-08-04 14:36:14,203 INFO wandb-upload_1:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
138
+ 2024-08-04 14:36:14,309 INFO wandb-upload_3:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
139
+ 2024-08-04 14:36:14,324 INFO wandb-upload_2:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
140
+ 2024-08-04 14:36:14,351 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
141
+ 2024-08-04 14:36:14,351 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
142
+ 2024-08-04 14:36:14,524 INFO Thread-11 (_thread_body):11584 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 14:36:14,524 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 14:36:14,524 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 14:36:14,524 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 14:36:14,524 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 14:36:14,524 INFO SenderThread:11584 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 14:36:14,525 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 14:36:14,525 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 14:36:14,525 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 14:36:14,525 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 14:36:14,525 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 14:36:14,525 INFO SenderThread:11584 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 14:36:14,732 INFO SenderThread:11584 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 14:36:14,732 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 14:36:14,732 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 14:36:14,732 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 14:36:14,732 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 14:36:14,732 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 14:36:14,732 INFO SenderThread:11584 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 14:36:14,732 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send():382] send: final
163
+ 2024-08-04 14:36:14,733 INFO HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 14
164
+ 2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send():382] send: footer
165
+ 2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 14:36:14,733 INFO SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 14:36:14,733 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 14:36:14,733 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 14:36:14,734 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 14:36:14,734 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: server_info
171
+ 2024-08-04 14:36:14,734 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
172
+ 2024-08-04 14:36:14,734 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: server_info
173
+ 2024-08-04 14:36:14,734 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: get_summary
174
+ 2024-08-04 14:36:14,736 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-04 14:36:14,736 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-04 14:36:14,736 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 14:36:14,893 DEBUG SenderThread:11584 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 14:36:14,893 INFO MainThread:11584 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 14:36:14,894 INFO MainThread:11584 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 14:36:14,894 INFO MainThread:11584 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 14:36:14,894 DEBUG HandlerThread:11584 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 14:36:14,894 INFO HandlerThread:11584 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 14:36:15,737 INFO WriterThread:11584 [datastore.py:close():296] close: /project/wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
184
+ 2024-08-04 14:36:15,893 INFO SenderThread:11584 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 14:36:15,894 INFO SenderThread:11584 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 14:36:15,894 INFO SenderThread:11584 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_143607-h7fxlkpt/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 14:36:07,213 INFO MainThread:11513 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Configure stats pid to 11513
3
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
6
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_143607-h7fxlkpt/logs/debug.log
9
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log
10
+ 2024-08-04 14:36:07,214 INFO MainThread:11513 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 14:36:07,215 INFO MainThread:11513 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:35:56', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 14:36:07,215 INFO MainThread:11513 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 14:36:07,215 INFO MainThread:11513 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 14:36:07,219 INFO MainThread:11513 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 14:36:07,220 INFO MainThread:11513 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 14:36:07,225 INFO MainThread:11513 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 14:36:07,236 INFO MainThread:11513 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 14:36:07,701 INFO MainThread:11513 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 14:36:07,784 INFO MainThread:11513 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 14:36:07,784 INFO MainThread:11513 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 14:36:07,841 INFO MainThread:11513 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 14:36:07,842 INFO MainThread:11513 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 14:36:07,842 INFO MainThread:11513 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 14:36:07,842 INFO MainThread:11513 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 14:36:07,843 INFO MainThread:11513 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 14:36:10,261 INFO MainThread:11513 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 14:36:10,261 INFO MainThread:11513 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 14:36:15,895 WARNING MsgRouterThr:11513 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb ADDED
Binary file (20.4 kB). View file
 
wandb/run-20240804_221132-o8ieoj9i/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/google/gemma-2-2b
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-gemma-2-2b_train_2024-08-04-22:11:21
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/google/gemma-2-2b
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 2
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-gemma-2-2b
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 256000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 160
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722777092.265577
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: gelu_pytorch_tanh
318
+ hidden_size:
319
+ desc: null
320
+ value: 2304
321
+ model_type:
322
+ desc: null
323
+ value: gemma2
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 4096
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 8
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 26
333
+ model_architecture:
334
+ desc: null
335
+ value: Gemma2ForCausalLM
wandb/run-20240804_221132-o8ieoj9i/files/output.log ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+
10
+
11
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [02:29<01:15, 75.36s/it]
12
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
13
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
14
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
15
+ --> Model /share/pretrained_lm/google/gemma-2-2b
16
+ --> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
17
+ BFloat16 enabled for mixed precision - using bfSixteen policy
18
+ --> applying fsdp activation checkpointing...
19
+ > datasets target sizes (minimum size):
20
+ train: 6400000
21
+ validation: 323200
22
+ test: 3200
23
+ Loading checkpoint shards: 100%|██████████| 3/3 [02:38<00:00, 52.69s/it]
24
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
25
+ warnings.warn(
26
+ Let split = None
27
+ Building a BlendedDataset for a single MegatronDataset
28
+ > finished creating GPT datasets ...
29
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
30
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
31
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
32
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
33
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
34
+ model info: FullyShardedDataParallel(
35
+ (_fsdp_wrapped_module): Gemma2ForCausalLM(
36
+ (model): Gemma2Model(
37
+ (embed_tokens): Embedding(256000, 2304, padding_idx=0)
38
+ (layers): ModuleList(
39
+ (0-25): 26 x FullyShardedDataParallel(
40
+ (_fsdp_wrapped_module): CheckpointWrapper(
41
+ (_checkpoint_wrapped_module): Gemma2DecoderLayer(
42
+ (self_attn): Gemma2FlashAttention2(
43
+ (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
44
+ (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
45
+ (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
46
+ (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
47
+ (rotary_emb): Gemma2RotaryEmbedding()
48
+ )
49
+ (mlp): Gemma2MLP(
50
+ (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
51
+ (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
52
+ (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
53
+ (act_fn): PytorchGELUTanh()
54
+ )
55
+ (input_layernorm): Gemma2RMSNorm()
56
+ (post_attention_layernorm): Gemma2RMSNorm()
57
+ (pre_feedforward_layernorm): Gemma2RMSNorm()
58
+ (post_feedforward_layernorm): Gemma2RMSNorm()
59
+ )
60
+ )
61
+ )
62
+ )
63
+ (norm): Gemma2RMSNorm()
64
+ )
65
+ (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
66
+ )
67
+ )
68
+ model config: Gemma2Config {
69
+ "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
70
+ "architectures": [
71
+ "Gemma2ForCausalLM"
72
+ ],
73
+ "attention_bias": false,
74
+ "attention_dropout": 0.0,
75
+ "attn_logit_softcapping": 50.0,
76
+ "bos_token_id": 2,
77
+ "cache_implementation": "hybrid",
78
+ "eos_token_id": 1,
79
+ "final_logit_softcapping": 30.0,
80
+ "head_dim": 256,
81
+ "hidden_act": "gelu_pytorch_tanh",
82
+ "hidden_activation": "gelu_pytorch_tanh",
83
+ "hidden_size": 2304,
84
+ "initializer_range": 0.02,
85
+ "intermediate_size": 9216,
86
+ "label_smoothing": 0.0,
87
+ "max_position_embeddings": 4096,
88
+ "model_type": "gemma2",
89
+ "num_attention_heads": 8,
90
+ "num_hidden_layers": 26,
91
+ "num_key_value_heads": 4,
92
+ "pad_token_id": 0,
93
+ "query_pre_attn_scalar": 256,
94
+ "rms_norm_eps": 1e-06,
95
+ "rope_theta": 10000.0,
96
+ "sliding_window": 4096,
97
+ "torch_dtype": "float32",
98
+ "transformers_version": "4.43.3",
99
+ "use_cache": false,
100
+ "vocab_size": 256000
101
+ }
102
+ Unable to save the indexes because path_to_cache is None
103
+ Building a BlendedDataset for a single MegatronDataset
104
+ Unable to save the indexes because path_to_cache is None
105
+ Building a BlendedDataset for a single MegatronDataset
106
+ Unable to save the indexes because path_to_cache is None
107
+ It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
108
+ Traceback (most recent call last):
109
+ File "/project/examples/finetuning.py", line 13, in <module>
110
+ main()
111
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
112
+ train(
113
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
114
+ loss: torch.Tensor = model(**batch).loss
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
116
+ return self._call_impl(*args, **kwargs)
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
118
+ return forward_call(*args, **kwargs)
119
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
120
+ output = self._fsdp_wrapped_module(*args, **kwargs)
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
122
+ return self._call_impl(*args, **kwargs)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
124
+ return forward_call(*args, **kwargs)
125
+ File "/project/lib/transformers/src/transformers/models/gemma2/modeling_gemma2.py", line 976, in forward
126
+ loss = loss_fct(shift_logits, shift_labels)
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
128
+ return self._call_impl(*args, **kwargs)
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
130
+ return forward_call(*args, **kwargs)
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py", line 1179, in forward
132
+ return F.cross_entropy(input, target, weight=self.weight,
133
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 3086, in cross_entropy
134
+ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
135
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.81 GiB. GPU 0 has a total capacity of 39.39 GiB of which 7.81 GiB is free. Including non-PyTorch memory, this process has 31.58 GiB memory in use. Of the allocated memory 30.38 GiB is allocated by PyTorch, and 385.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T13:11:32.902217",
5
+ "startedAt": "2024-08-04T13:11:32.253120",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "2",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/google/gemma-2-2b",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/google/gemma-2-2b",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-gemma-2-2b",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-gemma-2-2b_train_2024-08-04-22:11:21"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "0336bd6c20fe25d78eda1d14afa66c1ae2e6d687"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.044999999999,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.045,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.045,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.045,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.045,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.045,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.045,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.045,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.045,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.045,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.045,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.045,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.045,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.045,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.045,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.045,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.045,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.045,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.045,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 166}}
wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 22:11:32,267 INFO StreamThr :12237 [internal.py:wandb_internal():86] W&B internal server running at pid: 12237, started at: 2024-08-04 22:11:32.266168
2
+ 2024-08-04 22:11:32,268 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 22:11:32,270 INFO WriterThread:12237 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
4
+ 2024-08-04 22:11:32,271 DEBUG SenderThread:12237 [sender.py:send():382] send: header
5
+ 2024-08-04 22:11:32,285 DEBUG SenderThread:12237 [sender.py:send():382] send: run
6
+ 2024-08-04 22:11:32,779 INFO SenderThread:12237 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_221132-o8ieoj9i/files
7
+ 2024-08-04 22:11:32,779 INFO SenderThread:12237 [sender.py:_start_run_threads():1136] run started: o8ieoj9i with start time 1722777092.265577
8
+ 2024-08-04 22:11:32,784 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 22:11:32,784 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 22:11:32,884 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 22:11:32,890 DEBUG HandlerThread:12237 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 22:11:32,890 DEBUG HandlerThread:12237 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 22:11:32,890 INFO HandlerThread:12237 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 22:11:32,890 INFO SystemMonitor:12237 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 22:11:32,890 INFO HandlerThread:12237 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 22:11:32,891 INFO SystemMonitor:12237 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 22:11:32,891 INFO SystemMonitor:12237 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 22:11:32,892 INFO SystemMonitor:12237 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 22:11:32,893 INFO SystemMonitor:12237 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 22:11:32,893 INFO SystemMonitor:12237 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 22:11:32,902 DEBUG HandlerThread:12237 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 22:11:32,904 DEBUG HandlerThread:12237 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 22:11:32,916 DEBUG HandlerThread:12237 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 22:11:32,916 DEBUG HandlerThread:12237 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 22:11:32,916 DEBUG HandlerThread:12237 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T13:11:32.902217', 'startedAt': '2024-08-04T13:11:32.253120', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '2', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-04-22:11:21'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '0336bd6c20fe25d78eda1d14afa66c1ae2e6d687'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.044999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-04 22:11:32,916 INFO HandlerThread:12237 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 22:11:32,916 INFO HandlerThread:12237 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 22:11:32,917 INFO HandlerThread:12237 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 22:11:32,923 DEBUG SenderThread:12237 [sender.py:send():382] send: files
30
+ 2024-08-04 22:11:32,923 INFO SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 22:11:32,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 22:11:32,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 22:11:32,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-04 22:11:32,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-04 22:11:32,935 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 22:11:33,202 DEBUG SenderThread:12237 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 22:11:33,617 INFO wandb-upload_0:12237 [upload_job.py:push():131] Uploaded file /tmp/tmpntsoky67wandb/ybme98wl-wandb-metadata.json
38
+ 2024-08-04 22:11:33,780 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
39
+ 2024-08-04 22:11:33,781 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json
40
+ 2024-08-04 22:11:33,781 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
41
+ 2024-08-04 22:11:35,781 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
42
+ 2024-08-04 22:11:37,800 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-04 22:11:42,801 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-04 22:11:47,802 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-04 22:11:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
46
+ 2024-08-04 22:11:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
47
+ 2024-08-04 22:11:47,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
48
+ 2024-08-04 22:11:53,184 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-04 22:11:58,184 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-04 22:12:02,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
51
+ 2024-08-04 22:12:02,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
52
+ 2024-08-04 22:12:02,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
53
+ 2024-08-04 22:12:04,128 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-04 22:12:04,797 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
55
+ 2024-08-04 22:12:09,335 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-04 22:12:14,336 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-04 22:12:17,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
58
+ 2024-08-04 22:12:17,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
59
+ 2024-08-04 22:12:17,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
60
+ 2024-08-04 22:12:20,198 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-04 22:12:25,199 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-04 22:12:30,199 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-04 22:12:32,894 DEBUG SystemMonitor:12237 [system_monitor.py:_start():172] Starting system metrics aggregation loop
64
+ 2024-08-04 22:12:32,895 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
65
+ 2024-08-04 22:12:32,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
66
+ 2024-08-04 22:12:32,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
67
+ 2024-08-04 22:12:32,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
68
+ 2024-08-04 22:12:36,110 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
69
+ 2024-08-04 22:12:41,111 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-04 22:12:45,820 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
71
+ 2024-08-04 22:12:46,558 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-04 22:12:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
73
+ 2024-08-04 22:12:47,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
74
+ 2024-08-04 22:12:47,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
75
+ 2024-08-04 22:12:52,156 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-04 22:12:57,157 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
77
+ 2024-08-04 22:13:02,157 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
78
+ 2024-08-04 22:13:02,897 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
79
+ 2024-08-04 22:13:02,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
80
+ 2024-08-04 22:13:02,932 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
81
+ 2024-08-04 22:13:02,972 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
82
+ 2024-08-04 22:13:08,124 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
83
+ 2024-08-04 22:13:13,125 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
84
+ 2024-08-04 22:13:17,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
85
+ 2024-08-04 22:13:17,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
86
+ 2024-08-04 22:13:17,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
87
+ 2024-08-04 22:13:18,132 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
88
+ 2024-08-04 22:13:23,133 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-04 22:13:28,134 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-04 22:13:32,898 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
91
+ 2024-08-04 22:13:32,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
92
+ 2024-08-04 22:13:32,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
93
+ 2024-08-04 22:13:32,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
94
+ 2024-08-04 22:13:33,205 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-04 22:13:38,206 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-04 22:13:43,207 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
97
+ 2024-08-04 22:13:47,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-04 22:13:47,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-04 22:13:47,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-04 22:13:49,120 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-04 22:13:54,121 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-04 22:13:59,122 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
103
+ 2024-08-04 22:14:02,898 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
104
+ 2024-08-04 22:14:02,932 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
105
+ 2024-08-04 22:14:02,933 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
106
+ 2024-08-04 22:14:02,976 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
107
+ 2024-08-04 22:14:04,197 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-04 22:14:04,864 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
109
+ 2024-08-04 22:14:09,198 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
110
+ 2024-08-04 22:14:13,453 DEBUG SenderThread:12237 [sender.py:send():382] send: config
111
+ 2024-08-04 22:14:13,453 DEBUG SenderThread:12237 [sender.py:send():382] send: config
112
+ 2024-08-04 22:14:13,869 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
113
+ 2024-08-04 22:14:14,550 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-04 22:14:15,870 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
115
+ 2024-08-04 22:14:17,933 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
116
+ 2024-08-04 22:14:17,934 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
117
+ 2024-08-04 22:14:17,934 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
118
+ 2024-08-04 22:14:19,437 DEBUG SenderThread:12237 [sender.py:send():382] send: exit
119
+ 2024-08-04 22:14:19,437 INFO SenderThread:12237 [sender.py:send_exit():589] handling exit code: 1
120
+ 2024-08-04 22:14:19,437 INFO SenderThread:12237 [sender.py:send_exit():591] handling runtime: 166
121
+ 2024-08-04 22:14:19,438 INFO SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
122
+ 2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:send_exit():597] send defer
123
+ 2024-08-04 22:14:19,439 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
124
+ 2024-08-04 22:14:19,439 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 0
125
+ 2024-08-04 22:14:19,439 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
126
+ 2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 0
127
+ 2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 1
128
+ 2024-08-04 22:14:19,439 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
129
+ 2024-08-04 22:14:19,439 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 1
130
+ 2024-08-04 22:14:19,439 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
131
+ 2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 1
132
+ 2024-08-04 22:14:19,439 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 2
133
+ 2024-08-04 22:14:19,439 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
134
+ 2024-08-04 22:14:19,440 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 2
135
+ 2024-08-04 22:14:19,440 INFO HandlerThread:12237 [system_monitor.py:finish():203] Stopping system monitor
136
+ 2024-08-04 22:14:19,440 DEBUG SystemMonitor:12237 [system_monitor.py:_start():179] Finished system metrics aggregation loop
137
+ 2024-08-04 22:14:19,440 DEBUG SystemMonitor:12237 [system_monitor.py:_start():183] Publishing last batch of metrics
138
+ 2024-08-04 22:14:19,440 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined cpu monitor
139
+ 2024-08-04 22:14:19,441 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined disk monitor
140
+ 2024-08-04 22:14:19,474 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined gpu monitor
141
+ 2024-08-04 22:14:19,474 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined memory monitor
142
+ 2024-08-04 22:14:19,474 INFO HandlerThread:12237 [interfaces.py:finish():202] Joined network monitor
143
+ 2024-08-04 22:14:19,475 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
144
+ 2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 2
145
+ 2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 3
146
+ 2024-08-04 22:14:19,475 DEBUG SenderThread:12237 [sender.py:send():382] send: stats
147
+ 2024-08-04 22:14:19,475 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
148
+ 2024-08-04 22:14:19,475 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 3
149
+ 2024-08-04 22:14:19,475 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
150
+ 2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 3
151
+ 2024-08-04 22:14:19,475 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 4
152
+ 2024-08-04 22:14:19,475 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
153
+ 2024-08-04 22:14:19,475 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 4
154
+ 2024-08-04 22:14:19,476 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
155
+ 2024-08-04 22:14:19,476 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 4
156
+ 2024-08-04 22:14:19,476 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 5
157
+ 2024-08-04 22:14:19,476 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
158
+ 2024-08-04 22:14:19,476 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 5
159
+ 2024-08-04 22:14:19,476 DEBUG SenderThread:12237 [sender.py:send():382] send: summary
160
+ 2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
161
+ 2024-08-04 22:14:19,477 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
162
+ 2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 5
163
+ 2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 6
164
+ 2024-08-04 22:14:19,477 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
165
+ 2024-08-04 22:14:19,477 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 6
166
+ 2024-08-04 22:14:19,477 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
167
+ 2024-08-04 22:14:19,477 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 6
168
+ 2024-08-04 22:14:19,480 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
169
+ 2024-08-04 22:14:19,712 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 7
170
+ 2024-08-04 22:14:19,712 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
171
+ 2024-08-04 22:14:19,712 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 7
172
+ 2024-08-04 22:14:19,712 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
173
+ 2024-08-04 22:14:19,712 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 7
174
+ 2024-08-04 22:14:19,873 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
175
+ 2024-08-04 22:14:19,874 INFO Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
176
+ 2024-08-04 22:14:20,437 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
177
+ 2024-08-04 22:14:20,874 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
178
+ 2024-08-04 22:14:21,905 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 8
179
+ 2024-08-04 22:14:21,905 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
180
+ 2024-08-04 22:14:21,905 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
181
+ 2024-08-04 22:14:21,906 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 8
182
+ 2024-08-04 22:14:21,906 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
183
+ 2024-08-04 22:14:21,906 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 8
184
+ 2024-08-04 22:14:21,906 INFO SenderThread:12237 [job_builder.py:build():296] Attempting to build job artifact
185
+ 2024-08-04 22:14:21,907 INFO SenderThread:12237 [job_builder.py:_get_source_type():426] is repo sourced job
186
+ 2024-08-04 22:14:21,921 INFO SenderThread:12237 [job_builder.py:build():402] adding wandb-job metadata file
187
+ 2024-08-04 22:14:21,929 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 9
188
+ 2024-08-04 22:14:21,929 DEBUG SenderThread:12237 [sender.py:send():382] send: artifact
189
+ 2024-08-04 22:14:21,929 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
190
+ 2024-08-04 22:14:21,931 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 9
191
+ 2024-08-04 22:14:22,437 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
192
+ 2024-08-04 22:14:22,875 INFO Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
193
+ 2024-08-04 22:14:23,127 INFO wandb-upload_0:12237 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpaydno9il
194
+ 2024-08-04 22:14:23,543 INFO wandb-upload_1:12237 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpaetcwljm
195
+ 2024-08-04 22:14:24,702 INFO SenderThread:12237 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5ODUzNDkwNw==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
196
+ 2024-08-04 22:14:24,702 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
197
+ 2024-08-04 22:14:24,702 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 9
198
+ 2024-08-04 22:14:24,702 INFO SenderThread:12237 [dir_watcher.py:finish():358] shutting down directory watcher
199
+ 2024-08-04 22:14:24,876 INFO SenderThread:12237 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_221132-o8ieoj9i/files
200
+ 2024-08-04 22:14:24,876 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt requirements.txt
201
+ 2024-08-04 22:14:24,876 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml config.yaml
202
+ 2024-08-04 22:14:24,878 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json wandb-metadata.json
203
+ 2024-08-04 22:14:24,878 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json wandb-summary.json
204
+ 2024-08-04 22:14:24,879 INFO SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log output.log
205
+ 2024-08-04 22:14:24,881 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 10
206
+ 2024-08-04 22:14:24,881 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
207
+ 2024-08-04 22:14:24,881 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
208
+ 2024-08-04 22:14:24,882 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 10
209
+ 2024-08-04 22:14:24,882 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-04 22:14:24,883 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
211
+ 2024-08-04 22:14:24,883 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 10
212
+ 2024-08-04 22:14:24,883 INFO SenderThread:12237 [file_pusher.py:finish():172] shutting down file pusher
213
+ 2024-08-04 22:14:25,282 INFO wandb-upload_0:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
214
+ 2024-08-04 22:14:25,375 INFO wandb-upload_1:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
215
+ 2024-08-04 22:14:25,438 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
216
+ 2024-08-04 22:14:25,438 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
217
+ 2024-08-04 22:14:25,461 INFO wandb-upload_2:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
218
+ 2024-08-04 22:14:25,480 INFO wandb-upload_3:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
219
+ 2024-08-04 22:14:25,680 INFO Thread-11 (_thread_body):12237 [sender.py:transition_state():617] send defer: 11
220
+ 2024-08-04 22:14:25,681 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
221
+ 2024-08-04 22:14:25,681 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 11
222
+ 2024-08-04 22:14:25,681 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
223
+ 2024-08-04 22:14:25,681 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 11
224
+ 2024-08-04 22:14:25,681 INFO SenderThread:12237 [file_pusher.py:join():178] waiting for file pusher
225
+ 2024-08-04 22:14:25,681 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 12
226
+ 2024-08-04 22:14:25,681 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
227
+ 2024-08-04 22:14:25,681 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 12
228
+ 2024-08-04 22:14:25,681 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
229
+ 2024-08-04 22:14:25,681 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 12
230
+ 2024-08-04 22:14:25,681 INFO SenderThread:12237 [file_stream.py:finish():595] file stream finish called
231
+ 2024-08-04 22:14:25,848 INFO SenderThread:12237 [file_stream.py:finish():599] file stream finish is done
232
+ 2024-08-04 22:14:25,848 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 13
233
+ 2024-08-04 22:14:25,849 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
234
+ 2024-08-04 22:14:25,849 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 13
235
+ 2024-08-04 22:14:25,849 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
236
+ 2024-08-04 22:14:25,849 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 13
237
+ 2024-08-04 22:14:25,849 INFO SenderThread:12237 [sender.py:transition_state():617] send defer: 14
238
+ 2024-08-04 22:14:25,849 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
239
+ 2024-08-04 22:14:25,849 DEBUG SenderThread:12237 [sender.py:send():382] send: final
240
+ 2024-08-04 22:14:25,849 INFO HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 14
241
+ 2024-08-04 22:14:25,849 DEBUG SenderThread:12237 [sender.py:send():382] send: footer
242
+ 2024-08-04 22:14:25,850 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: defer
243
+ 2024-08-04 22:14:25,850 INFO SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 14
244
+ 2024-08-04 22:14:25,850 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
245
+ 2024-08-04 22:14:25,850 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
246
+ 2024-08-04 22:14:25,850 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
247
+ 2024-08-04 22:14:25,851 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
248
+ 2024-08-04 22:14:25,851 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: server_info
249
+ 2024-08-04 22:14:25,851 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: get_summary
250
+ 2024-08-04 22:14:25,851 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: sampled_history
251
+ 2024-08-04 22:14:25,852 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: server_info
252
+ 2024-08-04 22:14:25,852 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
253
+ 2024-08-04 22:14:25,853 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: job_info
254
+ 2024-08-04 22:14:26,030 DEBUG SenderThread:12237 [sender.py:send_request():409] send_request: job_info
255
+ 2024-08-04 22:14:26,030 INFO MainThread:12237 [wandb_run.py:_footer_history_summary_info():3866] rendering history
256
+ 2024-08-04 22:14:26,030 INFO MainThread:12237 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
257
+ 2024-08-04 22:14:26,030 INFO MainThread:12237 [wandb_run.py:_footer_sync_info():3825] logging synced files
258
+ 2024-08-04 22:14:26,031 DEBUG HandlerThread:12237 [handler.py:handle_request():146] handle_request: shutdown
259
+ 2024-08-04 22:14:26,031 INFO HandlerThread:12237 [handler.py:finish():869] shutting down handler
260
+ 2024-08-04 22:14:26,853 INFO WriterThread:12237 [datastore.py:close():296] close: /project/wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
261
+ 2024-08-04 22:14:27,030 INFO SenderThread:12237 [sender.py:finish():1572] shutting down sender
262
+ 2024-08-04 22:14:27,030 INFO SenderThread:12237 [file_pusher.py:finish():172] shutting down file pusher
263
+ 2024-08-04 22:14:27,030 INFO SenderThread:12237 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_221132-o8ieoj9i/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Configure stats pid to 12166
3
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_221132-o8ieoj9i/logs/debug.log
9
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log
10
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 22:11:32,259 INFO MainThread:12166 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-04-22:11:21', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 2, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 160}
13
+ 2024-08-04 22:11:32,260 INFO MainThread:12166 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 22:11:32,260 INFO MainThread:12166 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 22:11:32,264 INFO MainThread:12166 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 22:11:32,265 INFO MainThread:12166 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 22:11:32,270 INFO MainThread:12166 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 22:11:32,281 INFO MainThread:12166 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 22:11:32,783 INFO MainThread:12166 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 22:11:32,877 INFO MainThread:12166 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 22:11:32,877 INFO MainThread:12166 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 22:11:32,932 INFO MainThread:12166 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 22:11:32,933 INFO MainThread:12166 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 22:14:13,452 INFO MainThread:12166 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26, 'model_architecture': 'Gemma2ForCausalLM'}
29
+ 2024-08-04 22:14:13,453 INFO MainThread:12166 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 22:14:27,031 WARNING MsgRouterThr:12166 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb ADDED
Binary file (27.3 kB). View file
 
wandb/run-20240812_052853-n84i0o06/files/config.yaml ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-12-05:28:42
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 5
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723408133.524123
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
wandb/run-20240812_052853-n84i0o06/files/output.log ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
11
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
12
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
13
+ BFloat16 enabled for mixed precision - using bfSixteen policy
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
16
+ warnings.warn(
17
+ --> applying fsdp activation checkpointing...
18
+ > datasets target sizes (minimum size):
19
+ train: 6400000
20
+ validation: 323200
21
+ test: 3200
22
+ > building train, validation, and test datasets for GPT ...
23
+ Let split = None
24
+ > finished creating GPT datasets ...
25
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
26
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
27
+ No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
28
+ File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
29
+ Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
30
+ model info: FullyShardedDataParallel(
31
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
32
+ (model): Qwen2Model(
33
+ (embed_tokens): Embedding(151936, 896)
34
+ (layers): ModuleList(
35
+ (0-23): 24 x FullyShardedDataParallel(
36
+ (_fsdp_wrapped_module): CheckpointWrapper(
37
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
38
+ (self_attn): Qwen2FlashAttention2(
39
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
40
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
41
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
42
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
43
+ (rotary_emb): Qwen2RotaryEmbedding()
44
+ )
45
+ (mlp): Qwen2MLP(
46
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
47
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
48
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
49
+ (act_fn): SiLU()
50
+ )
51
+ (input_layernorm): Qwen2RMSNorm()
52
+ (post_attention_layernorm): Qwen2RMSNorm()
53
+ )
54
+ )
55
+ )
56
+ )
57
+ (norm): Qwen2RMSNorm()
58
+ )
59
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
60
+ )
61
+ )
62
+ model config: Qwen2Config {
63
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "architectures": [
65
+ "Qwen2ForCausalLM"
66
+ ],
67
+ "attention_dropout": 0.0,
68
+ "bos_token_id": 151643,
69
+ "eos_token_id": 151643,
70
+ "hidden_act": "silu",
71
+ "hidden_size": 896,
72
+ "initializer_range": 0.02,
73
+ "intermediate_size": 4864,
74
+ "label_smoothing": 0.0,
75
+ "max_position_embeddings": 4096,
76
+ "max_window_layers": 24,
77
+ "model_type": "qwen2",
78
+ "num_attention_heads": 14,
79
+ "num_hidden_layers": 24,
80
+ "num_key_value_heads": 2,
81
+ "rms_norm_eps": 1e-06,
82
+ "rope_theta": 1000000.0,
83
+ "sliding_window": null,
84
+ "tie_word_embeddings": true,
85
+ "torch_dtype": "bfloat16",
86
+ "transformers_version": "4.43.3",
87
+ "use_cache": false,
88
+ "use_sliding_window": false,
89
+ "vocab_size": 151936
90
+ }
91
+ Building a BlendedDataset for a single MegatronDataset
92
+ Unable to save the indexes because path_to_cache is None
93
+ Building a BlendedDataset for a single MegatronDataset
94
+ Unable to save the indexes because path_to_cache is None
95
+ Building a BlendedDataset for a single MegatronDataset
96
+ Unable to save the indexes because path_to_cache is None
97
+ ------------------------------------------------------------------
98
+ iteration: 1 , TFLOPS: 67.05501421617748, Tokens per sec: 16676.24515769431, Loss: 4.1814446449279785
99
+ ------------------------------------------------------------------
100
+ ------------------------------------------------------------------
101
+ iteration: 2 , TFLOPS: 70.71126656778048, Tokens per sec: 17585.5367488818, Loss: 4.19144344329834
102
+ ------------------------------------------------------------------
103
+ ------------------------------------------------------------------
104
+ iteration: 3 , TFLOPS: 70.545913767934, Tokens per sec: 17544.41433827636, Loss: 4.197675704956055
105
+ ------------------------------------------------------------------
106
+ ------------------------------------------------------------------
107
+ iteration: 4 , TFLOPS: 70.68479486678217, Tokens per sec: 17578.953369834773, Loss: 4.183629989624023
108
+ ------------------------------------------------------------------
109
+ ------------------------------------------------------------------
110
+ iteration: 5 , TFLOPS: 70.61673302016509, Tokens per sec: 17562.0267305172, Loss: 4.198177337646484
111
+ ------------------------------------------------------------------
112
+ Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005
113
+ Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
114
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
115
+ warnings.warn(
116
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
117
+ warnings.warn(
118
+ Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
119
+ Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
120
+ [rank0]:[2024-08-12 05:35:23,399] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.00647389400000975, 'preprocessing_with_comm': 0.0007460029999037943, 'state_converting': 0.9694889820000299, <Type.ALL: 'all'>: 0.9780955020000874})
121
+ Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
122
+ Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
123
+ Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
124
+ Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
125
+ Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
126
+ Traceback (most recent call last):
127
+ File "/project/examples/finetuning.py", line 13, in <module>
128
+ main()
129
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
130
+ train(
131
+ File "/project/src/llama_recipes/utils/train_utils.py", line 175, in train
132
+ save_checkpoint(
133
+ File "/project/src/llama_recipes/utils/checkpoint.py", line 168, in save_checkpoint
134
+ tokenizer.tokenizer.save_pretrained(tokenizer_path)
135
+ File "/project/lib/transformers/src/transformers/tokenization_utils_base.py", line 2622, in save_pretrained
136
+ if os.path.isfile(save_directory):
137
+ File "/usr/lib/python3.10/genericpath.py", line 30, in isfile
138
+ st = os.stat(path)
139
+ TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType
wandb/run-20240812_052853-n84i0o06/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T20:28:54.148690",
5
+ "startedAt": "2024-08-11T20:28:53.511276",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "5",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-12-05:28:42"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"training/loss": 4.198177337646484, "training/perplexity": 66.56489507784042, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 5, "optimizer/lr": 1.19e-06, "optimizer/variance_l2": 0.00650817005037245, "optimizer/variance_sqrt_l2": 0.4753125323283669, "optimizer/momentum_l2": 0.4059003829432183, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.22650909423828125, "optimizer/variance_sqrt_l1": 1979.75, "optimizer/momentum_l1": 1591.375, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.004669189453125, "optimizer/variance_sqrt_abs_max": 0.068359375, "optimizer/momentum_abs_max": 0.058837890625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.65197611400004, "stats/tokens_per_sec": 17562.0267305172, "stats/tokens_per_sec_per_gpu": 17562.0267305172, "stats/tflops": 70.61673302016509, "_timestamp": 1723408520.9273944, "_runtime": 387.4032714366913, "_step": 5, "_wandb": {"runtime": 391}}
wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 05:28:53,525 INFO StreamThr :10531 [internal.py:wandb_internal():86] W&B internal server running at pid: 10531, started at: 2024-08-12 05:28:53.524894
2
+ 2024-08-12 05:28:53,527 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 05:28:53,529 INFO WriterThread:10531 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
4
+ 2024-08-12 05:28:53,530 DEBUG SenderThread:10531 [sender.py:send():382] send: header
5
+ 2024-08-12 05:28:53,544 DEBUG SenderThread:10531 [sender.py:send():382] send: run
6
+ 2024-08-12 05:28:54,033 INFO SenderThread:10531 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_052853-n84i0o06/files
7
+ 2024-08-12 05:28:54,033 INFO SenderThread:10531 [sender.py:_start_run_threads():1136] run started: n84i0o06 with start time 1723408133.524123
8
+ 2024-08-12 05:28:54,038 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 05:28:54,038 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 05:28:54,128 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 05:28:54,135 DEBUG HandlerThread:10531 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 05:28:54,135 DEBUG HandlerThread:10531 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 05:28:54,135 INFO HandlerThread:10531 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 05:28:54,135 INFO SystemMonitor:10531 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 05:28:54,135 INFO HandlerThread:10531 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 05:28:54,136 INFO SystemMonitor:10531 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 05:28:54,136 INFO SystemMonitor:10531 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 05:28:54,137 INFO SystemMonitor:10531 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 05:28:54,138 INFO SystemMonitor:10531 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 05:28:54,139 INFO SystemMonitor:10531 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 05:28:54,148 DEBUG HandlerThread:10531 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 05:28:54,150 DEBUG HandlerThread:10531 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 05:28:54,163 DEBUG HandlerThread:10531 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 05:28:54,163 DEBUG HandlerThread:10531 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 05:28:54,163 DEBUG HandlerThread:10531 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T20:28:54.148690', 'startedAt': '2024-08-11T20:28:53.511276', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-05:28:42'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 05:28:54,163 INFO HandlerThread:10531 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 05:28:54,163 INFO HandlerThread:10531 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 05:28:54,164 INFO HandlerThread:10531 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 05:28:54,170 DEBUG SenderThread:10531 [sender.py:send():382] send: files
30
+ 2024-08-12 05:28:54,170 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 05:28:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 05:28:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-12 05:28:54,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-12 05:28:54,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-12 05:28:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 05:28:54,475 DEBUG SenderThread:10531 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 05:28:54,885 INFO wandb-upload_0:10531 [upload_job.py:push():131] Uploaded file /tmp/tmp0u7r0fs3wandb/exuilam8-wandb-metadata.json
38
+ 2024-08-12 05:28:55,035 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json
39
+ 2024-08-12 05:28:55,035 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt
40
+ 2024-08-12 05:28:56,035 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
41
+ 2024-08-12 05:28:58,036 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
42
+ 2024-08-12 05:28:59,328 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 05:29:00,038 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
44
+ 2024-08-12 05:29:01,878 DEBUG SenderThread:10531 [sender.py:send():382] send: config
45
+ 2024-08-12 05:29:01,879 DEBUG SenderThread:10531 [sender.py:send():382] send: config
46
+ 2024-08-12 05:29:02,039 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
47
+ 2024-08-12 05:29:04,040 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
48
+ 2024-08-12 05:29:04,879 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
49
+ 2024-08-12 05:29:09,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
50
+ 2024-08-12 05:29:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
51
+ 2024-08-12 05:29:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
52
+ 2024-08-12 05:29:10,368 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
53
+ 2024-08-12 05:29:15,369 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
54
+ 2024-08-12 05:29:20,370 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-12 05:29:24,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
56
+ 2024-08-12 05:29:24,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
57
+ 2024-08-12 05:29:24,220 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
58
+ 2024-08-12 05:29:26,367 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
59
+ 2024-08-12 05:29:27,058 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml
60
+ 2024-08-12 05:29:31,577 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
61
+ 2024-08-12 05:29:36,578 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-12 05:29:39,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
63
+ 2024-08-12 05:29:39,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
64
+ 2024-08-12 05:29:39,220 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
65
+ 2024-08-12 05:29:42,448 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-12 05:29:47,449 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-12 05:29:52,450 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
68
+ 2024-08-12 05:29:54,139 DEBUG SystemMonitor:10531 [system_monitor.py:_start():172] Starting system metrics aggregation loop
69
+ 2024-08-12 05:29:54,141 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
70
+ 2024-08-12 05:29:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
71
+ 2024-08-12 05:29:54,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
72
+ 2024-08-12 05:29:54,220 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
73
+ 2024-08-12 05:29:58,446 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
74
+ 2024-08-12 05:30:03,447 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
75
+ 2024-08-12 05:30:08,448 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-12 05:30:09,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
77
+ 2024-08-12 05:30:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
78
+ 2024-08-12 05:30:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
79
+ 2024-08-12 05:30:13,456 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
80
+ 2024-08-12 05:30:18,457 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
81
+ 2024-08-12 05:30:22,408 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
82
+ 2024-08-12 05:30:24,097 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
83
+ 2024-08-12 05:30:24,142 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
84
+ 2024-08-12 05:30:24,142 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
85
+ 2024-08-12 05:30:24,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
86
+ 2024-08-12 05:30:24,180 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
87
+ 2024-08-12 05:30:24,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
88
+ 2024-08-12 05:30:29,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 05:30:34,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-12 05:30:39,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
91
+ 2024-08-12 05:30:39,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
92
+ 2024-08-12 05:30:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
93
+ 2024-08-12 05:30:40,419 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 05:30:45,420 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 05:30:50,421 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-12 05:30:54,143 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
97
+ 2024-08-12 05:30:54,180 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-12 05:30:54,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-12 05:30:54,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-12 05:30:56,414 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 05:31:01,416 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-12 05:31:06,417 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
103
+ 2024-08-12 05:31:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
104
+ 2024-08-12 05:31:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
105
+ 2024-08-12 05:31:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
106
+ 2024-08-12 05:31:12,373 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-12 05:31:17,375 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-12 05:31:22,376 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-08-12 05:31:24,144 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
110
+ 2024-08-12 05:31:24,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
111
+ 2024-08-12 05:31:24,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
112
+ 2024-08-12 05:31:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
113
+ 2024-08-12 05:31:28,366 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-12 05:31:33,367 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
115
+ 2024-08-12 05:31:36,963 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
116
+ 2024-08-12 05:31:36,966 DEBUG SenderThread:10531 [sender.py:send():382] send: history
117
+ 2024-08-12 05:31:36,966 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
118
+ 2024-08-12 05:31:36,968 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
119
+ 2024-08-12 05:31:37,152 INFO Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
120
+ 2024-08-12 05:31:39,006 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-12 05:31:39,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
122
+ 2024-08-12 05:31:39,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
123
+ 2024-08-12 05:31:39,183 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
124
+ 2024-08-12 05:31:40,154 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
125
+ 2024-08-12 05:31:44,409 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
126
+ 2024-08-12 05:31:49,410 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
127
+ 2024-08-12 05:31:54,145 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
128
+ 2024-08-12 05:31:54,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
129
+ 2024-08-12 05:31:54,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
130
+ 2024-08-12 05:31:54,228 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
131
+ 2024-08-12 05:31:55,354 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
132
+ 2024-08-12 05:32:00,355 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
133
+ 2024-08-12 05:32:05,356 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
134
+ 2024-08-12 05:32:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
135
+ 2024-08-12 05:32:09,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
136
+ 2024-08-12 05:32:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
137
+ 2024-08-12 05:32:10,376 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
138
+ 2024-08-12 05:32:15,377 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
139
+ 2024-08-12 05:32:20,378 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-08-12 05:32:24,146 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
141
+ 2024-08-12 05:32:24,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
142
+ 2024-08-12 05:32:24,181 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
143
+ 2024-08-12 05:32:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
144
+ 2024-08-12 05:32:25,450 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
145
+ 2024-08-12 05:32:30,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
146
+ 2024-08-12 05:32:35,451 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-12 05:32:39,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
148
+ 2024-08-12 05:32:39,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
149
+ 2024-08-12 05:32:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
150
+ 2024-08-12 05:32:41,437 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
151
+ 2024-08-12 05:32:46,438 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
152
+ 2024-08-12 05:32:51,438 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
153
+ 2024-08-12 05:32:51,692 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
154
+ 2024-08-12 05:32:51,694 DEBUG SenderThread:10531 [sender.py:send():382] send: history
155
+ 2024-08-12 05:32:51,694 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
156
+ 2024-08-12 05:32:51,696 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
157
+ 2024-08-12 05:32:52,204 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
158
+ 2024-08-12 05:32:54,147 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
159
+ 2024-08-12 05:32:54,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
160
+ 2024-08-12 05:32:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
161
+ 2024-08-12 05:32:54,183 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
162
+ 2024-08-12 05:32:54,205 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
163
+ 2024-08-12 05:32:56,453 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
164
+ 2024-08-12 05:33:01,453 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-08-12 05:33:06,454 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
166
+ 2024-08-12 05:33:09,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
167
+ 2024-08-12 05:33:09,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
168
+ 2024-08-12 05:33:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
169
+ 2024-08-12 05:33:12,386 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
170
+ 2024-08-12 05:33:17,386 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
171
+ 2024-08-12 05:33:22,387 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
172
+ 2024-08-12 05:33:24,148 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
173
+ 2024-08-12 05:33:24,181 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
174
+ 2024-08-12 05:33:24,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
175
+ 2024-08-12 05:33:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-12 05:33:28,379 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-08-12 05:33:33,380 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-12 05:33:38,380 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
179
+ 2024-08-12 05:33:39,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
180
+ 2024-08-12 05:33:39,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
181
+ 2024-08-12 05:33:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
182
+ 2024-08-12 05:33:43,420 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
183
+ 2024-08-12 05:33:48,421 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-08-12 05:33:53,421 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
185
+ 2024-08-12 05:33:54,149 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
186
+ 2024-08-12 05:33:54,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
187
+ 2024-08-12 05:33:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
188
+ 2024-08-12 05:33:54,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
189
+ 2024-08-12 05:33:59,378 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
190
+ 2024-08-12 05:34:04,379 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
191
+ 2024-08-12 05:34:06,274 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
192
+ 2024-08-12 05:34:06,276 DEBUG SenderThread:10531 [sender.py:send():382] send: history
193
+ 2024-08-12 05:34:06,277 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
194
+ 2024-08-12 05:34:06,278 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
195
+ 2024-08-12 05:34:07,249 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
196
+ 2024-08-12 05:34:08,250 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
197
+ 2024-08-12 05:34:09,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
198
+ 2024-08-12 05:34:09,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
199
+ 2024-08-12 05:34:09,184 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
200
+ 2024-08-12 05:34:09,395 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
201
+ 2024-08-12 05:34:14,395 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
202
+ 2024-08-12 05:34:19,396 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
203
+ 2024-08-12 05:34:24,150 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
204
+ 2024-08-12 05:34:24,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
205
+ 2024-08-12 05:34:24,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
206
+ 2024-08-12 05:34:24,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
207
+ 2024-08-12 05:34:25,394 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
208
+ 2024-08-12 05:34:30,395 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-12 05:34:35,396 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
210
+ 2024-08-12 05:34:39,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
211
+ 2024-08-12 05:34:39,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
212
+ 2024-08-12 05:34:39,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
213
+ 2024-08-12 05:34:40,439 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
214
+ 2024-08-12 05:34:45,439 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
215
+ 2024-08-12 05:34:50,440 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
216
+ 2024-08-12 05:34:54,152 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
217
+ 2024-08-12 05:34:54,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
218
+ 2024-08-12 05:34:54,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
219
+ 2024-08-12 05:34:54,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
220
+ 2024-08-12 05:34:55,454 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
221
+ 2024-08-12 05:35:00,455 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
222
+ 2024-08-12 05:35:05,455 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
223
+ 2024-08-12 05:35:09,182 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
224
+ 2024-08-12 05:35:09,182 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
225
+ 2024-08-12 05:35:09,224 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
226
+ 2024-08-12 05:35:11,407 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
227
+ 2024-08-12 05:35:16,407 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
228
+ 2024-08-12 05:35:20,928 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
229
+ 2024-08-12 05:35:20,930 DEBUG SenderThread:10531 [sender.py:send():382] send: history
230
+ 2024-08-12 05:35:20,931 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
231
+ 2024-08-12 05:35:20,932 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
232
+ 2024-08-12 05:35:21,295 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
233
+ 2024-08-12 05:35:21,970 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
234
+ 2024-08-12 05:35:22,296 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
235
+ 2024-08-12 05:35:24,152 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
236
+ 2024-08-12 05:35:24,232 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
237
+ 2024-08-12 05:35:24,255 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
238
+ 2024-08-12 05:35:24,256 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
239
+ 2024-08-12 05:35:24,297 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
240
+ 2024-08-12 05:35:25,212 DEBUG SenderThread:10531 [sender.py:send():382] send: exit
241
+ 2024-08-12 05:35:25,213 INFO SenderThread:10531 [sender.py:send_exit():589] handling exit code: 1
242
+ 2024-08-12 05:35:25,213 INFO SenderThread:10531 [sender.py:send_exit():591] handling runtime: 391
243
+ 2024-08-12 05:35:25,214 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
244
+ 2024-08-12 05:35:25,214 INFO SenderThread:10531 [sender.py:send_exit():597] send defer
245
+ 2024-08-12 05:35:25,214 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
246
+ 2024-08-12 05:35:25,214 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 0
247
+ 2024-08-12 05:35:25,215 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
248
+ 2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 0
249
+ 2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 1
250
+ 2024-08-12 05:35:25,215 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
251
+ 2024-08-12 05:35:25,215 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 1
252
+ 2024-08-12 05:35:25,215 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
253
+ 2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 1
254
+ 2024-08-12 05:35:25,215 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 2
255
+ 2024-08-12 05:35:25,215 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
256
+ 2024-08-12 05:35:25,215 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 2
257
+ 2024-08-12 05:35:25,215 INFO HandlerThread:10531 [system_monitor.py:finish():203] Stopping system monitor
258
+ 2024-08-12 05:35:25,215 DEBUG SystemMonitor:10531 [system_monitor.py:_start():179] Finished system metrics aggregation loop
259
+ 2024-08-12 05:35:25,215 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined cpu monitor
260
+ 2024-08-12 05:35:25,216 DEBUG SystemMonitor:10531 [system_monitor.py:_start():183] Publishing last batch of metrics
261
+ 2024-08-12 05:35:25,216 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined disk monitor
262
+ 2024-08-12 05:35:25,249 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined gpu monitor
263
+ 2024-08-12 05:35:25,249 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined memory monitor
264
+ 2024-08-12 05:35:25,249 INFO HandlerThread:10531 [interfaces.py:finish():202] Joined network monitor
265
+ 2024-08-12 05:35:25,249 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
266
+ 2024-08-12 05:35:25,249 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 2
267
+ 2024-08-12 05:35:25,249 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 3
268
+ 2024-08-12 05:35:25,249 DEBUG SenderThread:10531 [sender.py:send():382] send: stats
269
+ 2024-08-12 05:35:25,250 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
270
+ 2024-08-12 05:35:25,250 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 3
271
+ 2024-08-12 05:35:25,251 DEBUG SenderThread:10531 [sender.py:send():382] send: history
272
+ 2024-08-12 05:35:25,252 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
273
+ 2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
274
+ 2024-08-12 05:35:25,253 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
275
+ 2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 3
276
+ 2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 4
277
+ 2024-08-12 05:35:25,253 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
278
+ 2024-08-12 05:35:25,253 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 4
279
+ 2024-08-12 05:35:25,253 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
280
+ 2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 4
281
+ 2024-08-12 05:35:25,253 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 5
282
+ 2024-08-12 05:35:25,253 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
283
+ 2024-08-12 05:35:25,253 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 5
284
+ 2024-08-12 05:35:25,254 DEBUG SenderThread:10531 [sender.py:send():382] send: summary
285
+ 2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
286
+ 2024-08-12 05:35:25,255 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
287
+ 2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 5
288
+ 2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 6
289
+ 2024-08-12 05:35:25,255 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
290
+ 2024-08-12 05:35:25,255 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 6
291
+ 2024-08-12 05:35:25,255 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
292
+ 2024-08-12 05:35:25,255 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 6
293
+ 2024-08-12 05:35:25,256 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 7
294
+ 2024-08-12 05:35:25,256 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
295
+ 2024-08-12 05:35:25,256 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
296
+ 2024-08-12 05:35:25,256 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 7
297
+ 2024-08-12 05:35:25,256 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
298
+ 2024-08-12 05:35:25,256 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 7
299
+ 2024-08-12 05:35:25,298 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
300
+ 2024-08-12 05:35:26,141 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 8
301
+ 2024-08-12 05:35:26,142 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
302
+ 2024-08-12 05:35:26,142 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 8
303
+ 2024-08-12 05:35:26,142 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
304
+ 2024-08-12 05:35:26,142 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 8
305
+ 2024-08-12 05:35:26,142 INFO SenderThread:10531 [job_builder.py:build():296] Attempting to build job artifact
306
+ 2024-08-12 05:35:26,143 INFO SenderThread:10531 [job_builder.py:_get_source_type():426] is repo sourced job
307
+ 2024-08-12 05:35:26,157 INFO SenderThread:10531 [job_builder.py:build():402] adding wandb-job metadata file
308
+ 2024-08-12 05:35:26,166 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 9
309
+ 2024-08-12 05:35:26,166 DEBUG SenderThread:10531 [sender.py:send():382] send: artifact
310
+ 2024-08-12 05:35:26,166 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
311
+ 2024-08-12 05:35:26,167 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 9
312
+ 2024-08-12 05:35:26,213 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
313
+ 2024-08-12 05:35:26,299 INFO Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
314
+ 2024-08-12 05:35:27,302 INFO wandb-upload_1:10531 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpyfws5ko3
315
+ 2024-08-12 05:35:27,738 INFO wandb-upload_0:10531 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpypuucsag
316
+ 2024-08-12 05:35:29,357 INFO SenderThread:10531 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'versionIndex': 6}}}
317
+ 2024-08-12 05:35:29,357 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
318
+ 2024-08-12 05:35:29,357 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 9
319
+ 2024-08-12 05:35:29,358 INFO SenderThread:10531 [dir_watcher.py:finish():358] shutting down directory watcher
320
+ 2024-08-12 05:35:30,300 INFO SenderThread:10531 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_052853-n84i0o06/files
321
+ 2024-08-12 05:35:30,301 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt requirements.txt
322
+ 2024-08-12 05:35:30,301 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml config.yaml
323
+ 2024-08-12 05:35:30,301 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json wandb-metadata.json
324
+ 2024-08-12 05:35:30,302 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json wandb-summary.json
325
+ 2024-08-12 05:35:30,304 INFO SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/output.log output.log
326
+ 2024-08-12 05:35:30,306 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 10
327
+ 2024-08-12 05:35:30,306 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
328
+ 2024-08-12 05:35:30,306 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
329
+ 2024-08-12 05:35:30,307 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 10
330
+ 2024-08-12 05:35:30,308 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
331
+ 2024-08-12 05:35:30,308 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 10
332
+ 2024-08-12 05:35:30,308 INFO SenderThread:10531 [file_pusher.py:finish():172] shutting down file pusher
333
+ 2024-08-12 05:35:30,718 INFO wandb-upload_0:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml
334
+ 2024-08-12 05:35:30,895 INFO wandb-upload_3:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/output.log
335
+ 2024-08-12 05:35:31,214 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: keepalive
336
+ 2024-08-12 05:35:31,214 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
337
+ 2024-08-12 05:35:31,214 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
338
+ 2024-08-12 05:35:31,248 INFO wandb-upload_1:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt
339
+ 2024-08-12 05:35:31,299 INFO wandb-upload_2:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
340
+ 2024-08-12 05:35:31,499 INFO Thread-11 (_thread_body):10531 [sender.py:transition_state():617] send defer: 11
341
+ 2024-08-12 05:35:31,499 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
342
+ 2024-08-12 05:35:31,500 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 11
343
+ 2024-08-12 05:35:31,500 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
344
+ 2024-08-12 05:35:31,500 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 11
345
+ 2024-08-12 05:35:31,500 INFO SenderThread:10531 [file_pusher.py:join():178] waiting for file pusher
346
+ 2024-08-12 05:35:31,500 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 12
347
+ 2024-08-12 05:35:31,500 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
348
+ 2024-08-12 05:35:31,500 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 12
349
+ 2024-08-12 05:35:31,500 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
350
+ 2024-08-12 05:35:31,500 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 12
351
+ 2024-08-12 05:35:31,500 INFO SenderThread:10531 [file_stream.py:finish():595] file stream finish called
352
+ 2024-08-12 05:35:32,061 INFO SenderThread:10531 [file_stream.py:finish():599] file stream finish is done
353
+ 2024-08-12 05:35:32,061 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 13
354
+ 2024-08-12 05:35:32,061 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
355
+ 2024-08-12 05:35:32,061 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 13
356
+ 2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
357
+ 2024-08-12 05:35:32,062 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 13
358
+ 2024-08-12 05:35:32,062 INFO SenderThread:10531 [sender.py:transition_state():617] send defer: 14
359
+ 2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send():382] send: final
360
+ 2024-08-12 05:35:32,062 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
361
+ 2024-08-12 05:35:32,062 INFO HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 14
362
+ 2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send():382] send: footer
363
+ 2024-08-12 05:35:32,062 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: defer
364
+ 2024-08-12 05:35:32,062 INFO SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 14
365
+ 2024-08-12 05:35:32,063 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
366
+ 2024-08-12 05:35:32,063 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
367
+ 2024-08-12 05:35:32,063 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
368
+ 2024-08-12 05:35:32,064 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
369
+ 2024-08-12 05:35:32,064 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: server_info
370
+ 2024-08-12 05:35:32,064 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: server_info
371
+ 2024-08-12 05:35:32,065 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: get_summary
372
+ 2024-08-12 05:35:32,066 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: sampled_history
373
+ 2024-08-12 05:35:32,067 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
374
+ 2024-08-12 05:35:32,067 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: job_info
375
+ 2024-08-12 05:35:32,238 DEBUG SenderThread:10531 [sender.py:send_request():409] send_request: job_info
376
+ 2024-08-12 05:35:32,238 INFO MainThread:10531 [wandb_run.py:_footer_history_summary_info():3866] rendering history
377
+ 2024-08-12 05:35:32,239 INFO MainThread:10531 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
378
+ 2024-08-12 05:35:32,239 INFO MainThread:10531 [wandb_run.py:_footer_sync_info():3825] logging synced files
379
+ 2024-08-12 05:35:32,240 DEBUG HandlerThread:10531 [handler.py:handle_request():146] handle_request: shutdown
380
+ 2024-08-12 05:35:32,240 INFO HandlerThread:10531 [handler.py:finish():869] shutting down handler
381
+ 2024-08-12 05:35:33,068 INFO WriterThread:10531 [datastore.py:close():296] close: /project/wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
382
+ 2024-08-12 05:35:33,239 INFO SenderThread:10531 [sender.py:finish():1572] shutting down sender
383
+ 2024-08-12 05:35:33,239 INFO SenderThread:10531 [file_pusher.py:finish():172] shutting down file pusher
384
+ 2024-08-12 05:35:33,239 INFO SenderThread:10531 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240812_052853-n84i0o06/logs/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Configure stats pid to 10460
3
+ 2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-12 05:28:53,517 INFO MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_052853-n84i0o06/logs/debug.log
9
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log
10
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-05:28:42', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():616] starting backend
14
+ 2024-08-12 05:28:53,518 INFO MainThread:10460 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-12 05:28:53,523 INFO MainThread:10460 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-12 05:28:53,523 INFO MainThread:10460 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-12 05:28:53,528 INFO MainThread:10460 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-12 05:28:53,540 INFO MainThread:10460 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-12 05:28:54,037 INFO MainThread:10460 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-12 05:28:54,121 INFO MainThread:10460 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-12 05:28:54,121 INFO MainThread:10460 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-12 05:28:54,179 INFO MainThread:10460 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-12 05:28:54,180 INFO MainThread:10460 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-12 05:28:54,180 INFO MainThread:10460 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-12 05:28:54,180 INFO MainThread:10460 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-12 05:28:54,181 INFO MainThread:10460 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-12 05:29:01,877 INFO MainThread:10460 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-12 05:29:01,878 INFO MainThread:10460 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-12 05:35:33,240 WARNING MsgRouterThr:10460 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb ADDED
Binary file (45.7 kB). View file
 
wandb/run-20240812_063027-j1htzx7q/files/output.log ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+
10
+
11
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [02:31<01:16, 76.44s/it]
12
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
13
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
14
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
15
+ --> Model /share/pretrained_lm/google/gemma-2-2b
16
+ --> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
17
+ BFloat16 enabled for mixed precision - using bfSixteen policy
18
+ --> applying fsdp activation checkpointing...
19
+ > datasets target sizes (minimum size):
20
+ train: 6400000
21
+ validation: 21334400
22
+ test: 3200
23
+ Loading checkpoint shards: 100%|██████████| 3/3 [02:40<00:00, 53.37s/it]
24
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
25
+ warnings.warn(
26
+ Let split = None
27
+ Building a BlendedDataset for a single MegatronDataset
28
+ Unable to save the indexes because path_to_cache is None
29
+ Building a BlendedDataset for a single MegatronDataset
30
+ Unable to save the indexes because path_to_cache is None
31
+ Building a BlendedDataset for a single MegatronDataset
32
+ Unable to save the indexes because path_to_cache is None
33
+ > finished creating GPT datasets ...
34
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
35
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
36
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
37
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
38
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
39
+ model info: FullyShardedDataParallel(
40
+ (_fsdp_wrapped_module): Gemma2ForCausalLM(
41
+ (model): Gemma2Model(
42
+ (embed_tokens): Embedding(256000, 2304, padding_idx=0)
43
+ (layers): ModuleList(
44
+ (0-25): 26 x FullyShardedDataParallel(
45
+ (_fsdp_wrapped_module): CheckpointWrapper(
46
+ (_checkpoint_wrapped_module): Gemma2DecoderLayer(
47
+ (self_attn): Gemma2FlashAttention2(
48
+ (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
49
+ (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
50
+ (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
51
+ (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
52
+ (rotary_emb): Gemma2RotaryEmbedding()
53
+ )
54
+ (mlp): Gemma2MLP(
55
+ (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
56
+ (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
57
+ (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
58
+ (act_fn): PytorchGELUTanh()
59
+ )
60
+ (input_layernorm): Gemma2RMSNorm()
61
+ (post_attention_layernorm): Gemma2RMSNorm()
62
+ (pre_feedforward_layernorm): Gemma2RMSNorm()
63
+ (post_feedforward_layernorm): Gemma2RMSNorm()
64
+ )
65
+ )
66
+ )
67
+ )
68
+ (norm): Gemma2RMSNorm()
69
+ )
70
+ (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
71
+ )
72
+ )
73
+ model config: Gemma2Config {
74
+ "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
75
+ "architectures": [
76
+ "Gemma2ForCausalLM"
77
+ ],
78
+ "attention_bias": false,
79
+ "attention_dropout": 0.0,
80
+ "attn_logit_softcapping": 50.0,
81
+ "bos_token_id": 2,
82
+ "cache_implementation": "hybrid",
83
+ "eos_token_id": 1,
84
+ "final_logit_softcapping": 30.0,
85
+ "head_dim": 256,
86
+ "hidden_act": "gelu_pytorch_tanh",
87
+ "hidden_activation": "gelu_pytorch_tanh",
88
+ "hidden_size": 2304,
89
+ "initializer_range": 0.02,
90
+ "intermediate_size": 9216,
91
+ "label_smoothing": 0.0,
92
+ "max_position_embeddings": 4096,
93
+ "model_type": "gemma2",
94
+ "num_attention_heads": 8,
95
+ "num_hidden_layers": 26,
96
+ "num_key_value_heads": 4,
97
+ "pad_token_id": 0,
98
+ "query_pre_attn_scalar": 256,
99
+ "rms_norm_eps": 1e-06,
100
+ "rope_theta": 10000.0,
101
+ "sliding_window": 4096,
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.43.3",
104
+ "use_cache": false,
105
+ "vocab_size": 256000
106
+ }
107
+ It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
108
+ Traceback (most recent call last):
109
+ File "/project/examples/finetuning.py", line 13, in <module>
110
+ main()
111
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
112
+ train(
113
+ File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
114
+ loss.backward()
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
116
+ torch.autograd.backward(
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
118
+ _engine_run_backward(
119
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
120
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
121
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.70 GiB. GPU 0 has a total capacity of 39.39 GiB of which 3.86 GiB is free. Including non-PyTorch memory, this process has 35.52 GiB memory in use. Of the allocated memory 32.71 GiB is allocated by PyTorch, and 1.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 167}}
wandb/run-20240823_154448-v9m85jnt/files/config.yaml ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 2048
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-15:44:18
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 10
140
+ save_interval:
141
+ desc: null
142
+ value: 10
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 7500
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 7500
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 320
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 1
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 4096
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 1
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 320
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724395488.891619
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
wandb/run-20240823_154448-v9m85jnt/files/output.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
8
+ Traceback (most recent call last):
9
+ File "/project/examples/finetuning.py", line 13, in <module>
10
+ main()
11
+ File "/project/src/llama_recipes/finetuning.py", line 103, in main
12
+ model = get_model(
13
+ File "/project/src/llama_recipes/get_models.py", line 106, in get_model
14
+ assert sliding_window == 131072
15
+ AssertionError
wandb/run-20240823_154448-v9m85jnt/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T06:44:49.486428",
5
+ "startedAt": "2024-08-23T06:44:48.878270",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "2048",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "320",
19
+ "--train-iters",
20
+ "7500",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "7500",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "10",
60
+ "--eval-interval",
61
+ "10",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-15:44:18"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 18,
105
+ "cpu_count_logical": 18,
106
+ "cpu_freq": {
107
+ "current": 2400.0389999999993,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ }
202
+ ],
203
+ "disk": {
204
+ "/": {
205
+ "total": 0.0625,
206
+ "used": 1.1444091796875e-05
207
+ }
208
+ },
209
+ "gpu": "NVIDIA A100-SXM4-40GB",
210
+ "gpu_count": 1,
211
+ "gpu_devices": [
212
+ {
213
+ "name": "NVIDIA A100-SXM4-40GB",
214
+ "memory_total": 42949672960
215
+ }
216
+ ],
217
+ "memory": {
218
+ "total": 56.487831115722656
219
+ }
220
+ }
wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb": {"runtime": 1}}
wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 15:44:48,892 INFO StreamThr :10032 [internal.py:wandb_internal():86] W&B internal server running at pid: 10032, started at: 2024-08-23 15:44:48.891774
2
+ 2024-08-23 15:44:48,893 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-23 15:44:48,896 INFO WriterThread:10032 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
4
+ 2024-08-23 15:44:48,897 DEBUG SenderThread:10032 [sender.py:send():382] send: header
5
+ 2024-08-23 15:44:48,913 DEBUG SenderThread:10032 [sender.py:send():382] send: run
6
+ 2024-08-23 15:44:49,390 INFO SenderThread:10032 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_154448-v9m85jnt/files
7
+ 2024-08-23 15:44:49,390 INFO SenderThread:10032 [sender.py:_start_run_threads():1136] run started: v9m85jnt with start time 1724395488.891619
8
+ 2024-08-23 15:44:49,395 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-23 15:44:49,396 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-23 15:44:49,467 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-23 15:44:49,473 DEBUG HandlerThread:10032 [system_info.py:__init__():27] System info init
12
+ 2024-08-23 15:44:49,474 DEBUG HandlerThread:10032 [system_info.py:__init__():42] System info init done
13
+ 2024-08-23 15:44:49,474 INFO HandlerThread:10032 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-23 15:44:49,474 INFO SystemMonitor:10032 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-23 15:44:49,474 INFO HandlerThread:10032 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-23 15:44:49,474 INFO SystemMonitor:10032 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-23 15:44:49,475 INFO SystemMonitor:10032 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-23 15:44:49,475 INFO SystemMonitor:10032 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-23 15:44:49,475 INFO SystemMonitor:10032 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-23 15:44:49,476 INFO SystemMonitor:10032 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-23 15:44:49,486 DEBUG HandlerThread:10032 [system_info.py:probe():151] Probing system
22
+ 2024-08-23 15:44:49,488 DEBUG HandlerThread:10032 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-23 15:44:49,500 DEBUG HandlerThread:10032 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-23 15:44:49,500 DEBUG HandlerThread:10032 [system_info.py:probe():199] Probing system done
25
+ 2024-08-23 15:44:49,500 DEBUG HandlerThread:10032 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T06:44:49.486428', 'startedAt': '2024-08-23T06:44:48.878270', 'docker': None, 'cuda': None, 'args': ('--seq-length', '2048', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--valid_micro_batch_size', '1', '--global-batch-size', '320', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-15:44:18'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
26
+ 2024-08-23 15:44:49,500 INFO HandlerThread:10032 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-23 15:44:49,500 INFO HandlerThread:10032 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-23 15:44:49,502 INFO HandlerThread:10032 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-23 15:44:49,528 DEBUG SenderThread:10032 [sender.py:send():382] send: files
30
+ 2024-08-23 15:44:49,529 INFO SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-23 15:44:49,540 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-23 15:44:49,540 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-23 15:44:49,540 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-23 15:44:49,541 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-23 15:44:49,543 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-23 15:44:49,740 DEBUG SenderThread:10032 [sender.py:send():382] send: telemetry
37
+ 2024-08-23 15:44:50,157 INFO wandb-upload_0:10032 [upload_job.py:push():131] Uploaded file /tmp/tmp_akktvpmwandb/xbudf9th-wandb-metadata.json
38
+ 2024-08-23 15:44:50,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json
39
+ 2024-08-23 15:44:50,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
40
+ 2024-08-23 15:44:50,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
41
+ 2024-08-23 15:44:50,729 DEBUG SenderThread:10032 [sender.py:send():382] send: exit
42
+ 2024-08-23 15:44:50,729 INFO SenderThread:10032 [sender.py:send_exit():589] handling exit code: 1
43
+ 2024-08-23 15:44:50,730 INFO SenderThread:10032 [sender.py:send_exit():591] handling runtime: 1
44
+ 2024-08-23 15:44:50,731 INFO SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
45
+ 2024-08-23 15:44:50,731 INFO SenderThread:10032 [sender.py:send_exit():597] send defer
46
+ 2024-08-23 15:44:50,731 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
47
+ 2024-08-23 15:44:50,731 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 0
48
+ 2024-08-23 15:44:50,731 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
49
+ 2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 0
50
+ 2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 1
51
+ 2024-08-23 15:44:50,732 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
52
+ 2024-08-23 15:44:50,732 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 1
53
+ 2024-08-23 15:44:50,732 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
54
+ 2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 1
55
+ 2024-08-23 15:44:50,732 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 2
56
+ 2024-08-23 15:44:50,732 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
57
+ 2024-08-23 15:44:50,732 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 2
58
+ 2024-08-23 15:44:50,732 INFO HandlerThread:10032 [system_monitor.py:finish():203] Stopping system monitor
59
+ 2024-08-23 15:44:50,732 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined cpu monitor
60
+ 2024-08-23 15:44:50,733 DEBUG SystemMonitor:10032 [system_monitor.py:_start():172] Starting system metrics aggregation loop
61
+ 2024-08-23 15:44:50,733 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined disk monitor
62
+ 2024-08-23 15:44:50,733 DEBUG SystemMonitor:10032 [system_monitor.py:_start():179] Finished system metrics aggregation loop
63
+ 2024-08-23 15:44:50,733 DEBUG SystemMonitor:10032 [system_monitor.py:_start():183] Publishing last batch of metrics
64
+ 2024-08-23 15:44:50,765 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined gpu monitor
65
+ 2024-08-23 15:44:50,765 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined memory monitor
66
+ 2024-08-23 15:44:50,765 INFO HandlerThread:10032 [interfaces.py:finish():202] Joined network monitor
67
+ 2024-08-23 15:44:50,766 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
68
+ 2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 2
69
+ 2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 3
70
+ 2024-08-23 15:44:50,766 DEBUG SenderThread:10032 [sender.py:send():382] send: stats
71
+ 2024-08-23 15:44:50,766 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
72
+ 2024-08-23 15:44:50,766 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 3
73
+ 2024-08-23 15:44:50,766 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
74
+ 2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 3
75
+ 2024-08-23 15:44:50,766 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 4
76
+ 2024-08-23 15:44:50,767 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
77
+ 2024-08-23 15:44:50,767 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 4
78
+ 2024-08-23 15:44:50,767 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
79
+ 2024-08-23 15:44:50,767 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 4
80
+ 2024-08-23 15:44:50,767 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 5
81
+ 2024-08-23 15:44:50,767 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
82
+ 2024-08-23 15:44:50,767 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 5
83
+ 2024-08-23 15:44:50,767 DEBUG SenderThread:10032 [sender.py:send():382] send: summary
84
+ 2024-08-23 15:44:50,768 INFO SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
85
+ 2024-08-23 15:44:50,768 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
86
+ 2024-08-23 15:44:50,768 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 5
87
+ 2024-08-23 15:44:50,768 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 6
88
+ 2024-08-23 15:44:50,768 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
89
+ 2024-08-23 15:44:50,768 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 6
90
+ 2024-08-23 15:44:50,768 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
91
+ 2024-08-23 15:44:50,769 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 6
92
+ 2024-08-23 15:44:50,771 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: status_report
93
+ 2024-08-23 15:44:50,957 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 7
94
+ 2024-08-23 15:44:50,957 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-23 15:44:50,957 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 7
96
+ 2024-08-23 15:44:50,958 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-23 15:44:50,958 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 7
98
+ 2024-08-23 15:44:51,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml
99
+ 2024-08-23 15:44:51,392 INFO Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
100
+ 2024-08-23 15:44:51,729 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
101
+ 2024-08-23 15:44:52,393 INFO Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
102
+ 2024-08-23 15:44:52,721 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 8
103
+ 2024-08-23 15:44:52,721 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
104
+ 2024-08-23 15:44:52,721 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
105
+ 2024-08-23 15:44:52,721 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 8
106
+ 2024-08-23 15:44:52,721 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
107
+ 2024-08-23 15:44:52,721 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 8
108
+ 2024-08-23 15:44:52,721 INFO SenderThread:10032 [job_builder.py:build():296] Attempting to build job artifact
109
+ 2024-08-23 15:44:52,722 INFO SenderThread:10032 [job_builder.py:_get_source_type():426] is repo sourced job
110
+ 2024-08-23 15:44:52,730 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
111
+ 2024-08-23 15:44:52,737 INFO SenderThread:10032 [job_builder.py:build():402] adding wandb-job metadata file
112
+ 2024-08-23 15:44:52,746 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 9
113
+ 2024-08-23 15:44:52,747 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
114
+ 2024-08-23 15:44:52,747 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
115
+ 2024-08-23 15:44:52,747 DEBUG SenderThread:10032 [sender.py:send():382] send: artifact
116
+ 2024-08-23 15:44:52,747 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-23 15:44:53,393 INFO Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
118
+ 2024-08-23 15:44:53,730 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-23 15:44:54,153 INFO wandb-upload_1:10032 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp_o6jbw71
120
+ 2024-08-23 15:44:54,878 INFO wandb-upload_0:10032 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpdgbh2byi
121
+ 2024-08-23 15:44:55,934 INFO SenderThread:10032 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk3MTc1OA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': None}}
122
+ 2024-08-23 15:44:55,934 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
123
+ 2024-08-23 15:44:55,934 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: status_report
124
+ 2024-08-23 15:44:55,934 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 9
125
+ 2024-08-23 15:44:55,934 INFO SenderThread:10032 [dir_watcher.py:finish():358] shutting down directory watcher
126
+ 2024-08-23 15:44:56,394 INFO SenderThread:10032 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_154448-v9m85jnt/files
127
+ 2024-08-23 15:44:56,395 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt requirements.txt
128
+ 2024-08-23 15:44:56,395 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml config.yaml
129
+ 2024-08-23 15:44:56,396 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json wandb-metadata.json
130
+ 2024-08-23 15:44:56,396 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json wandb-summary.json
131
+ 2024-08-23 15:44:56,398 INFO SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log output.log
132
+ 2024-08-23 15:44:56,399 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 10
133
+ 2024-08-23 15:44:56,399 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
134
+ 2024-08-23 15:44:56,399 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
135
+ 2024-08-23 15:44:56,401 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 10
136
+ 2024-08-23 15:44:56,401 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
137
+ 2024-08-23 15:44:56,401 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 10
138
+ 2024-08-23 15:44:56,401 INFO SenderThread:10032 [file_pusher.py:finish():172] shutting down file pusher
139
+ 2024-08-23 15:44:56,731 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
140
+ 2024-08-23 15:44:56,731 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
141
+ 2024-08-23 15:44:56,790 INFO wandb-upload_1:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
142
+ 2024-08-23 15:44:56,818 INFO wandb-upload_0:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml
143
+ 2024-08-23 15:44:56,848 INFO wandb-upload_2:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
144
+ 2024-08-23 15:44:56,865 INFO wandb-upload_3:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
145
+ 2024-08-23 15:44:57,065 INFO Thread-11 (_thread_body):10032 [sender.py:transition_state():617] send defer: 11
146
+ 2024-08-23 15:44:57,065 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
147
+ 2024-08-23 15:44:57,065 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 11
148
+ 2024-08-23 15:44:57,065 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-23 15:44:57,065 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 11
150
+ 2024-08-23 15:44:57,065 INFO SenderThread:10032 [file_pusher.py:join():178] waiting for file pusher
151
+ 2024-08-23 15:44:57,066 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 12
152
+ 2024-08-23 15:44:57,066 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
153
+ 2024-08-23 15:44:57,066 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 12
154
+ 2024-08-23 15:44:57,066 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
155
+ 2024-08-23 15:44:57,066 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 12
156
+ 2024-08-23 15:44:57,066 INFO SenderThread:10032 [file_stream.py:finish():595] file stream finish called
157
+ 2024-08-23 15:44:57,271 INFO SenderThread:10032 [file_stream.py:finish():599] file stream finish is done
158
+ 2024-08-23 15:44:57,271 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 13
159
+ 2024-08-23 15:44:57,271 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
160
+ 2024-08-23 15:44:57,271 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 13
161
+ 2024-08-23 15:44:57,271 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
162
+ 2024-08-23 15:44:57,271 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 13
163
+ 2024-08-23 15:44:57,271 INFO SenderThread:10032 [sender.py:transition_state():617] send defer: 14
164
+ 2024-08-23 15:44:57,271 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
165
+ 2024-08-23 15:44:57,271 DEBUG SenderThread:10032 [sender.py:send():382] send: final
166
+ 2024-08-23 15:44:57,271 INFO HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 14
167
+ 2024-08-23 15:44:57,271 DEBUG SenderThread:10032 [sender.py:send():382] send: footer
168
+ 2024-08-23 15:44:57,272 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: defer
169
+ 2024-08-23 15:44:57,272 INFO SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 14
170
+ 2024-08-23 15:44:57,272 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
171
+ 2024-08-23 15:44:57,272 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
172
+ 2024-08-23 15:44:57,272 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: server_info
173
+ 2024-08-23 15:44:57,273 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: get_summary
174
+ 2024-08-23 15:44:57,273 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-23 15:44:57,273 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
176
+ 2024-08-23 15:44:57,273 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: internal_messages
177
+ 2024-08-23 15:44:57,273 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
178
+ 2024-08-23 15:44:57,274 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: server_info
179
+ 2024-08-23 15:44:57,275 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: job_info
180
+ 2024-08-23 15:44:57,441 DEBUG SenderThread:10032 [sender.py:send_request():409] send_request: job_info
181
+ 2024-08-23 15:44:57,441 INFO MainThread:10032 [wandb_run.py:_footer_history_summary_info():3866] rendering history
182
+ 2024-08-23 15:44:57,441 INFO MainThread:10032 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
183
+ 2024-08-23 15:44:57,441 INFO MainThread:10032 [wandb_run.py:_footer_sync_info():3825] logging synced files
184
+ 2024-08-23 15:44:57,441 DEBUG HandlerThread:10032 [handler.py:handle_request():146] handle_request: shutdown
185
+ 2024-08-23 15:44:57,441 INFO HandlerThread:10032 [handler.py:finish():869] shutting down handler
186
+ 2024-08-23 15:44:58,275 INFO WriterThread:10032 [datastore.py:close():296] close: /project/wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
187
+ 2024-08-23 15:44:58,441 INFO SenderThread:10032 [sender.py:finish():1572] shutting down sender
188
+ 2024-08-23 15:44:58,441 INFO SenderThread:10032 [file_pusher.py:finish():172] shutting down file pusher
189
+ 2024-08-23 15:44:58,441 INFO SenderThread:10032 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240823_154448-v9m85jnt/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Configure stats pid to 9961
3
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_154448-v9m85jnt/logs/debug.log
9
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log
10
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 2048, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-15:44:18', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-23 15:44:48,884 INFO MainThread:9961 [wandb_init.py:init():616] starting backend
14
+ 2024-08-23 15:44:48,885 INFO MainThread:9961 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-23 15:44:48,889 INFO MainThread:9961 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-23 15:44:48,891 INFO MainThread:9961 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-23 15:44:48,896 INFO MainThread:9961 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-23 15:44:48,909 INFO MainThread:9961 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-23 15:44:49,395 INFO MainThread:9961 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-23 15:44:49,418 INFO MainThread:9961 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-23 15:44:49,418 INFO MainThread:9961 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-23 15:44:49,539 INFO MainThread:9961 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-23 15:44:49,540 INFO MainThread:9961 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-23 15:44:58,442 WARNING MsgRouterThr:9961 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb ADDED
Binary file (8.01 kB). View file