koichi12 commited on Nov 28, 2024

Commit

b38ed3f

verified ·

1 Parent(s): 1cf1fd6

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

wandb/run-20240804_140603-q9i5g6sv/files/config.yaml +335 -0
wandb/run-20240804_140603-q9i5g6sv/files/output.log +130 -0
wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt +271 -0
wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json +215 -0
wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json +1 -0
wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log +186 -0
wandb/run-20240804_140603-q9i5g6sv/logs/debug.log +30 -0
wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb +0 -0
wandb/run-20240804_142250-6p58tz1g/files/config.yaml +335 -0
wandb/run-20240804_142250-6p58tz1g/files/output.log +135 -0
wandb/run-20240804_142250-6p58tz1g/files/requirements.txt +271 -0
wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json +215 -0
wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json +1 -0
wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log +186 -0
wandb/run-20240804_142250-6p58tz1g/logs/debug.log +30 -0
wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb +0 -0
wandb/run-20240804_143607-h7fxlkpt/files/config.yaml +335 -0
wandb/run-20240804_143607-h7fxlkpt/files/output.log +135 -0
wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt +271 -0
wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json +215 -0
wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json +1 -0
wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log +186 -0
wandb/run-20240804_143607-h7fxlkpt/logs/debug.log +30 -0
wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb +0 -0
wandb/run-20240804_221132-o8ieoj9i/files/config.yaml +335 -0
wandb/run-20240804_221132-o8ieoj9i/files/output.log +135 -0
wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt +271 -0
wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json +215 -0
wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json +1 -0
wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log +263 -0
wandb/run-20240804_221132-o8ieoj9i/logs/debug.log +30 -0
wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb +0 -0
wandb/run-20240812_052853-n84i0o06/files/config.yaml +335 -0
wandb/run-20240812_052853-n84i0o06/files/output.log +139 -0
wandb/run-20240812_052853-n84i0o06/files/requirements.txt +271 -0
wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json +215 -0
wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json +1 -0
wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log +384 -0
wandb/run-20240812_052853-n84i0o06/logs/debug.log +30 -0
wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb +0 -0
wandb/run-20240812_063027-j1htzx7q/files/output.log +121 -0
wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json +1 -0
wandb/run-20240823_154448-v9m85jnt/files/config.yaml +321 -0
wandb/run-20240823_154448-v9m85jnt/files/output.log +15 -0
wandb/run-20240823_154448-v9m85jnt/files/requirements.txt +375 -0
wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json +220 -0
wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json +1 -0
wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log +189 -0
wandb/run-20240823_154448-v9m85jnt/logs/debug.log +28 -0
wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb +0 -0

wandb/run-20240804_140603-q9i5g6sv/files/config.yaml ADDED Viewed

	@@ -0,0 +1,335 @@

+wandb_version: 1
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 512
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: Llama2Tokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: tiny-llama_train_2024-08-04-14:05:53
+wandb_project:
+  desc: null
+  value: llm_tutorial
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/tiny-llama
+save:
+  desc: null
+  value: /work/llm_recipes/models/tiny-llama
+base_model:
+  desc: null
+  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 200
+save_interval:
+  desc: null
+  value: 200
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: adam
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 2000
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 2000
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 8
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 4096
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/tiny-llama
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: false
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 32000
+gradient_accumulation_steps:
+  desc: null
+  value: 40
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1722747963.684337
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64
+activation_function:
+  desc: null
+  value: silu
+hidden_size:
+  desc: null
+  value: 2048
+model_type:
+  desc: null
+  value: llama
+max_position_embeddings:
+  desc: null
+  value: 2048
+num_attention_heads:
+  desc: null
+  value: 32
+num_hidden_layers:
+  desc: null
+  value: 22
+model_architecture:
+  desc: null
+  value: LlamaForCausalLM

wandb/run-20240804_140603-q9i5g6sv/files/output.log ADDED Viewed

	@@ -0,0 +1,130 @@

+Created Hugging Face repository with ID koichi12/tiny-llama.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
+--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+BFloat16 enabled for mixed precision - using bfSixteen policy
+--> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+    train:      640000
+    validation: 35200
+    test:       3200
+> building train, validation, and test datasets for GPT ...
+> finished creating GPT datasets ...
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): LlamaForCausalLM(
+    (model): LlamaModel(
+      (embed_tokens): Embedding(32000, 2048)
+      (layers): ModuleList(
+        (0-21): 22 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): LlamaDecoderLayer(
+              (self_attn): LlamaFlashAttention2(
+                (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                (k_proj): Linear(in_features=2048, out_features=256, bias=False)
+                (v_proj): Linear(in_features=2048, out_features=256, bias=False)
+                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                (rotary_emb): LlamaRotaryEmbedding()
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm()
+              (post_attention_layernorm): LlamaRMSNorm()
+            )
+          )
+        )
+      )
+      (norm): LlamaRMSNorm()
+      (rotary_emb): LlamaRotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
+  )
+)
+model config: LlamaConfig {
+  "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 22,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "vocab_size": 32000
+}
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+Let split = None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 281, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
+    loss: torch.Tensor = model(**batch).loss
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
+    output = self._fsdp_wrapped_module(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 1141, in forward
+    outputs = self.model(
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/project/lib/transformers/src/transformers/models/llama/modeling_llama.py", line 908, in forward
+    cache_position = torch.arange(
+RuntimeError: CUDA error: device-side assert triggered
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,271 @@

+absl-py==2.1.0
+accelerate==0.33.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+apex==0.1
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blis==0.7.11
+cachetools==5.3.2
+catalogue==2.0.10
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+cloudpickle==3.0.0
+cmake==3.28.1
+colorama==0.4.6
+comm==0.2.1
+confection==0.1.4
+contourpy==1.2.0
+cubinlinker==0.3.0+2.g405ac64
+cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+cudf==23.12.0
+cugraph-dgl==23.12.0
+cugraph-service-client==23.12.0
+cugraph-service-server==23.12.0
+cugraph==23.12.0
+cuml==23.12.0
+cupy-cuda12x==12.3.0
+cycler==0.12.1
+cymem==2.0.8
+cython==3.0.8
+dask-cuda==23.12.0
+dask-cudf==23.12.0
+dask==2023.11.0
+debugpy==1.8.1
+decorator==5.1.1
+defusedxml==0.7.1
+distributed==2023.11.0
+dm-tree==0.1.8
+docker-pycreds==0.4.0
+einops==0.7.0
+exceptiongroup==1.2.0
+execnet==2.0.2
+executing==2.0.1
+expecttest==0.1.3
+fastjsonschema==2.19.1
+fastrlock==0.8.2
+filelock==3.13.1
+flash-attn==2.4.2
+fonttools==4.48.1
+frozenlist==1.4.1
+fsspec==2023.12.2
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.43
+google-auth-oauthlib==0.4.6
+google-auth==2.27.0
+graphsurgeon==0.4.6
+grpcio==1.60.1
+huggingface-hub==0.24.5
+hypothesis==5.35.1
+idna==3.6
+importlib-metadata==7.0.1
+iniconfig==2.0.0
+intel-openmp==2021.4.0
+ipadic==1.0.0
+ipykernel==6.29.2
+ipython-genutils==0.2.0
+ipython==8.21.0
+jedi==0.19.1
+jinja2==3.1.3
+joblib==1.3.2
+json5==0.9.14
+jsonnet==0.19.1
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-tensorboard==0.2.0
+jupyterlab-pygments==0.3.0
+jupyterlab-server==1.2.0
+jupyterlab==2.3.2
+jupytext==1.16.1
+kiwisolver==1.4.5
+langcodes==3.3.0
+lazy-loader==0.3
+librosa==0.10.1
+llvmlite==0.40.1
+locket==1.0.0
+logzero==1.7.0
+lxml==5.2.2
+markdown-it-py==3.0.0
+markdown==3.5.2
+markupsafe==2.1.4
+matplotlib-inline==0.1.6
+matplotlib==3.8.2
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mecab-python3==1.0.6
+mistune==3.0.2
+mkl-devel==2021.1.1
+mkl-include==2021.1.1
+mkl==2021.1.1
+mock==5.1.0
+more-itertools==9.1.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+murmurhash==1.0.10
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==2.6.3
+ninja==1.11.1.1
+nltk==3.8.1
+notebook==6.4.10
+numba==0.57.1+1.g1ff679645
+numpy==1.24.4
+nvfuser==0.1.4a0+d0bb811
+nvidia-dali-cuda120==1.34.0
+nvidia-pyindex==1.0.9
+nvtx==0.2.5
+oauthlib==3.2.2
+onnx==1.15.0rc2
+opencv==4.7.0
+optree==0.10.0
+packaging==23.2
+pandas==1.5.3
+pandocfilters==1.5.1
+parso==0.8.3
+partd==1.4.1
+peft==0.11.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.2.0
+pluggy==1.4.0
+ply==3.11
+polygraphy==0.49.4
+pooch==1.8.0
+portalocker==2.10.1
+preshed==3.0.9
+prettytable==3.9.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.24.4
+psutil==5.9.4
+ptxcompiler==0.8.1+2.g0d406d6
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==14.0.1.dev0+gba5374836.d20240125
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pybind11-global==2.11.1
+pybind11==2.11.1
+pycocotools==2.0+nv0.8.0
+pycparser==2.21
+pydantic-core==2.16.2
+pydantic==2.6.1
+pygments==2.17.2
+pylibcugraph==23.12.0
+pylibcugraphops==23.12.0
+pylibraft==23.12.0
+pynvml==11.4.1
+pyparsing==3.1.1
+pytest-flakefinder==1.1.0
+pytest-rerunfailures==13.0
+pytest-shard==0.1.2
+pytest-xdist==3.5.0
+pytest==8.0.0
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-hostlist==1.23.0
+pytorch-quantization==2.1.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+raft-dask==23.12.0
+rapids-dask-dependency==23.12.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+rich==13.7.0
+rmm==23.12.0
+rpds-py==0.17.1
+rsa==4.9
+sacrebleu==2.4.0
+safetensors==0.4.3
+scikit-learn==1.2.0
+scipy==1.12.0
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==2.12.0
+setproctitle==1.3.3
+setuptools==68.2.2
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sortedcontainers==2.4.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+spacy==3.7.2
+sphinx-glpi-theme==0.6
+srsly==2.4.8
+stack-data==0.6.3
+sympy==1.12
+tabulate==0.9.0
+tbb==2021.11.0
+tblib==3.0.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorboard==2.9.0
+tensorrt==8.6.3
+terminado==0.18.0
+termplotlib==0.3.9
+thinc==8.2.3
+threadpoolctl==3.2.0
+thriftpy2==0.4.17
+tinycss2==1.2.1
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+torch-tensorrt==2.3.0a0
+torch==2.3.0a0+ebedce2
+torchdata==0.7.1a0
+torchtext==0.17.0a0
+torchvision==0.18.0a0
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.9.0
+transformer-engine==1.3.0+5b90b7f
+transformers==4.43.3
+treelite-runtime==3.9.1
+treelite==3.9.1
+triton==2.2.0+e28a256
+typer==0.9.0
+types-dataclasses==0.6.6
+typing-extensions==4.9.0
+ucx-py==0.35.0
+uff==0.6.9
+ujson==5.8.0
+urllib3==1.26.18
+wandb==0.16.3
+wasabi==1.1.2
+wcwidth==0.2.13
+weasel==0.3.4
+webencodings==0.5.1
+werkzeug==3.0.1
+wheel==0.42.0
+xdoctest==1.0.2
+xgboost==1.7.6
+yarl==1.9.4
+zict==3.0.0
+zipp==3.17.0

wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+    "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+    "python": "3.10.12",
+    "heartbeatAt": "2024-08-04T05:06:04.333644",
+    "startedAt": "2024-08-04T05:06:03.671763",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "--seq-length",
+        "512",
+        "--sliding-window-size",
+        "4096",
+        "--micro-batch-size",
+        "8",
+        "--global-batch-size",
+        "320",
+        "--train-iters",
+        "2000",
+        "--tokenizer-type",
+        "Llama2Tokenizer",
+        "--tokenizer-model",
+        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
+        "--train-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--valid-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--test-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--lr",
+        "2e-5",
+        "--min-lr",
+        "1e-6",
+        "--lr-decay-style",
+        "cosine",
+        "--lr-warmup-iters",
+        "500",
+        "--lr-decay-iters",
+        "2000",
+        "--weight-decay",
+        "0.1",
+        "--grad-clip-norm",
+        "1.0",
+        "--optimizer",
+        "adam",
+        "--adam-beta1",
+        "0.9",
+        "--adam-beta2",
+        "0.95",
+        "--adam-eps",
+        "1e-6",
+        "--save-interval",
+        "200",
+        "--eval-interval",
+        "200",
+        "--eval-iters",
+        "10",
+        "--bf16",
+        "--mixed-precision",
+        "--base-model",
+        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+        "--save",
+        "/work/llm_recipes/models/tiny-llama",
+        "--load",
+        "/work/llm_recipes/models/tiny-llama",
+        "--fsdp-activation-checkpointing",
+        "--sharding-strategy",
+        "FULL_SHARD",
+        "--checkpoint-type",
+        "LOCAL_STATE_DICT",
+        "--save-n-checkpoints",
+        "10",
+        "--hf-upload-retry-limit",
+        "2",
+        "--hf-repo-id",
+        "koichi12/tiny-llama",
+        "--wandb-entity",
+        "iwakawa-koichi-q5-tohoku-nlp6723",
+        "--wandb-project",
+        "llm_tutorial",
+        "--wandb-name",
+        "tiny-llama_train_2024-08-04-14:05:53"
+    ],
+    "state": "running",
+    "program": "/project/examples/finetuning.py",
+    "codePathLocal": "examples/finetuning.py",
+    "codePath": "examples/finetuning.py",
+    "git": {
+        "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+        "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+    },
+    "email": null,
+    "root": "/project",
+    "host": "gpu-koiwa-00",
+    "username": "koiwa",
+    "executable": "/usr/bin/python",
+    "cpu_count": 18,
+    "cpu_count_logical": 18,
+    "cpu_freq": {
+        "current": 2400.0389999999993,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 0.0625,
+            "used": 1.1444091796875e-05
+        }
+    },
+    "gpu": "NVIDIA A100-SXM4-40GB",
+    "gpu_count": 1,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA A100-SXM4-40GB",
+            "memory_total": 42949672960
+        }
+    ],
+    "memory": {
+        "total": 56.48781967163086
+    }
+}

wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 4}}

wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,186 @@

+2024-08-04 14:06:03,686 INFO    StreamThr :9469 [internal.py:wandb_internal():86] W&B internal server running at pid: 9469, started at: 2024-08-04 14:06:03.685029
+2024-08-04 14:06:03,687 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: status
+2024-08-04 14:06:03,689 INFO    WriterThread:9469 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
+2024-08-04 14:06:03,690 DEBUG   SenderThread:9469 [sender.py:send():382] send: header
+2024-08-04 14:06:03,703 DEBUG   SenderThread:9469 [sender.py:send():382] send: run
+2024-08-04 14:06:04,218 INFO    SenderThread:9469 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_140603-q9i5g6sv/files
+2024-08-04 14:06:04,218 INFO    SenderThread:9469 [sender.py:_start_run_threads():1136] run started: q9i5g6sv with start time 1722747963.684337
+2024-08-04 14:06:04,223 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: check_version
+2024-08-04 14:06:04,223 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: check_version
+2024-08-04 14:06:04,313 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: run_start
+2024-08-04 14:06:04,320 DEBUG   HandlerThread:9469 [system_info.py:__init__():27] System info init
+2024-08-04 14:06:04,320 DEBUG   HandlerThread:9469 [system_info.py:__init__():42] System info init done
+2024-08-04 14:06:04,320 INFO    HandlerThread:9469 [system_monitor.py:start():194] Starting system monitor
+2024-08-04 14:06:04,320 INFO    SystemMonitor:9469 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-08-04 14:06:04,320 INFO    HandlerThread:9469 [system_monitor.py:probe():214] Collecting system info
+2024-08-04 14:06:04,321 INFO    SystemMonitor:9469 [interfaces.py:start():190] Started cpu monitoring
+2024-08-04 14:06:04,321 INFO    SystemMonitor:9469 [interfaces.py:start():190] Started disk monitoring
+2024-08-04 14:06:04,322 INFO    SystemMonitor:9469 [interfaces.py:start():190] Started gpu monitoring
+2024-08-04 14:06:04,322 INFO    SystemMonitor:9469 [interfaces.py:start():190] Started memory monitoring
+2024-08-04 14:06:04,323 INFO    SystemMonitor:9469 [interfaces.py:start():190] Started network monitoring
+2024-08-04 14:06:04,333 DEBUG   HandlerThread:9469 [system_info.py:probe():151] Probing system
+2024-08-04 14:06:04,335 DEBUG   HandlerThread:9469 [system_info.py:_probe_git():136] Probing git
+2024-08-04 14:06:04,347 DEBUG   HandlerThread:9469 [system_info.py:_probe_git():144] Probing git done
+2024-08-04 14:06:04,347 DEBUG   HandlerThread:9469 [system_info.py:probe():199] Probing system done
+2024-08-04 14:06:04,347 DEBUG   HandlerThread:9469 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:06:04.333644', 'startedAt': '2024-08-04T05:06:03.671763', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:05:53'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
+2024-08-04 14:06:04,347 INFO    HandlerThread:9469 [system_monitor.py:probe():224] Finished collecting system info
+2024-08-04 14:06:04,347 INFO    HandlerThread:9469 [system_monitor.py:probe():227] Publishing system info
+2024-08-04 14:06:04,349 INFO    HandlerThread:9469 [system_monitor.py:probe():229] Finished publishing system info
+2024-08-04 14:06:04,354 DEBUG   SenderThread:9469 [sender.py:send():382] send: files
+2024-08-04 14:06:04,354 INFO    SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+2024-08-04 14:06:04,364 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: python_packages
+2024-08-04 14:06:04,364 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 14:06:04,364 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 14:06:04,364 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: python_packages
+2024-08-04 14:06:04,366 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 14:06:04,605 DEBUG   SenderThread:9469 [sender.py:send():382] send: telemetry
+2024-08-04 14:06:04,996 INFO    wandb-upload_0:9469 [upload_job.py:push():131] Uploaded file /tmp/tmpz1emajybwandb/prws540s-wandb-metadata.json
+2024-08-04 14:06:05,220 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
+2024-08-04 14:06:05,220 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json
+2024-08-04 14:06:05,220 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
+2024-08-04 14:06:07,221 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
+2024-08-04 14:06:07,604 DEBUG   SenderThread:9469 [sender.py:send():382] send: config
+2024-08-04 14:06:07,605 DEBUG   SenderThread:9469 [sender.py:send():382] send: config
+2024-08-04 14:06:08,222 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
+2024-08-04 14:06:08,620 DEBUG   SenderThread:9469 [sender.py:send():382] send: exit
+2024-08-04 14:06:08,620 INFO    SenderThread:9469 [sender.py:send_exit():589] handling exit code: 1
+2024-08-04 14:06:08,620 INFO    SenderThread:9469 [sender.py:send_exit():591] handling runtime: 4
+2024-08-04 14:06:08,621 INFO    SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 14:06:08,622 INFO    SenderThread:9469 [sender.py:send_exit():597] send defer
+2024-08-04 14:06:08,622 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,622 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 0
+2024-08-04 14:06:08,622 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,622 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-08-04 14:06:08,622 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 1
+2024-08-04 14:06:08,622 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,622 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 1
+2024-08-04 14:06:08,622 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,622 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-08-04 14:06:08,623 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 2
+2024-08-04 14:06:08,623 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,623 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 2
+2024-08-04 14:06:08,623 INFO    HandlerThread:9469 [system_monitor.py:finish():203] Stopping system monitor
+2024-08-04 14:06:08,623 DEBUG   SystemMonitor:9469 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-08-04 14:06:08,623 INFO    HandlerThread:9469 [interfaces.py:finish():202] Joined cpu monitor
+2024-08-04 14:06:08,623 DEBUG   SystemMonitor:9469 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-08-04 14:06:08,623 INFO    HandlerThread:9469 [interfaces.py:finish():202] Joined disk monitor
+2024-08-04 14:06:08,624 DEBUG   SystemMonitor:9469 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-08-04 14:06:08,656 INFO    HandlerThread:9469 [interfaces.py:finish():202] Joined gpu monitor
+2024-08-04 14:06:08,656 INFO    HandlerThread:9469 [interfaces.py:finish():202] Joined memory monitor
+2024-08-04 14:06:08,656 INFO    HandlerThread:9469 [interfaces.py:finish():202] Joined network monitor
+2024-08-04 14:06:08,657 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,657 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-08-04 14:06:08,657 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 3
+2024-08-04 14:06:08,657 DEBUG   SenderThread:9469 [sender.py:send():382] send: stats
+2024-08-04 14:06:08,657 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,657 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 3
+2024-08-04 14:06:08,657 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,657 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-08-04 14:06:08,657 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 4
+2024-08-04 14:06:08,657 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,657 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 4
+2024-08-04 14:06:08,658 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,658 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-08-04 14:06:08,658 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 5
+2024-08-04 14:06:08,658 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,658 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 5
+2024-08-04 14:06:08,658 DEBUG   SenderThread:9469 [sender.py:send():382] send: summary
+2024-08-04 14:06:08,659 INFO    SenderThread:9469 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 14:06:08,659 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,659 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-08-04 14:06:08,659 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 6
+2024-08-04 14:06:08,659 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,659 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 6
+2024-08-04 14:06:08,659 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,660 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-08-04 14:06:08,662 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 14:06:08,848 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 7
+2024-08-04 14:06:08,849 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:08,849 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 7
+2024-08-04 14:06:08,849 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:08,849 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-08-04 14:06:09,223 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
+2024-08-04 14:06:09,223 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
+2024-08-04 14:06:09,223 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
+2024-08-04 14:06:09,360 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 8
+2024-08-04 14:06:09,361 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:09,361 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 8
+2024-08-04 14:06:09,361 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:09,361 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-08-04 14:06:09,361 INFO    SenderThread:9469 [job_builder.py:build():296] Attempting to build job artifact
+2024-08-04 14:06:09,362 INFO    SenderThread:9469 [job_builder.py:_get_source_type():426] is repo sourced job
+2024-08-04 14:06:09,376 INFO    SenderThread:9469 [job_builder.py:build():402] adding wandb-job metadata file
+2024-08-04 14:06:09,384 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 9
+2024-08-04 14:06:09,384 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:09,384 DEBUG   SenderThread:9469 [sender.py:send():382] send: artifact
+2024-08-04 14:06:09,384 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 9
+2024-08-04 14:06:09,620 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:06:10,224 INFO    Thread-12 :9469 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
+2024-08-04 14:06:10,240 INFO    SenderThread:9469 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
+2024-08-04 14:06:10,240 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:10,240 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-08-04 14:06:10,240 INFO    SenderThread:9469 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-08-04 14:06:11,225 INFO    SenderThread:9469 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_140603-q9i5g6sv/files
+2024-08-04 14:06:11,225 INFO    SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt requirements.txt
+2024-08-04 14:06:11,225 INFO    SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml config.yaml
+2024-08-04 14:06:11,227 INFO    SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-metadata.json wandb-metadata.json
+2024-08-04 14:06:11,227 INFO    SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json wandb-summary.json
+2024-08-04 14:06:11,228 INFO    SenderThread:9469 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log output.log
+2024-08-04 14:06:11,230 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 10
+2024-08-04 14:06:11,230 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:06:11,230 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:11,232 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 10
+2024-08-04 14:06:11,232 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:11,232 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-08-04 14:06:11,232 INFO    SenderThread:9469 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 14:06:11,620 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:06:11,621 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:06:11,713 INFO    wandb-upload_0:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/requirements.txt
+2024-08-04 14:06:11,733 INFO    wandb-upload_1:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/config.yaml
+2024-08-04 14:06:11,829 INFO    wandb-upload_2:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/wandb-summary.json
+2024-08-04 14:06:11,833 INFO    wandb-upload_3:9469 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_140603-q9i5g6sv/files/output.log
+2024-08-04 14:06:12,033 INFO    Thread-11 (_thread_body):9469 [sender.py:transition_state():617] send defer: 11
+2024-08-04 14:06:12,034 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:12,034 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 11
+2024-08-04 14:06:12,034 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:12,034 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-08-04 14:06:12,034 INFO    SenderThread:9469 [file_pusher.py:join():178] waiting for file pusher
+2024-08-04 14:06:12,034 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 12
+2024-08-04 14:06:12,034 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:12,034 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 12
+2024-08-04 14:06:12,035 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:12,035 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-08-04 14:06:12,035 INFO    SenderThread:9469 [file_stream.py:finish():595] file stream finish called
+2024-08-04 14:06:12,204 INFO    SenderThread:9469 [file_stream.py:finish():599] file stream finish is done
+2024-08-04 14:06:12,204 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 13
+2024-08-04 14:06:12,205 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:12,205 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 13
+2024-08-04 14:06:12,205 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:12,205 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-08-04 14:06:12,205 INFO    SenderThread:9469 [sender.py:transition_state():617] send defer: 14
+2024-08-04 14:06:12,205 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:06:12,205 DEBUG   SenderThread:9469 [sender.py:send():382] send: final
+2024-08-04 14:06:12,205 INFO    HandlerThread:9469 [handler.py:handle_request_defer():172] handle defer: 14
+2024-08-04 14:06:12,205 DEBUG   SenderThread:9469 [sender.py:send():382] send: footer
+2024-08-04 14:06:12,206 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:06:12,206 INFO    SenderThread:9469 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-08-04 14:06:12,206 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:06:12,206 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:06:12,206 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:06:12,207 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:06:12,207 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: server_info
+2024-08-04 14:06:12,207 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: get_summary
+2024-08-04 14:06:12,207 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: server_info
+2024-08-04 14:06:12,208 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: sampled_history
+2024-08-04 14:06:12,209 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 14:06:12,209 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: job_info
+2024-08-04 14:06:12,360 DEBUG   SenderThread:9469 [sender.py:send_request():409] send_request: job_info
+2024-08-04 14:06:12,360 INFO    MainThread:9469 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+2024-08-04 14:06:12,360 INFO    MainThread:9469 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+2024-08-04 14:06:12,360 INFO    MainThread:9469 [wandb_run.py:_footer_sync_info():3825] logging synced files
+2024-08-04 14:06:12,360 DEBUG   HandlerThread:9469 [handler.py:handle_request():146] handle_request: shutdown
+2024-08-04 14:06:12,361 INFO    HandlerThread:9469 [handler.py:finish():869] shutting down handler
+2024-08-04 14:06:13,210 INFO    WriterThread:9469 [datastore.py:close():296] close: /project/wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb
+2024-08-04 14:06:13,360 INFO    SenderThread:9469 [sender.py:finish():1572] shutting down sender
+2024-08-04 14:06:13,360 INFO    SenderThread:9469 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 14:06:13,360 INFO    SenderThread:9469 [file_pusher.py:join():178] waiting for file pusher

wandb/run-20240804_140603-q9i5g6sv/logs/debug.log ADDED Viewed

	@@ -0,0 +1,30 @@

+2024-08-04 14:06:03,677 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Configure stats pid to 9398
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_140603-q9i5g6sv/logs/debug.log
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_140603-q9i5g6sv/logs/debug-internal.log
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_init.py:init():566] calling init triggers
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:05:53', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_init.py:init():616] starting backend
+2024-08-04 14:06:03,678 INFO    MainThread:9398 [wandb_init.py:init():620] setting up manager
+2024-08-04 14:06:03,683 INFO    MainThread:9398 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-04 14:06:03,684 INFO    MainThread:9398 [wandb_init.py:init():628] backend started and connected
+2024-08-04 14:06:03,689 INFO    MainThread:9398 [wandb_init.py:init():720] updated telemetry
+2024-08-04 14:06:03,699 INFO    MainThread:9398 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-04 14:06:04,223 INFO    MainThread:9398 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-04 14:06:04,307 INFO    MainThread:9398 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-08-04 14:06:04,307 INFO    MainThread:9398 [wandb_init.py:init():804] starting run threads in backend
+2024-08-04 14:06:04,363 INFO    MainThread:9398 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-04 14:06:04,363 INFO    MainThread:9398 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-04 14:06:04,363 INFO    MainThread:9398 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-04 14:06:04,363 INFO    MainThread:9398 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-04 14:06:04,364 INFO    MainThread:9398 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-04 14:06:07,603 INFO    MainThread:9398 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
+2024-08-04 14:06:07,604 INFO    MainThread:9398 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+2024-08-04 14:06:13,361 WARNING MsgRouterThr:9398 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240804_140603-q9i5g6sv/run-q9i5g6sv.wandb ADDED Viewed

Binary file (20.7 kB). View file

wandb/run-20240804_142250-6p58tz1g/files/config.yaml ADDED Viewed

	@@ -0,0 +1,335 @@

+wandb_version: 1
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 512
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: Llama2Tokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: tiny-llama_train_2024-08-04-14:22:39
+wandb_project:
+  desc: null
+  value: llm_tutorial
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/tiny-llama
+save:
+  desc: null
+  value: /work/llm_recipes/models/tiny-llama
+base_model:
+  desc: null
+  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 200
+save_interval:
+  desc: null
+  value: 200
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: adam
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 2000
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 2000
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 8
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 4096
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/tiny-llama
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: false
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 32000
+gradient_accumulation_steps:
+  desc: null
+  value: 40
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1722748970.443993
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64
+activation_function:
+  desc: null
+  value: silu
+hidden_size:
+  desc: null
+  value: 2048
+model_type:
+  desc: null
+  value: llama
+max_position_embeddings:
+  desc: null
+  value: 2048
+num_attention_heads:
+  desc: null
+  value: 32
+num_hidden_layers:
+  desc: null
+  value: 22
+model_architecture:
+  desc: null
+  value: LlamaForCausalLM

wandb/run-20240804_142250-6p58tz1g/files/output.log ADDED Viewed

	@@ -0,0 +1,135 @@

+Created Hugging Face repository with ID koichi12/tiny-llama.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
+--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+BFloat16 enabled for mixed precision - using bfSixteen policy
+--> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+    train:      640000
+    validation: 35200
+    test:       3200
+> building train, validation, and test datasets for GPT ...
+> finished creating GPT datasets ...
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): LlamaForCausalLM(
+    (model): LlamaModel(
+      (embed_tokens): Embedding(32000, 2048)
+      (layers): ModuleList(
+        (0-21): 22 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): LlamaDecoderLayer(
+              (self_attn): LlamaFlashAttention2(
+                (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                (k_proj): Linear(in_features=2048, out_features=256, bias=False)
+                (v_proj): Linear(in_features=2048, out_features=256, bias=False)
+                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                (rotary_emb): LlamaRotaryEmbedding()
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm()
+              (post_attention_layernorm): LlamaRMSNorm()
+            )
+          )
+        )
+      )
+      (norm): LlamaRMSNorm()
+      (rotary_emb): LlamaRotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
+  )
+)
+model config: LlamaConfig {
+  "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 22,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "vocab_size": 32000
+}
+Let split = None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 281, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
+    batch = next(train_dataloader)
+  File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
+    for x in iter:
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
+    data = self._next_data()
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
+    return self._process_data(data)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
+    data.reraise()
+  File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
+    raise exception
+RuntimeError: Caught RuntimeError in DataLoader worker process 0.
+Original Traceback (most recent call last):
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
+    data = fetcher.fetch(index)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+    return self.collate_fn(data)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
+    return collate(batch, collate_fn_map=default_collate_fn_map)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
+    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
+    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
+    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
+    return torch.stack(batch, 0, out=out)
+RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1

wandb/run-20240804_142250-6p58tz1g/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,271 @@

+absl-py==2.1.0
+accelerate==0.33.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+apex==0.1
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blis==0.7.11
+cachetools==5.3.2
+catalogue==2.0.10
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+cloudpickle==3.0.0
+cmake==3.28.1
+colorama==0.4.6
+comm==0.2.1
+confection==0.1.4
+contourpy==1.2.0
+cubinlinker==0.3.0+2.g405ac64
+cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+cudf==23.12.0
+cugraph-dgl==23.12.0
+cugraph-service-client==23.12.0
+cugraph-service-server==23.12.0
+cugraph==23.12.0
+cuml==23.12.0
+cupy-cuda12x==12.3.0
+cycler==0.12.1
+cymem==2.0.8
+cython==3.0.8
+dask-cuda==23.12.0
+dask-cudf==23.12.0
+dask==2023.11.0
+debugpy==1.8.1
+decorator==5.1.1
+defusedxml==0.7.1
+distributed==2023.11.0
+dm-tree==0.1.8
+docker-pycreds==0.4.0
+einops==0.7.0
+exceptiongroup==1.2.0
+execnet==2.0.2
+executing==2.0.1
+expecttest==0.1.3
+fastjsonschema==2.19.1
+fastrlock==0.8.2
+filelock==3.13.1
+flash-attn==2.4.2
+fonttools==4.48.1
+frozenlist==1.4.1
+fsspec==2023.12.2
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.43
+google-auth-oauthlib==0.4.6
+google-auth==2.27.0
+graphsurgeon==0.4.6
+grpcio==1.60.1
+huggingface-hub==0.24.5
+hypothesis==5.35.1
+idna==3.6
+importlib-metadata==7.0.1
+iniconfig==2.0.0
+intel-openmp==2021.4.0
+ipadic==1.0.0
+ipykernel==6.29.2
+ipython-genutils==0.2.0
+ipython==8.21.0
+jedi==0.19.1
+jinja2==3.1.3
+joblib==1.3.2
+json5==0.9.14
+jsonnet==0.19.1
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-tensorboard==0.2.0
+jupyterlab-pygments==0.3.0
+jupyterlab-server==1.2.0
+jupyterlab==2.3.2
+jupytext==1.16.1
+kiwisolver==1.4.5
+langcodes==3.3.0
+lazy-loader==0.3
+librosa==0.10.1
+llvmlite==0.40.1
+locket==1.0.0
+logzero==1.7.0
+lxml==5.2.2
+markdown-it-py==3.0.0
+markdown==3.5.2
+markupsafe==2.1.4
+matplotlib-inline==0.1.6
+matplotlib==3.8.2
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mecab-python3==1.0.6
+mistune==3.0.2
+mkl-devel==2021.1.1
+mkl-include==2021.1.1
+mkl==2021.1.1
+mock==5.1.0
+more-itertools==9.1.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+murmurhash==1.0.10
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==2.6.3
+ninja==1.11.1.1
+nltk==3.8.1
+notebook==6.4.10
+numba==0.57.1+1.g1ff679645
+numpy==1.24.4
+nvfuser==0.1.4a0+d0bb811
+nvidia-dali-cuda120==1.34.0
+nvidia-pyindex==1.0.9
+nvtx==0.2.5
+oauthlib==3.2.2
+onnx==1.15.0rc2
+opencv==4.7.0
+optree==0.10.0
+packaging==23.2
+pandas==1.5.3
+pandocfilters==1.5.1
+parso==0.8.3
+partd==1.4.1
+peft==0.11.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.2.0
+pluggy==1.4.0
+ply==3.11
+polygraphy==0.49.4
+pooch==1.8.0
+portalocker==2.10.1
+preshed==3.0.9
+prettytable==3.9.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.24.4
+psutil==5.9.4
+ptxcompiler==0.8.1+2.g0d406d6
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==14.0.1.dev0+gba5374836.d20240125
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pybind11-global==2.11.1
+pybind11==2.11.1
+pycocotools==2.0+nv0.8.0
+pycparser==2.21
+pydantic-core==2.16.2
+pydantic==2.6.1
+pygments==2.17.2
+pylibcugraph==23.12.0
+pylibcugraphops==23.12.0
+pylibraft==23.12.0
+pynvml==11.4.1
+pyparsing==3.1.1
+pytest-flakefinder==1.1.0
+pytest-rerunfailures==13.0
+pytest-shard==0.1.2
+pytest-xdist==3.5.0
+pytest==8.0.0
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-hostlist==1.23.0
+pytorch-quantization==2.1.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+raft-dask==23.12.0
+rapids-dask-dependency==23.12.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+rich==13.7.0
+rmm==23.12.0
+rpds-py==0.17.1
+rsa==4.9
+sacrebleu==2.4.0
+safetensors==0.4.3
+scikit-learn==1.2.0
+scipy==1.12.0
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==2.12.0
+setproctitle==1.3.3
+setuptools==68.2.2
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sortedcontainers==2.4.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+spacy==3.7.2
+sphinx-glpi-theme==0.6
+srsly==2.4.8
+stack-data==0.6.3
+sympy==1.12
+tabulate==0.9.0
+tbb==2021.11.0
+tblib==3.0.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorboard==2.9.0
+tensorrt==8.6.3
+terminado==0.18.0
+termplotlib==0.3.9
+thinc==8.2.3
+threadpoolctl==3.2.0
+thriftpy2==0.4.17
+tinycss2==1.2.1
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+torch-tensorrt==2.3.0a0
+torch==2.3.0a0+ebedce2
+torchdata==0.7.1a0
+torchtext==0.17.0a0
+torchvision==0.18.0a0
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.9.0
+transformer-engine==1.3.0+5b90b7f
+transformers==4.43.3
+treelite-runtime==3.9.1
+treelite==3.9.1
+triton==2.2.0+e28a256
+typer==0.9.0
+types-dataclasses==0.6.6
+typing-extensions==4.9.0
+ucx-py==0.35.0
+uff==0.6.9
+ujson==5.8.0
+urllib3==1.26.18
+wandb==0.16.3
+wasabi==1.1.2
+wcwidth==0.2.13
+weasel==0.3.4
+webencodings==0.5.1
+werkzeug==3.0.1
+wheel==0.42.0
+xdoctest==1.0.2
+xgboost==1.7.6
+yarl==1.9.4
+zict==3.0.0
+zipp==3.17.0

wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+    "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+    "python": "3.10.12",
+    "heartbeatAt": "2024-08-04T05:22:51.055103",
+    "startedAt": "2024-08-04T05:22:50.431050",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "--seq-length",
+        "512",
+        "--sliding-window-size",
+        "4096",
+        "--micro-batch-size",
+        "8",
+        "--global-batch-size",
+        "320",
+        "--train-iters",
+        "2000",
+        "--tokenizer-type",
+        "Llama2Tokenizer",
+        "--tokenizer-model",
+        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
+        "--train-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--valid-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--test-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--lr",
+        "2e-5",
+        "--min-lr",
+        "1e-6",
+        "--lr-decay-style",
+        "cosine",
+        "--lr-warmup-iters",
+        "500",
+        "--lr-decay-iters",
+        "2000",
+        "--weight-decay",
+        "0.1",
+        "--grad-clip-norm",
+        "1.0",
+        "--optimizer",
+        "adam",
+        "--adam-beta1",
+        "0.9",
+        "--adam-beta2",
+        "0.95",
+        "--adam-eps",
+        "1e-6",
+        "--save-interval",
+        "200",
+        "--eval-interval",
+        "200",
+        "--eval-iters",
+        "10",
+        "--bf16",
+        "--mixed-precision",
+        "--base-model",
+        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+        "--save",
+        "/work/llm_recipes/models/tiny-llama",
+        "--load",
+        "/work/llm_recipes/models/tiny-llama",
+        "--fsdp-activation-checkpointing",
+        "--sharding-strategy",
+        "FULL_SHARD",
+        "--checkpoint-type",
+        "LOCAL_STATE_DICT",
+        "--save-n-checkpoints",
+        "10",
+        "--hf-upload-retry-limit",
+        "2",
+        "--hf-repo-id",
+        "koichi12/tiny-llama",
+        "--wandb-entity",
+        "iwakawa-koichi-q5-tohoku-nlp6723",
+        "--wandb-project",
+        "llm_tutorial",
+        "--wandb-name",
+        "tiny-llama_train_2024-08-04-14:22:39"
+    ],
+    "state": "running",
+    "program": "/project/examples/finetuning.py",
+    "codePathLocal": "examples/finetuning.py",
+    "codePath": "examples/finetuning.py",
+    "git": {
+        "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+        "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+    },
+    "email": null,
+    "root": "/project",
+    "host": "gpu-koiwa-00",
+    "username": "koiwa",
+    "executable": "/usr/bin/python",
+    "cpu_count": 18,
+    "cpu_count_logical": 18,
+    "cpu_freq": {
+        "current": 2400.0389999999993,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 0.0625,
+            "used": 1.1444091796875e-05
+        }
+    },
+    "gpu": "NVIDIA A100-SXM4-40GB",
+    "gpu_count": 1,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA A100-SXM4-40GB",
+            "memory_total": 42949672960
+        }
+    ],
+    "memory": {
+        "total": 56.48781967163086
+    }
+}

wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 2}}

wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,186 @@

+2024-08-04 14:22:50,445 INFO    StreamThr :10451 [internal.py:wandb_internal():86] W&B internal server running at pid: 10451, started at: 2024-08-04 14:22:50.444819
+2024-08-04 14:22:50,447 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: status
+2024-08-04 14:22:50,449 INFO    WriterThread:10451 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
+2024-08-04 14:22:50,450 DEBUG   SenderThread:10451 [sender.py:send():382] send: header
+2024-08-04 14:22:50,463 DEBUG   SenderThread:10451 [sender.py:send():382] send: run
+2024-08-04 14:22:50,941 INFO    SenderThread:10451 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_142250-6p58tz1g/files
+2024-08-04 14:22:50,941 INFO    SenderThread:10451 [sender.py:_start_run_threads():1136] run started: 6p58tz1g with start time 1722748970.443993
+2024-08-04 14:22:50,946 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: check_version
+2024-08-04 14:22:50,946 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: check_version
+2024-08-04 14:22:51,034 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: run_start
+2024-08-04 14:22:51,041 DEBUG   HandlerThread:10451 [system_info.py:__init__():27] System info init
+2024-08-04 14:22:51,041 DEBUG   HandlerThread:10451 [system_info.py:__init__():42] System info init done
+2024-08-04 14:22:51,041 INFO    HandlerThread:10451 [system_monitor.py:start():194] Starting system monitor
+2024-08-04 14:22:51,041 INFO    SystemMonitor:10451 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-08-04 14:22:51,042 INFO    HandlerThread:10451 [system_monitor.py:probe():214] Collecting system info
+2024-08-04 14:22:51,042 INFO    SystemMonitor:10451 [interfaces.py:start():190] Started cpu monitoring
+2024-08-04 14:22:51,043 INFO    SystemMonitor:10451 [interfaces.py:start():190] Started disk monitoring
+2024-08-04 14:22:51,044 INFO    SystemMonitor:10451 [interfaces.py:start():190] Started gpu monitoring
+2024-08-04 14:22:51,044 INFO    SystemMonitor:10451 [interfaces.py:start():190] Started memory monitoring
+2024-08-04 14:22:51,045 INFO    SystemMonitor:10451 [interfaces.py:start():190] Started network monitoring
+2024-08-04 14:22:51,055 DEBUG   HandlerThread:10451 [system_info.py:probe():151] Probing system
+2024-08-04 14:22:51,059 DEBUG   HandlerThread:10451 [system_info.py:_probe_git():136] Probing git
+2024-08-04 14:22:51,071 DEBUG   HandlerThread:10451 [system_info.py:_probe_git():144] Probing git done
+2024-08-04 14:22:51,071 DEBUG   HandlerThread:10451 [system_info.py:probe():199] Probing system done
+2024-08-04 14:22:51,071 DEBUG   HandlerThread:10451 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:22:51.055103', 'startedAt': '2024-08-04T05:22:50.431050', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:22:39'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
+2024-08-04 14:22:51,072 INFO    HandlerThread:10451 [system_monitor.py:probe():224] Finished collecting system info
+2024-08-04 14:22:51,072 INFO    HandlerThread:10451 [system_monitor.py:probe():227] Publishing system info
+2024-08-04 14:22:51,073 INFO    HandlerThread:10451 [system_monitor.py:probe():229] Finished publishing system info
+2024-08-04 14:22:51,079 DEBUG   SenderThread:10451 [sender.py:send():382] send: files
+2024-08-04 14:22:51,079 INFO    SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+2024-08-04 14:22:51,089 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: python_packages
+2024-08-04 14:22:51,089 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 14:22:51,089 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: python_packages
+2024-08-04 14:22:51,090 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 14:22:51,091 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 14:22:51,412 DEBUG   SenderThread:10451 [sender.py:send():382] send: telemetry
+2024-08-04 14:22:51,834 INFO    wandb-upload_0:10451 [upload_job.py:push():131] Uploaded file /tmp/tmpvai5nc9ewandb/lc3l5ghh-wandb-metadata.json
+2024-08-04 14:22:51,943 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
+2024-08-04 14:22:51,943 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json
+2024-08-04 14:22:51,943 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
+2024-08-04 14:22:53,535 DEBUG   SenderThread:10451 [sender.py:send():382] send: config
+2024-08-04 14:22:53,536 DEBUG   SenderThread:10451 [sender.py:send():382] send: config
+2024-08-04 14:22:53,643 DEBUG   SenderThread:10451 [sender.py:send():382] send: exit
+2024-08-04 14:22:53,643 INFO    SenderThread:10451 [sender.py:send_exit():589] handling exit code: 1
+2024-08-04 14:22:53,643 INFO    SenderThread:10451 [sender.py:send_exit():591] handling runtime: 2
+2024-08-04 14:22:53,644 INFO    SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 14:22:53,645 INFO    SenderThread:10451 [sender.py:send_exit():597] send defer
+2024-08-04 14:22:53,645 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,645 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 0
+2024-08-04 14:22:53,645 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,645 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-08-04 14:22:53,645 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 1
+2024-08-04 14:22:53,645 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,645 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 1
+2024-08-04 14:22:53,645 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,645 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-08-04 14:22:53,645 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 2
+2024-08-04 14:22:53,645 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,645 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 2
+2024-08-04 14:22:53,645 INFO    HandlerThread:10451 [system_monitor.py:finish():203] Stopping system monitor
+2024-08-04 14:22:53,646 DEBUG   SystemMonitor:10451 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-08-04 14:22:53,646 INFO    HandlerThread:10451 [interfaces.py:finish():202] Joined cpu monitor
+2024-08-04 14:22:53,646 DEBUG   SystemMonitor:10451 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-08-04 14:22:53,646 INFO    HandlerThread:10451 [interfaces.py:finish():202] Joined disk monitor
+2024-08-04 14:22:53,646 DEBUG   SystemMonitor:10451 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-08-04 14:22:53,679 INFO    HandlerThread:10451 [interfaces.py:finish():202] Joined gpu monitor
+2024-08-04 14:22:53,679 INFO    HandlerThread:10451 [interfaces.py:finish():202] Joined memory monitor
+2024-08-04 14:22:53,679 INFO    HandlerThread:10451 [interfaces.py:finish():202] Joined network monitor
+2024-08-04 14:22:53,680 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,680 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-08-04 14:22:53,680 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 3
+2024-08-04 14:22:53,680 DEBUG   SenderThread:10451 [sender.py:send():382] send: stats
+2024-08-04 14:22:53,680 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,680 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 3
+2024-08-04 14:22:53,680 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,681 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-08-04 14:22:53,681 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 4
+2024-08-04 14:22:53,681 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,681 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 4
+2024-08-04 14:22:53,681 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,681 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-08-04 14:22:53,681 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 5
+2024-08-04 14:22:53,681 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,681 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 5
+2024-08-04 14:22:53,681 DEBUG   SenderThread:10451 [sender.py:send():382] send: summary
+2024-08-04 14:22:53,682 INFO    SenderThread:10451 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 14:22:53,682 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,682 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-08-04 14:22:53,682 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 6
+2024-08-04 14:22:53,683 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,683 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 6
+2024-08-04 14:22:53,683 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,683 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-08-04 14:22:53,685 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 14:22:53,891 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 7
+2024-08-04 14:22:53,891 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:53,891 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 7
+2024-08-04 14:22:53,892 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:53,892 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-08-04 14:22:53,944 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml
+2024-08-04 14:22:53,944 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
+2024-08-04 14:22:53,944 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
+2024-08-04 14:22:54,643 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:22:55,782 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 8
+2024-08-04 14:22:55,783 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:22:55,783 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:55,783 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 8
+2024-08-04 14:22:55,783 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:55,783 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-08-04 14:22:55,783 INFO    SenderThread:10451 [job_builder.py:build():296] Attempting to build job artifact
+2024-08-04 14:22:55,784 INFO    SenderThread:10451 [job_builder.py:_get_source_type():426] is repo sourced job
+2024-08-04 14:22:55,883 INFO    SenderThread:10451 [job_builder.py:build():402] adding wandb-job metadata file
+2024-08-04 14:22:55,891 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 9
+2024-08-04 14:22:55,892 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:55,892 DEBUG   SenderThread:10451 [sender.py:send():382] send: artifact
+2024-08-04 14:22:55,892 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 9
+2024-08-04 14:22:55,945 INFO    Thread-12 :10451 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
+2024-08-04 14:22:56,644 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:22:57,777 INFO    SenderThread:10451 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
+2024-08-04 14:22:57,777 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:57,777 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-08-04 14:22:57,777 INFO    SenderThread:10451 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-08-04 14:22:57,946 INFO    SenderThread:10451 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_142250-6p58tz1g/files
+2024-08-04 14:22:57,946 INFO    SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt requirements.txt
+2024-08-04 14:22:57,947 INFO    SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml config.yaml
+2024-08-04 14:22:57,948 INFO    SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-metadata.json wandb-metadata.json
+2024-08-04 14:22:57,948 INFO    SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json wandb-summary.json
+2024-08-04 14:22:57,950 INFO    SenderThread:10451 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_142250-6p58tz1g/files/output.log output.log
+2024-08-04 14:22:57,952 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 10
+2024-08-04 14:22:57,952 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:22:57,952 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:57,952 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 10
+2024-08-04 14:22:57,954 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:57,954 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-08-04 14:22:57,954 INFO    SenderThread:10451 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 14:22:58,363 INFO    wandb-upload_1:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/config.yaml
+2024-08-04 14:22:58,459 INFO    wandb-upload_0:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/requirements.txt
+2024-08-04 14:22:58,506 INFO    wandb-upload_2:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/wandb-summary.json
+2024-08-04 14:22:58,525 INFO    wandb-upload_3:10451 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_142250-6p58tz1g/files/output.log
+2024-08-04 14:22:58,645 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:22:58,645 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:22:58,725 INFO    Thread-11 (_thread_body):10451 [sender.py:transition_state():617] send defer: 11
+2024-08-04 14:22:58,725 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:58,725 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 11
+2024-08-04 14:22:58,726 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:58,726 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-08-04 14:22:58,726 INFO    SenderThread:10451 [file_pusher.py:join():178] waiting for file pusher
+2024-08-04 14:22:58,726 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 12
+2024-08-04 14:22:58,726 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:58,726 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 12
+2024-08-04 14:22:58,726 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:58,726 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-08-04 14:22:58,726 INFO    SenderThread:10451 [file_stream.py:finish():595] file stream finish called
+2024-08-04 14:22:58,910 INFO    SenderThread:10451 [file_stream.py:finish():599] file stream finish is done
+2024-08-04 14:22:58,911 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 13
+2024-08-04 14:22:58,911 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:58,911 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 13
+2024-08-04 14:22:58,911 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:58,911 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-08-04 14:22:58,911 INFO    SenderThread:10451 [sender.py:transition_state():617] send defer: 14
+2024-08-04 14:22:58,911 DEBUG   SenderThread:10451 [sender.py:send():382] send: final
+2024-08-04 14:22:58,911 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:22:58,912 DEBUG   SenderThread:10451 [sender.py:send():382] send: footer
+2024-08-04 14:22:58,912 INFO    HandlerThread:10451 [handler.py:handle_request_defer():172] handle defer: 14
+2024-08-04 14:22:58,912 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:22:58,912 INFO    SenderThread:10451 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-08-04 14:22:58,912 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:22:58,912 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:22:58,913 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:22:58,913 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:22:58,913 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: server_info
+2024-08-04 14:22:58,913 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: get_summary
+2024-08-04 14:22:58,914 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: server_info
+2024-08-04 14:22:58,915 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: sampled_history
+2024-08-04 14:22:58,915 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 14:22:58,916 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: job_info
+2024-08-04 14:22:59,080 DEBUG   SenderThread:10451 [sender.py:send_request():409] send_request: job_info
+2024-08-04 14:22:59,081 INFO    MainThread:10451 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+2024-08-04 14:22:59,081 INFO    MainThread:10451 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+2024-08-04 14:22:59,081 INFO    MainThread:10451 [wandb_run.py:_footer_sync_info():3825] logging synced files
+2024-08-04 14:22:59,081 DEBUG   HandlerThread:10451 [handler.py:handle_request():146] handle_request: shutdown
+2024-08-04 14:22:59,081 INFO    HandlerThread:10451 [handler.py:finish():869] shutting down handler
+2024-08-04 14:22:59,916 INFO    WriterThread:10451 [datastore.py:close():296] close: /project/wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb
+2024-08-04 14:23:00,081 INFO    SenderThread:10451 [sender.py:finish():1572] shutting down sender
+2024-08-04 14:23:00,081 INFO    SenderThread:10451 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 14:23:00,081 INFO    SenderThread:10451 [file_pusher.py:join():178] waiting for file pusher

wandb/run-20240804_142250-6p58tz1g/logs/debug.log ADDED Viewed

	@@ -0,0 +1,30 @@

+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Configure stats pid to 10380
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_142250-6p58tz1g/logs/debug.log
+2024-08-04 14:22:50,437 INFO    MainThread:10380 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_142250-6p58tz1g/logs/debug-internal.log
+2024-08-04 14:22:50,438 INFO    MainThread:10380 [wandb_init.py:init():566] calling init triggers
+2024-08-04 14:22:50,438 INFO    MainThread:10380 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:22:39', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
+2024-08-04 14:22:50,438 INFO    MainThread:10380 [wandb_init.py:init():616] starting backend
+2024-08-04 14:22:50,438 INFO    MainThread:10380 [wandb_init.py:init():620] setting up manager
+2024-08-04 14:22:50,443 INFO    MainThread:10380 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-04 14:22:50,443 INFO    MainThread:10380 [wandb_init.py:init():628] backend started and connected
+2024-08-04 14:22:50,448 INFO    MainThread:10380 [wandb_init.py:init():720] updated telemetry
+2024-08-04 14:22:50,459 INFO    MainThread:10380 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-04 14:22:50,946 INFO    MainThread:10380 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-04 14:22:51,027 INFO    MainThread:10380 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-08-04 14:22:51,027 INFO    MainThread:10380 [wandb_init.py:init():804] starting run threads in backend
+2024-08-04 14:22:51,088 INFO    MainThread:10380 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-04 14:22:51,088 INFO    MainThread:10380 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-04 14:22:51,088 INFO    MainThread:10380 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-04 14:22:51,088 INFO    MainThread:10380 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-04 14:22:51,090 INFO    MainThread:10380 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-04 14:22:53,535 INFO    MainThread:10380 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
+2024-08-04 14:22:53,535 INFO    MainThread:10380 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+2024-08-04 14:23:00,082 WARNING MsgRouterThr:10380 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240804_142250-6p58tz1g/run-6p58tz1g.wandb ADDED Viewed

Binary file (20.5 kB). View file

wandb/run-20240804_143607-h7fxlkpt/files/config.yaml ADDED Viewed

	@@ -0,0 +1,335 @@

+wandb_version: 1
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '4013541'
+  - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 512
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: Llama2Tokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: tiny-llama_train_2024-08-04-14:35:56
+wandb_project:
+  desc: null
+  value: llm_tutorial
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/tiny-llama
+save:
+  desc: null
+  value: /work/llm_recipes/models/tiny-llama
+base_model:
+  desc: null
+  value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 200
+save_interval:
+  desc: null
+  value: 200
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: adam
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 2000
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 2000
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 8
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 4096
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/tiny-llama
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: false
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 32000
+gradient_accumulation_steps:
+  desc: null
+  value: 40
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1722749767.220741
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64
+activation_function:
+  desc: null
+  value: silu
+hidden_size:
+  desc: null
+  value: 2048
+model_type:
+  desc: null
+  value: llama
+max_position_embeddings:
+  desc: null
+  value: 2048
+num_attention_heads:
+  desc: null
+  value: 32
+num_hidden_layers:
+  desc: null
+  value: 22
+model_architecture:
+  desc: null
+  value: LlamaForCausalLM

wandb/run-20240804_143607-h7fxlkpt/files/output.log ADDED Viewed

	@@ -0,0 +1,135 @@

+Created Hugging Face repository with ID koichi12/tiny-llama.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
+--> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
+--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
+You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+BFloat16 enabled for mixed precision - using bfSixteen policy
+--> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+    train:      640000
+    validation: 35200
+    test:       3200
+> building train, validation, and test datasets for GPT ...
+> finished creating GPT datasets ...
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
+File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): LlamaForCausalLM(
+    (model): LlamaModel(
+      (embed_tokens): Embedding(32000, 2048)
+      (layers): ModuleList(
+        (0-21): 22 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): LlamaDecoderLayer(
+              (self_attn): LlamaFlashAttention2(
+                (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                (k_proj): Linear(in_features=2048, out_features=256, bias=False)
+                (v_proj): Linear(in_features=2048, out_features=256, bias=False)
+                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+                (rotary_emb): LlamaRotaryEmbedding()
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+                (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm()
+              (post_attention_layernorm): LlamaRMSNorm()
+            )
+          )
+        )
+      )
+      (norm): LlamaRMSNorm()
+      (rotary_emb): LlamaRotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
+  )
+)
+model config: LlamaConfig {
+  "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 22,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "vocab_size": 32000
+}
+Let split = None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 281, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
+    batch = next(train_dataloader)
+  File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
+    for x in iter:
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
+    data = self._next_data()
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
+    return self._process_data(data)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
+    data.reraise()
+  File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
+    raise exception
+RuntimeError: Caught RuntimeError in DataLoader worker process 0.
+Original Traceback (most recent call last):
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
+    data = fetcher.fetch(index)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+    return self.collate_fn(data)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
+    return collate(batch, collate_fn_map=default_collate_fn_map)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
+    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
+    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
+    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
+  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
+    return torch.stack(batch, 0, out=out)
+RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1

wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,271 @@

+absl-py==2.1.0
+accelerate==0.33.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+apex==0.1
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blis==0.7.11
+cachetools==5.3.2
+catalogue==2.0.10
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+cloudpickle==3.0.0
+cmake==3.28.1
+colorama==0.4.6
+comm==0.2.1
+confection==0.1.4
+contourpy==1.2.0
+cubinlinker==0.3.0+2.g405ac64
+cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+cudf==23.12.0
+cugraph-dgl==23.12.0
+cugraph-service-client==23.12.0
+cugraph-service-server==23.12.0
+cugraph==23.12.0
+cuml==23.12.0
+cupy-cuda12x==12.3.0
+cycler==0.12.1
+cymem==2.0.8
+cython==3.0.8
+dask-cuda==23.12.0
+dask-cudf==23.12.0
+dask==2023.11.0
+debugpy==1.8.1
+decorator==5.1.1
+defusedxml==0.7.1
+distributed==2023.11.0
+dm-tree==0.1.8
+docker-pycreds==0.4.0
+einops==0.7.0
+exceptiongroup==1.2.0
+execnet==2.0.2
+executing==2.0.1
+expecttest==0.1.3
+fastjsonschema==2.19.1
+fastrlock==0.8.2
+filelock==3.13.1
+flash-attn==2.4.2
+fonttools==4.48.1
+frozenlist==1.4.1
+fsspec==2023.12.2
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.43
+google-auth-oauthlib==0.4.6
+google-auth==2.27.0
+graphsurgeon==0.4.6
+grpcio==1.60.1
+huggingface-hub==0.24.5
+hypothesis==5.35.1
+idna==3.6
+importlib-metadata==7.0.1
+iniconfig==2.0.0
+intel-openmp==2021.4.0
+ipadic==1.0.0
+ipykernel==6.29.2
+ipython-genutils==0.2.0
+ipython==8.21.0
+jedi==0.19.1
+jinja2==3.1.3
+joblib==1.3.2
+json5==0.9.14
+jsonnet==0.19.1
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-tensorboard==0.2.0
+jupyterlab-pygments==0.3.0
+jupyterlab-server==1.2.0
+jupyterlab==2.3.2
+jupytext==1.16.1
+kiwisolver==1.4.5
+langcodes==3.3.0
+lazy-loader==0.3
+librosa==0.10.1
+llvmlite==0.40.1
+locket==1.0.0
+logzero==1.7.0
+lxml==5.2.2
+markdown-it-py==3.0.0
+markdown==3.5.2
+markupsafe==2.1.4
+matplotlib-inline==0.1.6
+matplotlib==3.8.2
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mecab-python3==1.0.6
+mistune==3.0.2
+mkl-devel==2021.1.1
+mkl-include==2021.1.1
+mkl==2021.1.1
+mock==5.1.0
+more-itertools==9.1.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+murmurhash==1.0.10
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==2.6.3
+ninja==1.11.1.1
+nltk==3.8.1
+notebook==6.4.10
+numba==0.57.1+1.g1ff679645
+numpy==1.24.4
+nvfuser==0.1.4a0+d0bb811
+nvidia-dali-cuda120==1.34.0
+nvidia-pyindex==1.0.9
+nvtx==0.2.5
+oauthlib==3.2.2
+onnx==1.15.0rc2
+opencv==4.7.0
+optree==0.10.0
+packaging==23.2
+pandas==1.5.3
+pandocfilters==1.5.1
+parso==0.8.3
+partd==1.4.1
+peft==0.11.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.2.0
+pluggy==1.4.0
+ply==3.11
+polygraphy==0.49.4
+pooch==1.8.0
+portalocker==2.10.1
+preshed==3.0.9
+prettytable==3.9.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.24.4
+psutil==5.9.4
+ptxcompiler==0.8.1+2.g0d406d6
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==14.0.1.dev0+gba5374836.d20240125
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pybind11-global==2.11.1
+pybind11==2.11.1
+pycocotools==2.0+nv0.8.0
+pycparser==2.21
+pydantic-core==2.16.2
+pydantic==2.6.1
+pygments==2.17.2
+pylibcugraph==23.12.0
+pylibcugraphops==23.12.0
+pylibraft==23.12.0
+pynvml==11.4.1
+pyparsing==3.1.1
+pytest-flakefinder==1.1.0
+pytest-rerunfailures==13.0
+pytest-shard==0.1.2
+pytest-xdist==3.5.0
+pytest==8.0.0
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-hostlist==1.23.0
+pytorch-quantization==2.1.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+raft-dask==23.12.0
+rapids-dask-dependency==23.12.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+rich==13.7.0
+rmm==23.12.0
+rpds-py==0.17.1
+rsa==4.9
+sacrebleu==2.4.0
+safetensors==0.4.3
+scikit-learn==1.2.0
+scipy==1.12.0
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==2.12.0
+setproctitle==1.3.3
+setuptools==68.2.2
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sortedcontainers==2.4.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+spacy==3.7.2
+sphinx-glpi-theme==0.6
+srsly==2.4.8
+stack-data==0.6.3
+sympy==1.12
+tabulate==0.9.0
+tbb==2021.11.0
+tblib==3.0.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorboard==2.9.0
+tensorrt==8.6.3
+terminado==0.18.0
+termplotlib==0.3.9
+thinc==8.2.3
+threadpoolctl==3.2.0
+thriftpy2==0.4.17
+tinycss2==1.2.1
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+torch-tensorrt==2.3.0a0
+torch==2.3.0a0+ebedce2
+torchdata==0.7.1a0
+torchtext==0.17.0a0
+torchvision==0.18.0a0
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.9.0
+transformer-engine==1.3.0+5b90b7f
+transformers==4.43.3
+treelite-runtime==3.9.1
+treelite==3.9.1
+triton==2.2.0+e28a256
+typer==0.9.0
+types-dataclasses==0.6.6
+typing-extensions==4.9.0
+ucx-py==0.35.0
+uff==0.6.9
+ujson==5.8.0
+urllib3==1.26.18
+wandb==0.16.3
+wasabi==1.1.2
+wcwidth==0.2.13
+weasel==0.3.4
+webencodings==0.5.1
+werkzeug==3.0.1
+wheel==0.42.0
+xdoctest==1.0.2
+xgboost==1.7.6
+yarl==1.9.4
+zict==3.0.0
+zipp==3.17.0

wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+    "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+    "python": "3.10.12",
+    "heartbeatAt": "2024-08-04T05:36:07.811618",
+    "startedAt": "2024-08-04T05:36:07.207201",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "--seq-length",
+        "512",
+        "--sliding-window-size",
+        "4096",
+        "--micro-batch-size",
+        "8",
+        "--global-batch-size",
+        "320",
+        "--train-iters",
+        "2000",
+        "--tokenizer-type",
+        "Llama2Tokenizer",
+        "--tokenizer-model",
+        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
+        "--train-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--valid-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--test-data-path",
+        "4013541",
+        "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+        "--lr",
+        "2e-5",
+        "--min-lr",
+        "1e-6",
+        "--lr-decay-style",
+        "cosine",
+        "--lr-warmup-iters",
+        "500",
+        "--lr-decay-iters",
+        "2000",
+        "--weight-decay",
+        "0.1",
+        "--grad-clip-norm",
+        "1.0",
+        "--optimizer",
+        "adam",
+        "--adam-beta1",
+        "0.9",
+        "--adam-beta2",
+        "0.95",
+        "--adam-eps",
+        "1e-6",
+        "--save-interval",
+        "200",
+        "--eval-interval",
+        "200",
+        "--eval-iters",
+        "10",
+        "--bf16",
+        "--mixed-precision",
+        "--base-model",
+        "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
+        "--save",
+        "/work/llm_recipes/models/tiny-llama",
+        "--load",
+        "/work/llm_recipes/models/tiny-llama",
+        "--fsdp-activation-checkpointing",
+        "--sharding-strategy",
+        "FULL_SHARD",
+        "--checkpoint-type",
+        "LOCAL_STATE_DICT",
+        "--save-n-checkpoints",
+        "10",
+        "--hf-upload-retry-limit",
+        "2",
+        "--hf-repo-id",
+        "koichi12/tiny-llama",
+        "--wandb-entity",
+        "iwakawa-koichi-q5-tohoku-nlp6723",
+        "--wandb-project",
+        "llm_tutorial",
+        "--wandb-name",
+        "tiny-llama_train_2024-08-04-14:35:56"
+    ],
+    "state": "running",
+    "program": "/project/examples/finetuning.py",
+    "codePathLocal": "examples/finetuning.py",
+    "codePath": "examples/finetuning.py",
+    "git": {
+        "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+        "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+    },
+    "email": null,
+    "root": "/project",
+    "host": "gpu-koiwa-00",
+    "username": "koiwa",
+    "executable": "/usr/bin/python",
+    "cpu_count": 18,
+    "cpu_count_logical": 18,
+    "cpu_freq": {
+        "current": 2400.0389999999993,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 0.0625,
+            "used": 1.1444091796875e-05
+        }
+    },
+    "gpu": "NVIDIA A100-SXM4-40GB",
+    "gpu_count": 1,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA A100-SXM4-40GB",
+            "memory_total": 42949672960
+        }
+    ],
+    "memory": {
+        "total": 56.48781967163086
+    }
+}

wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 2}}

wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,186 @@

+2024-08-04 14:36:07,222 INFO    StreamThr :11584 [internal.py:wandb_internal():86] W&B internal server running at pid: 11584, started at: 2024-08-04 14:36:07.221438
+2024-08-04 14:36:07,223 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: status
+2024-08-04 14:36:07,225 INFO    WriterThread:11584 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
+2024-08-04 14:36:07,226 DEBUG   SenderThread:11584 [sender.py:send():382] send: header
+2024-08-04 14:36:07,240 DEBUG   SenderThread:11584 [sender.py:send():382] send: run
+2024-08-04 14:36:07,696 INFO    SenderThread:11584 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_143607-h7fxlkpt/files
+2024-08-04 14:36:07,696 INFO    SenderThread:11584 [sender.py:_start_run_threads():1136] run started: h7fxlkpt with start time 1722749767.220741
+2024-08-04 14:36:07,701 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: check_version
+2024-08-04 14:36:07,701 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: check_version
+2024-08-04 14:36:07,791 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: run_start
+2024-08-04 14:36:07,798 DEBUG   HandlerThread:11584 [system_info.py:__init__():27] System info init
+2024-08-04 14:36:07,798 DEBUG   HandlerThread:11584 [system_info.py:__init__():42] System info init done
+2024-08-04 14:36:07,798 INFO    HandlerThread:11584 [system_monitor.py:start():194] Starting system monitor
+2024-08-04 14:36:07,798 INFO    SystemMonitor:11584 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-08-04 14:36:07,799 INFO    HandlerThread:11584 [system_monitor.py:probe():214] Collecting system info
+2024-08-04 14:36:07,799 INFO    SystemMonitor:11584 [interfaces.py:start():190] Started cpu monitoring
+2024-08-04 14:36:07,799 INFO    SystemMonitor:11584 [interfaces.py:start():190] Started disk monitoring
+2024-08-04 14:36:07,800 INFO    SystemMonitor:11584 [interfaces.py:start():190] Started gpu monitoring
+2024-08-04 14:36:07,801 INFO    SystemMonitor:11584 [interfaces.py:start():190] Started memory monitoring
+2024-08-04 14:36:07,802 INFO    SystemMonitor:11584 [interfaces.py:start():190] Started network monitoring
+2024-08-04 14:36:07,811 DEBUG   HandlerThread:11584 [system_info.py:probe():151] Probing system
+2024-08-04 14:36:07,813 DEBUG   HandlerThread:11584 [system_info.py:_probe_git():136] Probing git
+2024-08-04 14:36:07,825 DEBUG   HandlerThread:11584 [system_info.py:_probe_git():144] Probing git done
+2024-08-04 14:36:07,825 DEBUG   HandlerThread:11584 [system_info.py:probe():199] Probing system done
+2024-08-04 14:36:07,825 DEBUG   HandlerThread:11584 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:36:07.811618', 'startedAt': '2024-08-04T05:36:07.207201', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:35:56'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48781967163086}}
+2024-08-04 14:36:07,825 INFO    HandlerThread:11584 [system_monitor.py:probe():224] Finished collecting system info
+2024-08-04 14:36:07,825 INFO    HandlerThread:11584 [system_monitor.py:probe():227] Publishing system info
+2024-08-04 14:36:07,827 INFO    HandlerThread:11584 [system_monitor.py:probe():229] Finished publishing system info
+2024-08-04 14:36:07,833 DEBUG   SenderThread:11584 [sender.py:send():382] send: files
+2024-08-04 14:36:07,833 INFO    SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+2024-08-04 14:36:07,842 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: python_packages
+2024-08-04 14:36:07,842 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 14:36:07,842 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 14:36:07,843 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: python_packages
+2024-08-04 14:36:07,845 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 14:36:08,168 DEBUG   SenderThread:11584 [sender.py:send():382] send: telemetry
+2024-08-04 14:36:08,499 INFO    wandb-upload_0:11584 [upload_job.py:push():131] Uploaded file /tmp/tmp7k_0gn43wandb/ux980mno-wandb-metadata.json
+2024-08-04 14:36:08,698 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
+2024-08-04 14:36:08,698 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
+2024-08-04 14:36:08,698 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json
+2024-08-04 14:36:10,261 DEBUG   SenderThread:11584 [sender.py:send():382] send: config
+2024-08-04 14:36:10,262 DEBUG   SenderThread:11584 [sender.py:send():382] send: config
+2024-08-04 14:36:10,349 DEBUG   SenderThread:11584 [sender.py:send():382] send: exit
+2024-08-04 14:36:10,349 INFO    SenderThread:11584 [sender.py:send_exit():589] handling exit code: 1
+2024-08-04 14:36:10,349 INFO    SenderThread:11584 [sender.py:send_exit():591] handling runtime: 2
+2024-08-04 14:36:10,351 INFO    SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 14:36:10,351 INFO    SenderThread:11584 [sender.py:send_exit():597] send defer
+2024-08-04 14:36:10,351 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,351 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 0
+2024-08-04 14:36:10,351 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,351 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-08-04 14:36:10,351 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 1
+2024-08-04 14:36:10,352 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,352 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 1
+2024-08-04 14:36:10,352 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,352 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-08-04 14:36:10,352 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 2
+2024-08-04 14:36:10,352 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,352 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 2
+2024-08-04 14:36:10,352 INFO    HandlerThread:11584 [system_monitor.py:finish():203] Stopping system monitor
+2024-08-04 14:36:10,352 DEBUG   SystemMonitor:11584 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-08-04 14:36:10,352 INFO    HandlerThread:11584 [interfaces.py:finish():202] Joined cpu monitor
+2024-08-04 14:36:10,352 DEBUG   SystemMonitor:11584 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-08-04 14:36:10,353 INFO    HandlerThread:11584 [interfaces.py:finish():202] Joined disk monitor
+2024-08-04 14:36:10,353 DEBUG   SystemMonitor:11584 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-08-04 14:36:10,385 INFO    HandlerThread:11584 [interfaces.py:finish():202] Joined gpu monitor
+2024-08-04 14:36:10,385 INFO    HandlerThread:11584 [interfaces.py:finish():202] Joined memory monitor
+2024-08-04 14:36:10,386 INFO    HandlerThread:11584 [interfaces.py:finish():202] Joined network monitor
+2024-08-04 14:36:10,386 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,386 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-08-04 14:36:10,386 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 3
+2024-08-04 14:36:10,386 DEBUG   SenderThread:11584 [sender.py:send():382] send: stats
+2024-08-04 14:36:10,386 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,386 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 3
+2024-08-04 14:36:10,387 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,387 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-08-04 14:36:10,387 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 4
+2024-08-04 14:36:10,387 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,387 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 4
+2024-08-04 14:36:10,387 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,387 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-08-04 14:36:10,387 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 5
+2024-08-04 14:36:10,387 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,387 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 5
+2024-08-04 14:36:10,387 DEBUG   SenderThread:11584 [sender.py:send():382] send: summary
+2024-08-04 14:36:10,388 INFO    SenderThread:11584 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 14:36:10,388 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,388 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-08-04 14:36:10,388 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 6
+2024-08-04 14:36:10,389 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,389 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 6
+2024-08-04 14:36:10,389 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,389 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-08-04 14:36:10,391 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 14:36:10,576 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 7
+2024-08-04 14:36:10,577 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:10,577 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 7
+2024-08-04 14:36:10,577 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:10,577 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-08-04 14:36:10,699 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
+2024-08-04 14:36:10,699 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
+2024-08-04 14:36:10,699 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
+2024-08-04 14:36:11,349 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:36:12,530 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 8
+2024-08-04 14:36:12,530 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:36:12,530 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:12,531 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 8
+2024-08-04 14:36:12,531 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:12,531 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-08-04 14:36:12,531 INFO    SenderThread:11584 [job_builder.py:build():296] Attempting to build job artifact
+2024-08-04 14:36:12,532 INFO    SenderThread:11584 [job_builder.py:_get_source_type():426] is repo sourced job
+2024-08-04 14:36:12,546 INFO    SenderThread:11584 [job_builder.py:build():402] adding wandb-job metadata file
+2024-08-04 14:36:12,554 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 9
+2024-08-04 14:36:12,555 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:12,555 DEBUG   SenderThread:11584 [sender.py:send():382] send: artifact
+2024-08-04 14:36:12,555 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 9
+2024-08-04 14:36:12,700 INFO    Thread-12 :11584 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
+2024-08-04 14:36:13,350 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:36:13,435 INFO    SenderThread:11584 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
+2024-08-04 14:36:13,435 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:13,435 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-08-04 14:36:13,435 INFO    SenderThread:11584 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-08-04 14:36:13,701 INFO    SenderThread:11584 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_143607-h7fxlkpt/files
+2024-08-04 14:36:13,701 INFO    SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt requirements.txt
+2024-08-04 14:36:13,702 INFO    SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml config.yaml
+2024-08-04 14:36:13,703 INFO    SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-metadata.json wandb-metadata.json
+2024-08-04 14:36:13,703 INFO    SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json wandb-summary.json
+2024-08-04 14:36:13,705 INFO    SenderThread:11584 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log output.log
+2024-08-04 14:36:13,706 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 10
+2024-08-04 14:36:13,707 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:36:13,707 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:13,707 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 10
+2024-08-04 14:36:13,708 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:13,708 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-08-04 14:36:13,709 INFO    SenderThread:11584 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 14:36:14,120 INFO    wandb-upload_0:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/requirements.txt
+2024-08-04 14:36:14,203 INFO    wandb-upload_1:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/config.yaml
+2024-08-04 14:36:14,309 INFO    wandb-upload_3:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/output.log
+2024-08-04 14:36:14,324 INFO    wandb-upload_2:11584 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_143607-h7fxlkpt/files/wandb-summary.json
+2024-08-04 14:36:14,351 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:36:14,351 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:36:14,524 INFO    Thread-11 (_thread_body):11584 [sender.py:transition_state():617] send defer: 11
+2024-08-04 14:36:14,524 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:14,524 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 11
+2024-08-04 14:36:14,524 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:14,524 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-08-04 14:36:14,524 INFO    SenderThread:11584 [file_pusher.py:join():178] waiting for file pusher
+2024-08-04 14:36:14,525 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 12
+2024-08-04 14:36:14,525 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:14,525 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 12
+2024-08-04 14:36:14,525 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:14,525 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-08-04 14:36:14,525 INFO    SenderThread:11584 [file_stream.py:finish():595] file stream finish called
+2024-08-04 14:36:14,732 INFO    SenderThread:11584 [file_stream.py:finish():599] file stream finish is done
+2024-08-04 14:36:14,732 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 13
+2024-08-04 14:36:14,732 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:14,732 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 13
+2024-08-04 14:36:14,732 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:14,732 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-08-04 14:36:14,732 INFO    SenderThread:11584 [sender.py:transition_state():617] send defer: 14
+2024-08-04 14:36:14,732 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 14:36:14,733 DEBUG   SenderThread:11584 [sender.py:send():382] send: final
+2024-08-04 14:36:14,733 INFO    HandlerThread:11584 [handler.py:handle_request_defer():172] handle defer: 14
+2024-08-04 14:36:14,733 DEBUG   SenderThread:11584 [sender.py:send():382] send: footer
+2024-08-04 14:36:14,733 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: defer
+2024-08-04 14:36:14,733 INFO    SenderThread:11584 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-08-04 14:36:14,733 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:36:14,733 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:36:14,734 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 14:36:14,734 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: server_info
+2024-08-04 14:36:14,734 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 14:36:14,734 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: server_info
+2024-08-04 14:36:14,734 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: get_summary
+2024-08-04 14:36:14,736 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: sampled_history
+2024-08-04 14:36:14,736 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 14:36:14,736 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: job_info
+2024-08-04 14:36:14,893 DEBUG   SenderThread:11584 [sender.py:send_request():409] send_request: job_info
+2024-08-04 14:36:14,893 INFO    MainThread:11584 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+2024-08-04 14:36:14,894 INFO    MainThread:11584 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+2024-08-04 14:36:14,894 INFO    MainThread:11584 [wandb_run.py:_footer_sync_info():3825] logging synced files
+2024-08-04 14:36:14,894 DEBUG   HandlerThread:11584 [handler.py:handle_request():146] handle_request: shutdown
+2024-08-04 14:36:14,894 INFO    HandlerThread:11584 [handler.py:finish():869] shutting down handler
+2024-08-04 14:36:15,737 INFO    WriterThread:11584 [datastore.py:close():296] close: /project/wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb
+2024-08-04 14:36:15,893 INFO    SenderThread:11584 [sender.py:finish():1572] shutting down sender
+2024-08-04 14:36:15,894 INFO    SenderThread:11584 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 14:36:15,894 INFO    SenderThread:11584 [file_pusher.py:join():178] waiting for file pusher

wandb/run-20240804_143607-h7fxlkpt/logs/debug.log ADDED Viewed

	@@ -0,0 +1,30 @@

+2024-08-04 14:36:07,213 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Configure stats pid to 11513
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_143607-h7fxlkpt/logs/debug.log
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_143607-h7fxlkpt/logs/debug-internal.log
+2024-08-04 14:36:07,214 INFO    MainThread:11513 [wandb_init.py:init():566] calling init triggers
+2024-08-04 14:36:07,215 INFO    MainThread:11513 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:35:56', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
+2024-08-04 14:36:07,215 INFO    MainThread:11513 [wandb_init.py:init():616] starting backend
+2024-08-04 14:36:07,215 INFO    MainThread:11513 [wandb_init.py:init():620] setting up manager
+2024-08-04 14:36:07,219 INFO    MainThread:11513 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-04 14:36:07,220 INFO    MainThread:11513 [wandb_init.py:init():628] backend started and connected
+2024-08-04 14:36:07,225 INFO    MainThread:11513 [wandb_init.py:init():720] updated telemetry
+2024-08-04 14:36:07,236 INFO    MainThread:11513 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-04 14:36:07,701 INFO    MainThread:11513 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-04 14:36:07,784 INFO    MainThread:11513 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-08-04 14:36:07,784 INFO    MainThread:11513 [wandb_init.py:init():804] starting run threads in backend
+2024-08-04 14:36:07,841 INFO    MainThread:11513 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-04 14:36:07,842 INFO    MainThread:11513 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-04 14:36:07,842 INFO    MainThread:11513 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-04 14:36:07,842 INFO    MainThread:11513 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-04 14:36:07,843 INFO    MainThread:11513 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-04 14:36:10,261 INFO    MainThread:11513 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
+2024-08-04 14:36:10,261 INFO    MainThread:11513 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+2024-08-04 14:36:15,895 WARNING MsgRouterThr:11513 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240804_143607-h7fxlkpt/run-h7fxlkpt.wandb ADDED Viewed

Binary file (20.4 kB). View file

wandb/run-20240804_221132-o8ieoj9i/files/config.yaml ADDED Viewed

	@@ -0,0 +1,335 @@

+wandb_version: 1
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '235289369'
+  - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '235289369'
+  - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '235289369'
+  - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 4096
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: HFPreTrainedTokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/google/gemma-2-2b
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: yans-sample-gemma-2-2b_train_2024-08-04-22:11:21
+wandb_project:
+  desc: null
+  value: llm_tutorial
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/yans-sample-gemma-2-2b
+save:
+  desc: null
+  value: /work/llm_recipes/models/yans-sample-gemma-2-2b
+base_model:
+  desc: null
+  value: /share/pretrained_lm/google/gemma-2-2b
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 200
+save_interval:
+  desc: null
+  value: 200
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: anyprecision
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 20000
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 20000
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 2
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 4096
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/yans-sample-gemma-2-2b
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: false
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 256000
+gradient_accumulation_steps:
+  desc: null
+  value: 160
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1722777092.265577
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64
+activation_function:
+  desc: null
+  value: gelu_pytorch_tanh
+hidden_size:
+  desc: null
+  value: 2304
+model_type:
+  desc: null
+  value: gemma2
+max_position_embeddings:
+  desc: null
+  value: 4096
+num_attention_heads:
+  desc: null
+  value: 8
+num_hidden_layers:
+  desc: null
+  value: 26
+model_architecture:
+  desc: null
+  value: Gemma2ForCausalLM

wandb/run-20240804_221132-o8ieoj9i/files/output.log ADDED Viewed

	@@ -0,0 +1,135 @@

+Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Loading checkpoint shards:  67%|██████▋   | 2/3 [02:29<01:15, 75.36s/it]
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
+--> Model /share/pretrained_lm/google/gemma-2-2b
+--> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
+BFloat16 enabled for mixed precision - using bfSixteen policy
+--> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+    train:      6400000
+    validation: 323200
+    test:       3200
+Loading checkpoint shards: 100%|██████████| 3/3 [02:38<00:00, 52.69s/it]
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+Let split = None
+Building a BlendedDataset for a single MegatronDataset
+> finished creating GPT datasets ...
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): Gemma2ForCausalLM(
+    (model): Gemma2Model(
+      (embed_tokens): Embedding(256000, 2304, padding_idx=0)
+      (layers): ModuleList(
+        (0-25): 26 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): Gemma2DecoderLayer(
+              (self_attn): Gemma2FlashAttention2(
+                (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
+                (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
+                (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
+                (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
+                (rotary_emb): Gemma2RotaryEmbedding()
+              )
+              (mlp): Gemma2MLP(
+                (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
+                (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
+                (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
+                (act_fn): PytorchGELUTanh()
+              )
+              (input_layernorm): Gemma2RMSNorm()
+              (post_attention_layernorm): Gemma2RMSNorm()
+              (pre_feedforward_layernorm): Gemma2RMSNorm()
+              (post_feedforward_layernorm): Gemma2RMSNorm()
+            )
+          )
+        )
+      )
+      (norm): Gemma2RMSNorm()
+    )
+    (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
+  )
+)
+model config: Gemma2Config {
+  "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
+  "architectures": [
+    "Gemma2ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": 50.0,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "eos_token_id": 1,
+  "final_logit_softcapping": 30.0,
+  "head_dim": 256,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2304,
+  "initializer_range": 0.02,
+  "intermediate_size": 9216,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 4096,
+  "model_type": "gemma2",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 26,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "torch_dtype": "float32",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "vocab_size": 256000
+}
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 281, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
+    loss: torch.Tensor = model(**batch).loss
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
+    output = self._fsdp_wrapped_module(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/project/lib/transformers/src/transformers/models/gemma2/modeling_gemma2.py", line 976, in forward
+    loss = loss_fct(shift_logits, shift_labels)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py", line 1179, in forward
+    return F.cross_entropy(input, target, weight=self.weight,
+  File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 3086, in cross_entropy
+    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.81 GiB. GPU 0 has a total capacity of 39.39 GiB of which 7.81 GiB is free. Including non-PyTorch memory, this process has 31.58 GiB memory in use. Of the allocated memory 30.38 GiB is allocated by PyTorch, and 385.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,271 @@

+absl-py==2.1.0
+accelerate==0.33.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+apex==0.1
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blis==0.7.11
+cachetools==5.3.2
+catalogue==2.0.10
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+cloudpickle==3.0.0
+cmake==3.28.1
+colorama==0.4.6
+comm==0.2.1
+confection==0.1.4
+contourpy==1.2.0
+cubinlinker==0.3.0+2.g405ac64
+cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+cudf==23.12.0
+cugraph-dgl==23.12.0
+cugraph-service-client==23.12.0
+cugraph-service-server==23.12.0
+cugraph==23.12.0
+cuml==23.12.0
+cupy-cuda12x==12.3.0
+cycler==0.12.1
+cymem==2.0.8
+cython==3.0.8
+dask-cuda==23.12.0
+dask-cudf==23.12.0
+dask==2023.11.0
+debugpy==1.8.1
+decorator==5.1.1
+defusedxml==0.7.1
+distributed==2023.11.0
+dm-tree==0.1.8
+docker-pycreds==0.4.0
+einops==0.7.0
+exceptiongroup==1.2.0
+execnet==2.0.2
+executing==2.0.1
+expecttest==0.1.3
+fastjsonschema==2.19.1
+fastrlock==0.8.2
+filelock==3.13.1
+flash-attn==2.4.2
+fonttools==4.48.1
+frozenlist==1.4.1
+fsspec==2023.12.2
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.43
+google-auth-oauthlib==0.4.6
+google-auth==2.27.0
+graphsurgeon==0.4.6
+grpcio==1.60.1
+huggingface-hub==0.24.5
+hypothesis==5.35.1
+idna==3.6
+importlib-metadata==7.0.1
+iniconfig==2.0.0
+intel-openmp==2021.4.0
+ipadic==1.0.0
+ipykernel==6.29.2
+ipython-genutils==0.2.0
+ipython==8.21.0
+jedi==0.19.1
+jinja2==3.1.3
+joblib==1.3.2
+json5==0.9.14
+jsonnet==0.19.1
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-tensorboard==0.2.0
+jupyterlab-pygments==0.3.0
+jupyterlab-server==1.2.0
+jupyterlab==2.3.2
+jupytext==1.16.1
+kiwisolver==1.4.5
+langcodes==3.3.0
+lazy-loader==0.3
+librosa==0.10.1
+llvmlite==0.40.1
+locket==1.0.0
+logzero==1.7.0
+lxml==5.2.2
+markdown-it-py==3.0.0
+markdown==3.5.2
+markupsafe==2.1.4
+matplotlib-inline==0.1.6
+matplotlib==3.8.2
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mecab-python3==1.0.6
+mistune==3.0.2
+mkl-devel==2021.1.1
+mkl-include==2021.1.1
+mkl==2021.1.1
+mock==5.1.0
+more-itertools==9.1.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+murmurhash==1.0.10
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==2.6.3
+ninja==1.11.1.1
+nltk==3.8.1
+notebook==6.4.10
+numba==0.57.1+1.g1ff679645
+numpy==1.24.4
+nvfuser==0.1.4a0+d0bb811
+nvidia-dali-cuda120==1.34.0
+nvidia-pyindex==1.0.9
+nvtx==0.2.5
+oauthlib==3.2.2
+onnx==1.15.0rc2
+opencv==4.7.0
+optree==0.10.0
+packaging==23.2
+pandas==1.5.3
+pandocfilters==1.5.1
+parso==0.8.3
+partd==1.4.1
+peft==0.11.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.2.0
+pluggy==1.4.0
+ply==3.11
+polygraphy==0.49.4
+pooch==1.8.0
+portalocker==2.10.1
+preshed==3.0.9
+prettytable==3.9.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.24.4
+psutil==5.9.4
+ptxcompiler==0.8.1+2.g0d406d6
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==14.0.1.dev0+gba5374836.d20240125
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pybind11-global==2.11.1
+pybind11==2.11.1
+pycocotools==2.0+nv0.8.0
+pycparser==2.21
+pydantic-core==2.16.2
+pydantic==2.6.1
+pygments==2.17.2
+pylibcugraph==23.12.0
+pylibcugraphops==23.12.0
+pylibraft==23.12.0
+pynvml==11.4.1
+pyparsing==3.1.1
+pytest-flakefinder==1.1.0
+pytest-rerunfailures==13.0
+pytest-shard==0.1.2
+pytest-xdist==3.5.0
+pytest==8.0.0
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-hostlist==1.23.0
+pytorch-quantization==2.1.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+raft-dask==23.12.0
+rapids-dask-dependency==23.12.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+rich==13.7.0
+rmm==23.12.0
+rpds-py==0.17.1
+rsa==4.9
+sacrebleu==2.4.0
+safetensors==0.4.3
+scikit-learn==1.2.0
+scipy==1.12.0
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==2.12.0
+setproctitle==1.3.3
+setuptools==68.2.2
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sortedcontainers==2.4.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+spacy==3.7.2
+sphinx-glpi-theme==0.6
+srsly==2.4.8
+stack-data==0.6.3
+sympy==1.12
+tabulate==0.9.0
+tbb==2021.11.0
+tblib==3.0.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorboard==2.9.0
+tensorrt==8.6.3
+terminado==0.18.0
+termplotlib==0.3.9
+thinc==8.2.3
+threadpoolctl==3.2.0
+thriftpy2==0.4.17
+tinycss2==1.2.1
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+torch-tensorrt==2.3.0a0
+torch==2.3.0a0+ebedce2
+torchdata==0.7.1a0
+torchtext==0.17.0a0
+torchvision==0.18.0a0
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.9.0
+transformer-engine==1.3.0+5b90b7f
+transformers==4.43.3
+treelite-runtime==3.9.1
+treelite==3.9.1
+triton==2.2.0+e28a256
+typer==0.9.0
+types-dataclasses==0.6.6
+typing-extensions==4.9.0
+ucx-py==0.35.0
+uff==0.6.9
+ujson==5.8.0
+urllib3==1.26.18
+wandb==0.16.3
+wasabi==1.1.2
+wcwidth==0.2.13
+weasel==0.3.4
+webencodings==0.5.1
+werkzeug==3.0.1
+wheel==0.42.0
+xdoctest==1.0.2
+xgboost==1.7.6
+yarl==1.9.4
+zict==3.0.0
+zipp==3.17.0

wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+    "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+    "python": "3.10.12",
+    "heartbeatAt": "2024-08-04T13:11:32.902217",
+    "startedAt": "2024-08-04T13:11:32.253120",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "--seq-length",
+        "4096",
+        "--sliding-window-size",
+        "4096",
+        "--micro-batch-size",
+        "2",
+        "--global-batch-size",
+        "320",
+        "--train-iters",
+        "20000",
+        "--tokenizer-type",
+        "HFPreTrainedTokenizer",
+        "--tokenizer-model",
+        "/share/pretrained_lm/google/gemma-2-2b",
+        "--train-data-path",
+        "235289369",
+        "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
+        "--valid-data-path",
+        "235289369",
+        "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
+        "--test-data-path",
+        "235289369",
+        "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
+        "--lr",
+        "2e-5",
+        "--min-lr",
+        "1e-6",
+        "--lr-decay-style",
+        "cosine",
+        "--lr-warmup-iters",
+        "500",
+        "--lr-decay-iters",
+        "20000",
+        "--weight-decay",
+        "0.1",
+        "--grad-clip-norm",
+        "1.0",
+        "--optimizer",
+        "anyprecision",
+        "--adam-beta1",
+        "0.9",
+        "--adam-beta2",
+        "0.95",
+        "--adam-eps",
+        "1e-6",
+        "--save-interval",
+        "200",
+        "--eval-interval",
+        "200",
+        "--eval-iters",
+        "10",
+        "--bf16",
+        "--mixed-precision",
+        "--base-model",
+        "/share/pretrained_lm/google/gemma-2-2b",
+        "--save",
+        "/work/llm_recipes/models/yans-sample-gemma-2-2b",
+        "--load",
+        "/work/llm_recipes/models/yans-sample-gemma-2-2b",
+        "--fsdp-activation-checkpointing",
+        "--sharding-strategy",
+        "FULL_SHARD",
+        "--checkpoint-type",
+        "LOCAL_STATE_DICT",
+        "--save-n-checkpoints",
+        "10",
+        "--hf-upload-retry-limit",
+        "2",
+        "--hf-repo-id",
+        "koichi12/yans-sample-gemma-2-2b",
+        "--wandb-entity",
+        "iwakawa-koichi-q5-tohoku-nlp6723",
+        "--wandb-project",
+        "llm_tutorial",
+        "--wandb-name",
+        "yans-sample-gemma-2-2b_train_2024-08-04-22:11:21"
+    ],
+    "state": "running",
+    "program": "/project/examples/finetuning.py",
+    "codePathLocal": "examples/finetuning.py",
+    "codePath": "examples/finetuning.py",
+    "git": {
+        "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+        "commit": "0336bd6c20fe25d78eda1d14afa66c1ae2e6d687"
+    },
+    "email": null,
+    "root": "/project",
+    "host": "gpu-koiwa-00",
+    "username": "koiwa",
+    "executable": "/usr/bin/python",
+    "cpu_count": 18,
+    "cpu_count_logical": 18,
+    "cpu_freq": {
+        "current": 2400.044999999999,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.045,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 0.0625,
+            "used": 1.1444091796875e-05
+        }
+    },
+    "gpu": "NVIDIA A100-SXM4-40GB",
+    "gpu_count": 1,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA A100-SXM4-40GB",
+            "memory_total": 42949672960
+        }
+    ],
+    "memory": {
+        "total": 56.48782730102539
+    }
+}

wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 166}}

wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,263 @@

+2024-08-04 22:11:32,267 INFO    StreamThr :12237 [internal.py:wandb_internal():86] W&B internal server running at pid: 12237, started at: 2024-08-04 22:11:32.266168
+2024-08-04 22:11:32,268 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status
+2024-08-04 22:11:32,270 INFO    WriterThread:12237 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
+2024-08-04 22:11:32,271 DEBUG   SenderThread:12237 [sender.py:send():382] send: header
+2024-08-04 22:11:32,285 DEBUG   SenderThread:12237 [sender.py:send():382] send: run
+2024-08-04 22:11:32,779 INFO    SenderThread:12237 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_221132-o8ieoj9i/files
+2024-08-04 22:11:32,779 INFO    SenderThread:12237 [sender.py:_start_run_threads():1136] run started: o8ieoj9i with start time 1722777092.265577
+2024-08-04 22:11:32,784 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: check_version
+2024-08-04 22:11:32,784 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: check_version
+2024-08-04 22:11:32,884 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: run_start
+2024-08-04 22:11:32,890 DEBUG   HandlerThread:12237 [system_info.py:__init__():27] System info init
+2024-08-04 22:11:32,890 DEBUG   HandlerThread:12237 [system_info.py:__init__():42] System info init done
+2024-08-04 22:11:32,890 INFO    HandlerThread:12237 [system_monitor.py:start():194] Starting system monitor
+2024-08-04 22:11:32,890 INFO    SystemMonitor:12237 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-08-04 22:11:32,890 INFO    HandlerThread:12237 [system_monitor.py:probe():214] Collecting system info
+2024-08-04 22:11:32,891 INFO    SystemMonitor:12237 [interfaces.py:start():190] Started cpu monitoring
+2024-08-04 22:11:32,891 INFO    SystemMonitor:12237 [interfaces.py:start():190] Started disk monitoring
+2024-08-04 22:11:32,892 INFO    SystemMonitor:12237 [interfaces.py:start():190] Started gpu monitoring
+2024-08-04 22:11:32,893 INFO    SystemMonitor:12237 [interfaces.py:start():190] Started memory monitoring
+2024-08-04 22:11:32,893 INFO    SystemMonitor:12237 [interfaces.py:start():190] Started network monitoring
+2024-08-04 22:11:32,902 DEBUG   HandlerThread:12237 [system_info.py:probe():151] Probing system
+2024-08-04 22:11:32,904 DEBUG   HandlerThread:12237 [system_info.py:_probe_git():136] Probing git
+2024-08-04 22:11:32,916 DEBUG   HandlerThread:12237 [system_info.py:_probe_git():144] Probing git done
+2024-08-04 22:11:32,916 DEBUG   HandlerThread:12237 [system_info.py:probe():199] Probing system done
+2024-08-04 22:11:32,916 DEBUG   HandlerThread:12237 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T13:11:32.902217', 'startedAt': '2024-08-04T13:11:32.253120', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '2', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-04-22:11:21'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '0336bd6c20fe25d78eda1d14afa66c1ae2e6d687'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.044999999999, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}, {'current': 2400.045, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
+2024-08-04 22:11:32,916 INFO    HandlerThread:12237 [system_monitor.py:probe():224] Finished collecting system info
+2024-08-04 22:11:32,916 INFO    HandlerThread:12237 [system_monitor.py:probe():227] Publishing system info
+2024-08-04 22:11:32,917 INFO    HandlerThread:12237 [system_monitor.py:probe():229] Finished publishing system info
+2024-08-04 22:11:32,923 DEBUG   SenderThread:12237 [sender.py:send():382] send: files
+2024-08-04 22:11:32,923 INFO    SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+2024-08-04 22:11:32,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: python_packages
+2024-08-04 22:11:32,933 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:11:32,933 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:11:32,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: python_packages
+2024-08-04 22:11:32,935 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:11:33,202 DEBUG   SenderThread:12237 [sender.py:send():382] send: telemetry
+2024-08-04 22:11:33,617 INFO    wandb-upload_0:12237 [upload_job.py:push():131] Uploaded file /tmp/tmpntsoky67wandb/ybme98wl-wandb-metadata.json
+2024-08-04 22:11:33,780 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
+2024-08-04 22:11:33,781 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json
+2024-08-04 22:11:33,781 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:11:35,781 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:11:37,800 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:11:42,801 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:11:47,802 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:11:47,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:11:47,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:11:47,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:11:53,184 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:11:58,184 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:02,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:12:02,932 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:12:02,972 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:12:04,128 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:04,797 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
+2024-08-04 22:12:09,335 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:14,336 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:17,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:12:17,932 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:12:17,972 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:12:20,198 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:25,199 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:30,199 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:32,894 DEBUG   SystemMonitor:12237 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-08-04 22:12:32,895 DEBUG   SenderThread:12237 [sender.py:send():382] send: stats
+2024-08-04 22:12:32,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:12:32,932 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:12:32,972 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:12:36,110 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:41,111 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:45,820 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:12:46,558 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:47,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:12:47,933 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:12:47,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:12:52,156 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:12:57,157 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:02,157 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:02,897 DEBUG   SenderThread:12237 [sender.py:send():382] send: stats
+2024-08-04 22:13:02,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:13:02,932 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:13:02,972 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:13:08,124 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:13,125 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:17,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:13:17,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:13:17,976 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:13:18,132 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:23,133 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:28,134 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:32,898 DEBUG   SenderThread:12237 [sender.py:send():382] send: stats
+2024-08-04 22:13:32,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:13:32,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:13:32,976 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:13:33,205 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:38,206 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:43,207 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:47,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:13:47,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:13:47,976 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:13:49,120 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:54,121 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:13:59,122 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:14:02,898 DEBUG   SenderThread:12237 [sender.py:send():382] send: stats
+2024-08-04 22:14:02,932 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:14:02,933 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:14:02,976 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:14:04,197 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:14:04,864 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:14:09,198 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:14:13,453 DEBUG   SenderThread:12237 [sender.py:send():382] send: config
+2024-08-04 22:14:13,453 DEBUG   SenderThread:12237 [sender.py:send():382] send: config
+2024-08-04 22:14:13,869 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:14:14,550 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:14:15,870 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:14:17,933 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-04 22:14:17,934 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:14:17,934 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: stop_status
+2024-08-04 22:14:19,437 DEBUG   SenderThread:12237 [sender.py:send():382] send: exit
+2024-08-04 22:14:19,437 INFO    SenderThread:12237 [sender.py:send_exit():589] handling exit code: 1
+2024-08-04 22:14:19,437 INFO    SenderThread:12237 [sender.py:send_exit():591] handling runtime: 166
+2024-08-04 22:14:19,438 INFO    SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 22:14:19,439 INFO    SenderThread:12237 [sender.py:send_exit():597] send defer
+2024-08-04 22:14:19,439 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,439 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 0
+2024-08-04 22:14:19,439 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,439 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-08-04 22:14:19,439 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 1
+2024-08-04 22:14:19,439 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,439 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 1
+2024-08-04 22:14:19,439 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,439 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-08-04 22:14:19,439 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 2
+2024-08-04 22:14:19,439 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,440 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 2
+2024-08-04 22:14:19,440 INFO    HandlerThread:12237 [system_monitor.py:finish():203] Stopping system monitor
+2024-08-04 22:14:19,440 DEBUG   SystemMonitor:12237 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-08-04 22:14:19,440 DEBUG   SystemMonitor:12237 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-08-04 22:14:19,440 INFO    HandlerThread:12237 [interfaces.py:finish():202] Joined cpu monitor
+2024-08-04 22:14:19,441 INFO    HandlerThread:12237 [interfaces.py:finish():202] Joined disk monitor
+2024-08-04 22:14:19,474 INFO    HandlerThread:12237 [interfaces.py:finish():202] Joined gpu monitor
+2024-08-04 22:14:19,474 INFO    HandlerThread:12237 [interfaces.py:finish():202] Joined memory monitor
+2024-08-04 22:14:19,474 INFO    HandlerThread:12237 [interfaces.py:finish():202] Joined network monitor
+2024-08-04 22:14:19,475 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,475 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-08-04 22:14:19,475 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 3
+2024-08-04 22:14:19,475 DEBUG   SenderThread:12237 [sender.py:send():382] send: stats
+2024-08-04 22:14:19,475 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,475 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 3
+2024-08-04 22:14:19,475 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,475 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-08-04 22:14:19,475 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 4
+2024-08-04 22:14:19,475 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,475 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 4
+2024-08-04 22:14:19,476 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,476 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-08-04 22:14:19,476 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 5
+2024-08-04 22:14:19,476 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,476 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 5
+2024-08-04 22:14:19,476 DEBUG   SenderThread:12237 [sender.py:send():382] send: summary
+2024-08-04 22:14:19,477 INFO    SenderThread:12237 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-04 22:14:19,477 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,477 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-08-04 22:14:19,477 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 6
+2024-08-04 22:14:19,477 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,477 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 6
+2024-08-04 22:14:19,477 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,477 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-08-04 22:14:19,480 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:14:19,712 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 7
+2024-08-04 22:14:19,712 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:19,712 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 7
+2024-08-04 22:14:19,712 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:19,712 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-08-04 22:14:19,873 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
+2024-08-04 22:14:19,874 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
+2024-08-04 22:14:20,437 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 22:14:20,874 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:14:21,905 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 8
+2024-08-04 22:14:21,905 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 22:14:21,905 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:21,906 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 8
+2024-08-04 22:14:21,906 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:21,906 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-08-04 22:14:21,906 INFO    SenderThread:12237 [job_builder.py:build():296] Attempting to build job artifact
+2024-08-04 22:14:21,907 INFO    SenderThread:12237 [job_builder.py:_get_source_type():426] is repo sourced job
+2024-08-04 22:14:21,921 INFO    SenderThread:12237 [job_builder.py:build():402] adding wandb-job metadata file
+2024-08-04 22:14:21,929 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 9
+2024-08-04 22:14:21,929 DEBUG   SenderThread:12237 [sender.py:send():382] send: artifact
+2024-08-04 22:14:21,929 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:21,931 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 9
+2024-08-04 22:14:22,437 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 22:14:22,875 INFO    Thread-12 :12237 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:14:23,127 INFO    wandb-upload_0:12237 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpaydno9il
+2024-08-04 22:14:23,543 INFO    wandb-upload_1:12237 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpaetcwljm
+2024-08-04 22:14:24,702 INFO    SenderThread:12237 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5ODUzNDkwNw==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
+2024-08-04 22:14:24,702 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:24,702 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-08-04 22:14:24,702 INFO    SenderThread:12237 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-08-04 22:14:24,876 INFO    SenderThread:12237 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_221132-o8ieoj9i/files
+2024-08-04 22:14:24,876 INFO    SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt requirements.txt
+2024-08-04 22:14:24,876 INFO    SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml config.yaml
+2024-08-04 22:14:24,878 INFO    SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-metadata.json wandb-metadata.json
+2024-08-04 22:14:24,878 INFO    SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json wandb-summary.json
+2024-08-04 22:14:24,879 INFO    SenderThread:12237 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log output.log
+2024-08-04 22:14:24,881 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 10
+2024-08-04 22:14:24,881 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 22:14:24,881 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:24,882 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 10
+2024-08-04 22:14:24,882 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: status_report
+2024-08-04 22:14:24,883 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:24,883 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-08-04 22:14:24,883 INFO    SenderThread:12237 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 22:14:25,282 INFO    wandb-upload_0:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/requirements.txt
+2024-08-04 22:14:25,375 INFO    wandb-upload_1:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/config.yaml
+2024-08-04 22:14:25,438 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 22:14:25,438 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 22:14:25,461 INFO    wandb-upload_2:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/wandb-summary.json
+2024-08-04 22:14:25,480 INFO    wandb-upload_3:12237 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_221132-o8ieoj9i/files/output.log
+2024-08-04 22:14:25,680 INFO    Thread-11 (_thread_body):12237 [sender.py:transition_state():617] send defer: 11
+2024-08-04 22:14:25,681 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:25,681 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 11
+2024-08-04 22:14:25,681 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:25,681 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-08-04 22:14:25,681 INFO    SenderThread:12237 [file_pusher.py:join():178] waiting for file pusher
+2024-08-04 22:14:25,681 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 12
+2024-08-04 22:14:25,681 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:25,681 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 12
+2024-08-04 22:14:25,681 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:25,681 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-08-04 22:14:25,681 INFO    SenderThread:12237 [file_stream.py:finish():595] file stream finish called
+2024-08-04 22:14:25,848 INFO    SenderThread:12237 [file_stream.py:finish():599] file stream finish is done
+2024-08-04 22:14:25,848 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 13
+2024-08-04 22:14:25,849 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:25,849 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 13
+2024-08-04 22:14:25,849 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:25,849 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-08-04 22:14:25,849 INFO    SenderThread:12237 [sender.py:transition_state():617] send defer: 14
+2024-08-04 22:14:25,849 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: defer
+2024-08-04 22:14:25,849 DEBUG   SenderThread:12237 [sender.py:send():382] send: final
+2024-08-04 22:14:25,849 INFO    HandlerThread:12237 [handler.py:handle_request_defer():172] handle defer: 14
+2024-08-04 22:14:25,849 DEBUG   SenderThread:12237 [sender.py:send():382] send: footer
+2024-08-04 22:14:25,850 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: defer
+2024-08-04 22:14:25,850 INFO    SenderThread:12237 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-08-04 22:14:25,850 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 22:14:25,850 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-04 22:14:25,850 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 22:14:25,851 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: poll_exit
+2024-08-04 22:14:25,851 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: server_info
+2024-08-04 22:14:25,851 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: get_summary
+2024-08-04 22:14:25,851 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: sampled_history
+2024-08-04 22:14:25,852 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: server_info
+2024-08-04 22:14:25,852 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-04 22:14:25,853 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: job_info
+2024-08-04 22:14:26,030 DEBUG   SenderThread:12237 [sender.py:send_request():409] send_request: job_info
+2024-08-04 22:14:26,030 INFO    MainThread:12237 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+2024-08-04 22:14:26,030 INFO    MainThread:12237 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+2024-08-04 22:14:26,030 INFO    MainThread:12237 [wandb_run.py:_footer_sync_info():3825] logging synced files
+2024-08-04 22:14:26,031 DEBUG   HandlerThread:12237 [handler.py:handle_request():146] handle_request: shutdown
+2024-08-04 22:14:26,031 INFO    HandlerThread:12237 [handler.py:finish():869] shutting down handler
+2024-08-04 22:14:26,853 INFO    WriterThread:12237 [datastore.py:close():296] close: /project/wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb
+2024-08-04 22:14:27,030 INFO    SenderThread:12237 [sender.py:finish():1572] shutting down sender
+2024-08-04 22:14:27,030 INFO    SenderThread:12237 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-04 22:14:27,030 INFO    SenderThread:12237 [file_pusher.py:join():178] waiting for file pusher

wandb/run-20240804_221132-o8ieoj9i/logs/debug.log ADDED Viewed

	@@ -0,0 +1,30 @@

+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Configure stats pid to 12166
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_221132-o8ieoj9i/logs/debug.log
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_221132-o8ieoj9i/logs/debug-internal.log
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_init.py:init():566] calling init triggers
+2024-08-04 22:11:32,259 INFO    MainThread:12166 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-04-22:11:21', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 2, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 160}
+2024-08-04 22:11:32,260 INFO    MainThread:12166 [wandb_init.py:init():616] starting backend
+2024-08-04 22:11:32,260 INFO    MainThread:12166 [wandb_init.py:init():620] setting up manager
+2024-08-04 22:11:32,264 INFO    MainThread:12166 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-04 22:11:32,265 INFO    MainThread:12166 [wandb_init.py:init():628] backend started and connected
+2024-08-04 22:11:32,270 INFO    MainThread:12166 [wandb_init.py:init():720] updated telemetry
+2024-08-04 22:11:32,281 INFO    MainThread:12166 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-04 22:11:32,783 INFO    MainThread:12166 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-04 22:11:32,877 INFO    MainThread:12166 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-08-04 22:11:32,877 INFO    MainThread:12166 [wandb_init.py:init():804] starting run threads in backend
+2024-08-04 22:11:32,932 INFO    MainThread:12166 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-04 22:11:32,932 INFO    MainThread:12166 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-04 22:11:32,932 INFO    MainThread:12166 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-04 22:11:32,932 INFO    MainThread:12166 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-04 22:11:32,933 INFO    MainThread:12166 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-04 22:14:13,452 INFO    MainThread:12166 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26, 'model_architecture': 'Gemma2ForCausalLM'}
+2024-08-04 22:14:13,453 INFO    MainThread:12166 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+2024-08-04 22:14:27,031 WARNING MsgRouterThr:12166 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240804_221132-o8ieoj9i/run-o8ieoj9i.wandb ADDED Viewed

Binary file (27.3 kB). View file

wandb/run-20240812_052853-n84i0o06/files/config.yaml ADDED Viewed

	@@ -0,0 +1,335 @@

+wandb_version: 1
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '304771887'
+  - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '304771887'
+  - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '304771887'
+  - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 4096
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: HFPreTrainedTokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: yans-qwen2-0.5B_train_2024-08-12-05:28:42
+wandb_project:
+  desc: null
+  value: llm_tutorial
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/yans-qwen2-0.5B
+save:
+  desc: null
+  value: /work/llm_recipes/models/yans-qwen2-0.5B
+base_model:
+  desc: null
+  value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 200
+save_interval:
+  desc: null
+  value: 5
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: adam
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 20000
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 20000
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 1
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 4096
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/yans-qwen2-0.5B
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: false
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 151680
+gradient_accumulation_steps:
+  desc: null
+  value: 320
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1723408133.524123
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64
+model_architecture:
+  desc: null
+  value: Qwen2ForCausalLM
+activation_function:
+  desc: null
+  value: silu
+hidden_size:
+  desc: null
+  value: 896
+model_type:
+  desc: null
+  value: qwen2
+max_position_embeddings:
+  desc: null
+  value: 4096
+num_attention_heads:
+  desc: null
+  value: 14
+num_hidden_layers:
+  desc: null
+  value: 24

wandb/run-20240812_052853-n84i0o06/files/output.log ADDED Viewed

	@@ -0,0 +1,139 @@

+Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping model loading
+--> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
+--> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
+BFloat16 enabled for mixed precision - using bfSixteen policy
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+--> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+    train:      6400000
+    validation: 323200
+    test:       3200
+> building train, validation, and test datasets for GPT ...
+Let split = None
+> finished creating GPT datasets ...
+File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/yans-qwen2-0.5B, skipping optimizer loading
+File not found: /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-qwen2-0.5B/latest_iteration.txt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151936, 896)
+      (layers): ModuleList(
+        (0-23): 24 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): Qwen2DecoderLayer(
+              (self_attn): Qwen2FlashAttention2(
+                (q_proj): Linear(in_features=896, out_features=896, bias=True)
+                (k_proj): Linear(in_features=896, out_features=128, bias=True)
+                (v_proj): Linear(in_features=896, out_features=128, bias=True)
+                (o_proj): Linear(in_features=896, out_features=896, bias=False)
+                (rotary_emb): Qwen2RotaryEmbedding()
+              )
+              (mlp): Qwen2MLP(
+                (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
+                (up_proj): Linear(in_features=896, out_features=4864, bias=False)
+                (down_proj): Linear(in_features=4864, out_features=896, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): Qwen2RMSNorm()
+              (post_attention_layernorm): Qwen2RMSNorm()
+            )
+          )
+        )
+      )
+      (norm): Qwen2RMSNorm()
+    )
+    (lm_head): Linear(in_features=896, out_features=151936, bias=False)
+  )
+)
+model config: Qwen2Config {
+  "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 4096,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+------------------------------------------------------------------
+iteration: 1 , TFLOPS: 67.05501421617748, Tokens per sec: 16676.24515769431, Loss: 4.1814446449279785
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 2 , TFLOPS: 70.71126656778048, Tokens per sec: 17585.5367488818, Loss: 4.19144344329834
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 3 , TFLOPS: 70.545913767934, Tokens per sec: 17544.41433827636, Loss: 4.197675704956055
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 4 , TFLOPS: 70.68479486678217, Tokens per sec: 17578.953369834773, Loss: 4.183629989624023
+------------------------------------------------------------------
+------------------------------------------------------------------
+iteration: 5 , TFLOPS: 70.61673302016509, Tokens per sec: 17562.0267305172, Loss: 4.198177337646484
+------------------------------------------------------------------
+Saving checkpoint to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005
+Saving model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+  warnings.warn(
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+  warnings.warn(
+Saved model state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/model.pt
+Saving optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
+[rank0]:[2024-08-12 05:35:23,399] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling:  defaultdict(<class 'float'>, {'preprocessing': 0.00647389400000975, 'preprocessing_with_comm': 0.0007460029999037943, 'state_converting': 0.9694889820000299, <Type.ALL: 'all'>: 0.9780955020000874})
+Saved optimizer state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/optimizer.pt
+Saving scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
+Saved scheduler state dict to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/scheduler.pt
+Saving RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
+Saved RNG states to /work/llm_recipes/models/yans-qwen2-0.5B/iter_0000005/rng.pt
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 281, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 175, in train
+    save_checkpoint(
+  File "/project/src/llama_recipes/utils/checkpoint.py", line 168, in save_checkpoint
+    tokenizer.tokenizer.save_pretrained(tokenizer_path)
+  File "/project/lib/transformers/src/transformers/tokenization_utils_base.py", line 2622, in save_pretrained
+    if os.path.isfile(save_directory):
+  File "/usr/lib/python3.10/genericpath.py", line 30, in isfile
+    st = os.stat(path)
+TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

wandb/run-20240812_052853-n84i0o06/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,271 @@

+absl-py==2.1.0
+accelerate==0.33.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+apex==0.1
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blis==0.7.11
+cachetools==5.3.2
+catalogue==2.0.10
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+cloudpickle==3.0.0
+cmake==3.28.1
+colorama==0.4.6
+comm==0.2.1
+confection==0.1.4
+contourpy==1.2.0
+cubinlinker==0.3.0+2.g405ac64
+cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+cudf==23.12.0
+cugraph-dgl==23.12.0
+cugraph-service-client==23.12.0
+cugraph-service-server==23.12.0
+cugraph==23.12.0
+cuml==23.12.0
+cupy-cuda12x==12.3.0
+cycler==0.12.1
+cymem==2.0.8
+cython==3.0.8
+dask-cuda==23.12.0
+dask-cudf==23.12.0
+dask==2023.11.0
+debugpy==1.8.1
+decorator==5.1.1
+defusedxml==0.7.1
+distributed==2023.11.0
+dm-tree==0.1.8
+docker-pycreds==0.4.0
+einops==0.7.0
+exceptiongroup==1.2.0
+execnet==2.0.2
+executing==2.0.1
+expecttest==0.1.3
+fastjsonschema==2.19.1
+fastrlock==0.8.2
+filelock==3.13.1
+flash-attn==2.4.2
+fonttools==4.48.1
+frozenlist==1.4.1
+fsspec==2023.12.2
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.43
+google-auth-oauthlib==0.4.6
+google-auth==2.27.0
+graphsurgeon==0.4.6
+grpcio==1.60.1
+huggingface-hub==0.24.5
+hypothesis==5.35.1
+idna==3.6
+importlib-metadata==7.0.1
+iniconfig==2.0.0
+intel-openmp==2021.4.0
+ipadic==1.0.0
+ipykernel==6.29.2
+ipython-genutils==0.2.0
+ipython==8.21.0
+jedi==0.19.1
+jinja2==3.1.3
+joblib==1.3.2
+json5==0.9.14
+jsonnet==0.19.1
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-tensorboard==0.2.0
+jupyterlab-pygments==0.3.0
+jupyterlab-server==1.2.0
+jupyterlab==2.3.2
+jupytext==1.16.1
+kiwisolver==1.4.5
+langcodes==3.3.0
+lazy-loader==0.3
+librosa==0.10.1
+llvmlite==0.40.1
+locket==1.0.0
+logzero==1.7.0
+lxml==5.2.2
+markdown-it-py==3.0.0
+markdown==3.5.2
+markupsafe==2.1.4
+matplotlib-inline==0.1.6
+matplotlib==3.8.2
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mecab-python3==1.0.6
+mistune==3.0.2
+mkl-devel==2021.1.1
+mkl-include==2021.1.1
+mkl==2021.1.1
+mock==5.1.0
+more-itertools==9.1.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+murmurhash==1.0.10
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==2.6.3
+ninja==1.11.1.1
+nltk==3.8.1
+notebook==6.4.10
+numba==0.57.1+1.g1ff679645
+numpy==1.24.4
+nvfuser==0.1.4a0+d0bb811
+nvidia-dali-cuda120==1.34.0
+nvidia-pyindex==1.0.9
+nvtx==0.2.5
+oauthlib==3.2.2
+onnx==1.15.0rc2
+opencv==4.7.0
+optree==0.10.0
+packaging==23.2
+pandas==1.5.3
+pandocfilters==1.5.1
+parso==0.8.3
+partd==1.4.1
+peft==0.11.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.2.0
+pluggy==1.4.0
+ply==3.11
+polygraphy==0.49.4
+pooch==1.8.0
+portalocker==2.10.1
+preshed==3.0.9
+prettytable==3.9.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.24.4
+psutil==5.9.4
+ptxcompiler==0.8.1+2.g0d406d6
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==14.0.1.dev0+gba5374836.d20240125
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pybind11-global==2.11.1
+pybind11==2.11.1
+pycocotools==2.0+nv0.8.0
+pycparser==2.21
+pydantic-core==2.16.2
+pydantic==2.6.1
+pygments==2.17.2
+pylibcugraph==23.12.0
+pylibcugraphops==23.12.0
+pylibraft==23.12.0
+pynvml==11.4.1
+pyparsing==3.1.1
+pytest-flakefinder==1.1.0
+pytest-rerunfailures==13.0
+pytest-shard==0.1.2
+pytest-xdist==3.5.0
+pytest==8.0.0
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-hostlist==1.23.0
+pytorch-quantization==2.1.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+raft-dask==23.12.0
+rapids-dask-dependency==23.12.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+rich==13.7.0
+rmm==23.12.0
+rpds-py==0.17.1
+rsa==4.9
+sacrebleu==2.4.0
+safetensors==0.4.3
+scikit-learn==1.2.0
+scipy==1.12.0
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==2.12.0
+setproctitle==1.3.3
+setuptools==68.2.2
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sortedcontainers==2.4.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+spacy==3.7.2
+sphinx-glpi-theme==0.6
+srsly==2.4.8
+stack-data==0.6.3
+sympy==1.12
+tabulate==0.9.0
+tbb==2021.11.0
+tblib==3.0.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorboard==2.9.0
+tensorrt==8.6.3
+terminado==0.18.0
+termplotlib==0.3.9
+thinc==8.2.3
+threadpoolctl==3.2.0
+thriftpy2==0.4.17
+tinycss2==1.2.1
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+torch-tensorrt==2.3.0a0
+torch==2.3.0a0+ebedce2
+torchdata==0.7.1a0
+torchtext==0.17.0a0
+torchvision==0.18.0a0
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.9.0
+transformer-engine==1.3.0+5b90b7f
+transformers==4.43.3
+treelite-runtime==3.9.1
+treelite==3.9.1
+triton==2.2.0+e28a256
+typer==0.9.0
+types-dataclasses==0.6.6
+typing-extensions==4.9.0
+ucx-py==0.35.0
+uff==0.6.9
+ujson==5.8.0
+urllib3==1.26.18
+wandb==0.16.3
+wasabi==1.1.2
+wcwidth==0.2.13
+weasel==0.3.4
+webencodings==0.5.1
+werkzeug==3.0.1
+wheel==0.42.0
+xdoctest==1.0.2
+xgboost==1.7.6
+yarl==1.9.4
+zict==3.0.0
+zipp==3.17.0

wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+    "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+    "python": "3.10.12",
+    "heartbeatAt": "2024-08-11T20:28:54.148690",
+    "startedAt": "2024-08-11T20:28:53.511276",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "--seq-length",
+        "4096",
+        "--sliding-window-size",
+        "4096",
+        "--micro-batch-size",
+        "1",
+        "--global-batch-size",
+        "320",
+        "--train-iters",
+        "20000",
+        "--tokenizer-type",
+        "HFPreTrainedTokenizer",
+        "--tokenizer-model",
+        "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+        "--train-data-path",
+        "304771887",
+        "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+        "--valid-data-path",
+        "304771887",
+        "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+        "--test-data-path",
+        "304771887",
+        "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
+        "--lr",
+        "2e-5",
+        "--min-lr",
+        "1e-6",
+        "--lr-decay-style",
+        "cosine",
+        "--lr-warmup-iters",
+        "500",
+        "--lr-decay-iters",
+        "20000",
+        "--weight-decay",
+        "0.1",
+        "--grad-clip-norm",
+        "1.0",
+        "--optimizer",
+        "adam",
+        "--adam-beta1",
+        "0.9",
+        "--adam-beta2",
+        "0.95",
+        "--adam-eps",
+        "1e-6",
+        "--save-interval",
+        "5",
+        "--eval-interval",
+        "200",
+        "--eval-iters",
+        "10",
+        "--bf16",
+        "--mixed-precision",
+        "--base-model",
+        "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+        "--save",
+        "/work/llm_recipes/models/yans-qwen2-0.5B",
+        "--load",
+        "/work/llm_recipes/models/yans-qwen2-0.5B",
+        "--fsdp-activation-checkpointing",
+        "--sharding-strategy",
+        "FULL_SHARD",
+        "--checkpoint-type",
+        "LOCAL_STATE_DICT",
+        "--save-n-checkpoints",
+        "10",
+        "--hf-upload-retry-limit",
+        "2",
+        "--hf-repo-id",
+        "koichi12/yans-qwen2-0.5B",
+        "--wandb-entity",
+        "iwakawa-koichi-q5-tohoku-nlp6723",
+        "--wandb-project",
+        "llm_tutorial",
+        "--wandb-name",
+        "yans-qwen2-0.5B_train_2024-08-12-05:28:42"
+    ],
+    "state": "running",
+    "program": "/project/examples/finetuning.py",
+    "codePathLocal": "examples/finetuning.py",
+    "codePath": "examples/finetuning.py",
+    "git": {
+        "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+        "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
+    },
+    "email": null,
+    "root": "/project",
+    "host": "gpu-koiwa-00",
+    "username": "koiwa",
+    "executable": "/usr/bin/python",
+    "cpu_count": 18,
+    "cpu_count_logical": 18,
+    "cpu_freq": {
+        "current": 2400.0429999999997,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.043,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 0.0625,
+            "used": 1.1444091796875e-05
+        }
+    },
+    "gpu": "NVIDIA A100-SXM4-40GB",
+    "gpu_count": 1,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA A100-SXM4-40GB",
+            "memory_total": 42949672960
+        }
+    ],
+    "memory": {
+        "total": 56.487823486328125
+    }
+}

wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"training/loss": 4.198177337646484, "training/perplexity": 66.56489507784042, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 5, "optimizer/lr": 1.19e-06, "optimizer/variance_l2": 0.00650817005037245, "optimizer/variance_sqrt_l2": 0.4753125323283669, "optimizer/momentum_l2": 0.4059003829432183, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.22650909423828125, "optimizer/variance_sqrt_l1": 1979.75, "optimizer/momentum_l1": 1591.375, "optimizer/weight_l1": 6886400.0, "optimizer/variance_abs_max": 0.004669189453125, "optimizer/variance_sqrt_abs_max": 0.068359375, "optimizer/momentum_abs_max": 0.058837890625, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 74.65197611400004, "stats/tokens_per_sec": 17562.0267305172, "stats/tokens_per_sec_per_gpu": 17562.0267305172, "stats/tflops": 70.61673302016509, "_timestamp": 1723408520.9273944, "_runtime": 387.4032714366913, "_step": 5, "_wandb": {"runtime": 391}}

wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,384 @@

+2024-08-12 05:28:53,525 INFO    StreamThr :10531 [internal.py:wandb_internal():86] W&B internal server running at pid: 10531, started at: 2024-08-12 05:28:53.524894
+2024-08-12 05:28:53,527 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status
+2024-08-12 05:28:53,529 INFO    WriterThread:10531 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
+2024-08-12 05:28:53,530 DEBUG   SenderThread:10531 [sender.py:send():382] send: header
+2024-08-12 05:28:53,544 DEBUG   SenderThread:10531 [sender.py:send():382] send: run
+2024-08-12 05:28:54,033 INFO    SenderThread:10531 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_052853-n84i0o06/files
+2024-08-12 05:28:54,033 INFO    SenderThread:10531 [sender.py:_start_run_threads():1136] run started: n84i0o06 with start time 1723408133.524123
+2024-08-12 05:28:54,038 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: check_version
+2024-08-12 05:28:54,038 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: check_version
+2024-08-12 05:28:54,128 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: run_start
+2024-08-12 05:28:54,135 DEBUG   HandlerThread:10531 [system_info.py:__init__():27] System info init
+2024-08-12 05:28:54,135 DEBUG   HandlerThread:10531 [system_info.py:__init__():42] System info init done
+2024-08-12 05:28:54,135 INFO    HandlerThread:10531 [system_monitor.py:start():194] Starting system monitor
+2024-08-12 05:28:54,135 INFO    SystemMonitor:10531 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-08-12 05:28:54,135 INFO    HandlerThread:10531 [system_monitor.py:probe():214] Collecting system info
+2024-08-12 05:28:54,136 INFO    SystemMonitor:10531 [interfaces.py:start():190] Started cpu monitoring
+2024-08-12 05:28:54,136 INFO    SystemMonitor:10531 [interfaces.py:start():190] Started disk monitoring
+2024-08-12 05:28:54,137 INFO    SystemMonitor:10531 [interfaces.py:start():190] Started gpu monitoring
+2024-08-12 05:28:54,138 INFO    SystemMonitor:10531 [interfaces.py:start():190] Started memory monitoring
+2024-08-12 05:28:54,139 INFO    SystemMonitor:10531 [interfaces.py:start():190] Started network monitoring
+2024-08-12 05:28:54,148 DEBUG   HandlerThread:10531 [system_info.py:probe():151] Probing system
+2024-08-12 05:28:54,150 DEBUG   HandlerThread:10531 [system_info.py:_probe_git():136] Probing git
+2024-08-12 05:28:54,163 DEBUG   HandlerThread:10531 [system_info.py:_probe_git():144] Probing git done
+2024-08-12 05:28:54,163 DEBUG   HandlerThread:10531 [system_info.py:probe():199] Probing system done
+2024-08-12 05:28:54,163 DEBUG   HandlerThread:10531 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T20:28:54.148690', 'startedAt': '2024-08-11T20:28:53.511276', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '5', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-12-05:28:42'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
+2024-08-12 05:28:54,163 INFO    HandlerThread:10531 [system_monitor.py:probe():224] Finished collecting system info
+2024-08-12 05:28:54,163 INFO    HandlerThread:10531 [system_monitor.py:probe():227] Publishing system info
+2024-08-12 05:28:54,164 INFO    HandlerThread:10531 [system_monitor.py:probe():229] Finished publishing system info
+2024-08-12 05:28:54,170 DEBUG   SenderThread:10531 [sender.py:send():382] send: files
+2024-08-12 05:28:54,170 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+2024-08-12 05:28:54,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: python_packages
+2024-08-12 05:28:54,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:28:54,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: python_packages
+2024-08-12 05:28:54,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:28:54,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:28:54,475 DEBUG   SenderThread:10531 [sender.py:send():382] send: telemetry
+2024-08-12 05:28:54,885 INFO    wandb-upload_0:10531 [upload_job.py:push():131] Uploaded file /tmp/tmp0u7r0fs3wandb/exuilam8-wandb-metadata.json
+2024-08-12 05:28:55,035 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json
+2024-08-12 05:28:55,035 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt
+2024-08-12 05:28:56,035 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:28:58,036 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:28:59,328 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:00,038 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:29:01,878 DEBUG   SenderThread:10531 [sender.py:send():382] send: config
+2024-08-12 05:29:01,879 DEBUG   SenderThread:10531 [sender.py:send():382] send: config
+2024-08-12 05:29:02,039 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:29:04,040 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:29:04,879 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:09,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:29:09,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:29:09,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:29:10,368 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:15,369 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:20,370 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:24,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:29:24,180 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:29:24,220 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:29:26,367 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:27,058 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml
+2024-08-12 05:29:31,577 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:36,578 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:39,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:29:39,180 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:29:39,220 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:29:42,448 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:47,449 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:52,450 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:29:54,139 DEBUG   SystemMonitor:10531 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-08-12 05:29:54,141 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:29:54,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:29:54,180 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:29:54,220 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:29:58,446 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:03,447 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:08,448 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:09,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:30:09,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:30:09,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:30:13,456 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:18,457 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:22,408 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
+2024-08-12 05:30:24,097 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:30:24,142 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:30:24,142 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:24,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:30:24,180 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:30:24,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:30:29,451 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:34,451 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:39,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:30:39,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:30:39,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:30:40,419 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:45,420 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:50,421 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:30:54,143 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:30:54,180 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:30:54,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:30:54,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:30:56,414 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:01,416 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:06,417 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:09,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:31:09,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:31:09,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:31:12,373 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:17,375 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:22,376 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:24,144 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:31:24,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:31:24,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:31:24,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:31:28,366 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:33,367 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:36,963 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
+2024-08-12 05:31:36,966 DEBUG   SenderThread:10531 [sender.py:send():382] send: history
+2024-08-12 05:31:36,966 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
+2024-08-12 05:31:36,968 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:31:37,152 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
+2024-08-12 05:31:39,006 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:39,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:31:39,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:31:39,183 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:31:40,154 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:31:44,409 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:49,410 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:31:54,145 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:31:54,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:31:54,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:31:54,228 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:31:55,354 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:00,355 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:05,356 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:09,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:32:09,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:32:09,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:32:10,376 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:15,377 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:20,378 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:24,146 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:32:24,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:32:24,181 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:32:24,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:32:25,450 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:30,451 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:35,451 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:39,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:32:39,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:32:39,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:32:41,437 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:46,438 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:51,438 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:32:51,692 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
+2024-08-12 05:32:51,694 DEBUG   SenderThread:10531 [sender.py:send():382] send: history
+2024-08-12 05:32:51,694 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
+2024-08-12 05:32:51,696 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:32:52,204 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
+2024-08-12 05:32:54,147 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:32:54,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:32:54,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:32:54,183 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:32:54,205 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:32:56,453 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:01,453 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:06,454 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:09,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:33:09,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:33:09,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:33:12,386 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:17,386 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:22,387 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:24,148 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:33:24,181 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:33:24,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:33:24,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:33:28,379 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:33,380 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:38,380 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:39,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:33:39,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:33:39,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:33:43,420 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:48,421 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:53,421 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:33:54,149 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:33:54,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:33:54,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:33:54,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:33:59,378 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:04,379 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:06,274 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
+2024-08-12 05:34:06,276 DEBUG   SenderThread:10531 [sender.py:send():382] send: history
+2024-08-12 05:34:06,277 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
+2024-08-12 05:34:06,278 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:34:07,249 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
+2024-08-12 05:34:08,250 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:34:09,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:34:09,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:34:09,184 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:34:09,395 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:14,395 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:19,396 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:24,150 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:34:24,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:34:24,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:34:24,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:34:25,394 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:30,395 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:35,396 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:39,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:34:39,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:34:39,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:34:40,439 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:45,439 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:50,440 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:34:54,152 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:34:54,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:34:54,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:34:54,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:34:55,454 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:00,455 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:05,455 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:09,182 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:35:09,182 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:35:09,224 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:35:11,407 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:16,407 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:20,928 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: partial_history
+2024-08-12 05:35:20,930 DEBUG   SenderThread:10531 [sender.py:send():382] send: history
+2024-08-12 05:35:20,931 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
+2024-08-12 05:35:20,932 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:35:21,295 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
+2024-08-12 05:35:21,970 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:22,296 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:35:24,152 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:35:24,232 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:35:24,255 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-12 05:35:24,256 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: stop_status
+2024-08-12 05:35:24,297 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:35:25,212 DEBUG   SenderThread:10531 [sender.py:send():382] send: exit
+2024-08-12 05:35:25,213 INFO    SenderThread:10531 [sender.py:send_exit():589] handling exit code: 1
+2024-08-12 05:35:25,213 INFO    SenderThread:10531 [sender.py:send_exit():591] handling runtime: 391
+2024-08-12 05:35:25,214 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:35:25,214 INFO    SenderThread:10531 [sender.py:send_exit():597] send defer
+2024-08-12 05:35:25,214 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,214 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 0
+2024-08-12 05:35:25,215 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,215 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-08-12 05:35:25,215 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 1
+2024-08-12 05:35:25,215 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,215 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 1
+2024-08-12 05:35:25,215 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,215 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-08-12 05:35:25,215 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 2
+2024-08-12 05:35:25,215 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,215 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 2
+2024-08-12 05:35:25,215 INFO    HandlerThread:10531 [system_monitor.py:finish():203] Stopping system monitor
+2024-08-12 05:35:25,215 DEBUG   SystemMonitor:10531 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-08-12 05:35:25,215 INFO    HandlerThread:10531 [interfaces.py:finish():202] Joined cpu monitor
+2024-08-12 05:35:25,216 DEBUG   SystemMonitor:10531 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-08-12 05:35:25,216 INFO    HandlerThread:10531 [interfaces.py:finish():202] Joined disk monitor
+2024-08-12 05:35:25,249 INFO    HandlerThread:10531 [interfaces.py:finish():202] Joined gpu monitor
+2024-08-12 05:35:25,249 INFO    HandlerThread:10531 [interfaces.py:finish():202] Joined memory monitor
+2024-08-12 05:35:25,249 INFO    HandlerThread:10531 [interfaces.py:finish():202] Joined network monitor
+2024-08-12 05:35:25,249 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,249 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-08-12 05:35:25,249 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 3
+2024-08-12 05:35:25,249 DEBUG   SenderThread:10531 [sender.py:send():382] send: stats
+2024-08-12 05:35:25,250 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,250 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 3
+2024-08-12 05:35:25,251 DEBUG   SenderThread:10531 [sender.py:send():382] send: history
+2024-08-12 05:35:25,252 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: summary_record
+2024-08-12 05:35:25,253 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:35:25,253 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,253 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-08-12 05:35:25,253 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 4
+2024-08-12 05:35:25,253 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,253 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 4
+2024-08-12 05:35:25,253 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,253 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-08-12 05:35:25,253 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 5
+2024-08-12 05:35:25,253 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,253 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 5
+2024-08-12 05:35:25,254 DEBUG   SenderThread:10531 [sender.py:send():382] send: summary
+2024-08-12 05:35:25,255 INFO    SenderThread:10531 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-12 05:35:25,255 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,255 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-08-12 05:35:25,255 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 6
+2024-08-12 05:35:25,255 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,255 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 6
+2024-08-12 05:35:25,255 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,255 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-08-12 05:35:25,256 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 7
+2024-08-12 05:35:25,256 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: status_report
+2024-08-12 05:35:25,256 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:25,256 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 7
+2024-08-12 05:35:25,256 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:25,256 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-08-12 05:35:25,298 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
+2024-08-12 05:35:26,141 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 8
+2024-08-12 05:35:26,142 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:26,142 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 8
+2024-08-12 05:35:26,142 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:26,142 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-08-12 05:35:26,142 INFO    SenderThread:10531 [job_builder.py:build():296] Attempting to build job artifact
+2024-08-12 05:35:26,143 INFO    SenderThread:10531 [job_builder.py:_get_source_type():426] is repo sourced job
+2024-08-12 05:35:26,157 INFO    SenderThread:10531 [job_builder.py:build():402] adding wandb-job metadata file
+2024-08-12 05:35:26,166 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 9
+2024-08-12 05:35:26,166 DEBUG   SenderThread:10531 [sender.py:send():382] send: artifact
+2024-08-12 05:35:26,166 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:26,167 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 9
+2024-08-12 05:35:26,213 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-12 05:35:26,299 INFO    Thread-12 :10531 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:35:27,302 INFO    wandb-upload_1:10531 [upload_job.py:push():86] Skipped uploading /singularity_home/.local/share/wandb/artifacts/staging/tmpyfws5ko3
+2024-08-12 05:35:27,738 INFO    wandb-upload_0:10531 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpypuucsag
+2024-08-12 05:35:29,357 INFO    SenderThread:10531 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTEzOTg5OTc5MQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTEzOTgzMzc4Mw==', 'versionIndex': 6}}}
+2024-08-12 05:35:29,357 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:29,357 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-08-12 05:35:29,358 INFO    SenderThread:10531 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-08-12 05:35:30,300 INFO    SenderThread:10531 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_052853-n84i0o06/files
+2024-08-12 05:35:30,301 INFO    SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt requirements.txt
+2024-08-12 05:35:30,301 INFO    SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml config.yaml
+2024-08-12 05:35:30,301 INFO    SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-metadata.json wandb-metadata.json
+2024-08-12 05:35:30,302 INFO    SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json wandb-summary.json
+2024-08-12 05:35:30,304 INFO    SenderThread:10531 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_052853-n84i0o06/files/output.log output.log
+2024-08-12 05:35:30,306 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 10
+2024-08-12 05:35:30,306 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
+2024-08-12 05:35:30,306 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:30,307 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 10
+2024-08-12 05:35:30,308 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:30,308 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-08-12 05:35:30,308 INFO    SenderThread:10531 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-12 05:35:30,718 INFO    wandb-upload_0:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/config.yaml
+2024-08-12 05:35:30,895 INFO    wandb-upload_3:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/output.log
+2024-08-12 05:35:31,214 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: keepalive
+2024-08-12 05:35:31,214 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-12 05:35:31,214 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
+2024-08-12 05:35:31,248 INFO    wandb-upload_1:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/requirements.txt
+2024-08-12 05:35:31,299 INFO    wandb-upload_2:10531 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_052853-n84i0o06/files/wandb-summary.json
+2024-08-12 05:35:31,499 INFO    Thread-11 (_thread_body):10531 [sender.py:transition_state():617] send defer: 11
+2024-08-12 05:35:31,499 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:31,500 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 11
+2024-08-12 05:35:31,500 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:31,500 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-08-12 05:35:31,500 INFO    SenderThread:10531 [file_pusher.py:join():178] waiting for file pusher
+2024-08-12 05:35:31,500 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 12
+2024-08-12 05:35:31,500 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:31,500 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 12
+2024-08-12 05:35:31,500 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:31,500 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-08-12 05:35:31,500 INFO    SenderThread:10531 [file_stream.py:finish():595] file stream finish called
+2024-08-12 05:35:32,061 INFO    SenderThread:10531 [file_stream.py:finish():599] file stream finish is done
+2024-08-12 05:35:32,061 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 13
+2024-08-12 05:35:32,061 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:32,061 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 13
+2024-08-12 05:35:32,062 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:32,062 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-08-12 05:35:32,062 INFO    SenderThread:10531 [sender.py:transition_state():617] send defer: 14
+2024-08-12 05:35:32,062 DEBUG   SenderThread:10531 [sender.py:send():382] send: final
+2024-08-12 05:35:32,062 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: defer
+2024-08-12 05:35:32,062 INFO    HandlerThread:10531 [handler.py:handle_request_defer():172] handle defer: 14
+2024-08-12 05:35:32,062 DEBUG   SenderThread:10531 [sender.py:send():382] send: footer
+2024-08-12 05:35:32,062 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: defer
+2024-08-12 05:35:32,062 INFO    SenderThread:10531 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-08-12 05:35:32,063 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-12 05:35:32,063 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
+2024-08-12 05:35:32,063 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-12 05:35:32,064 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: poll_exit
+2024-08-12 05:35:32,064 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: server_info
+2024-08-12 05:35:32,064 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: server_info
+2024-08-12 05:35:32,065 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: get_summary
+2024-08-12 05:35:32,066 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: sampled_history
+2024-08-12 05:35:32,067 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-12 05:35:32,067 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: job_info
+2024-08-12 05:35:32,238 DEBUG   SenderThread:10531 [sender.py:send_request():409] send_request: job_info
+2024-08-12 05:35:32,238 INFO    MainThread:10531 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+2024-08-12 05:35:32,239 INFO    MainThread:10531 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+2024-08-12 05:35:32,239 INFO    MainThread:10531 [wandb_run.py:_footer_sync_info():3825] logging synced files
+2024-08-12 05:35:32,240 DEBUG   HandlerThread:10531 [handler.py:handle_request():146] handle_request: shutdown
+2024-08-12 05:35:32,240 INFO    HandlerThread:10531 [handler.py:finish():869] shutting down handler
+2024-08-12 05:35:33,068 INFO    WriterThread:10531 [datastore.py:close():296] close: /project/wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb
+2024-08-12 05:35:33,239 INFO    SenderThread:10531 [sender.py:finish():1572] shutting down sender
+2024-08-12 05:35:33,239 INFO    SenderThread:10531 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-12 05:35:33,239 INFO    SenderThread:10531 [file_pusher.py:join():178] waiting for file pusher

wandb/run-20240812_052853-n84i0o06/logs/debug.log ADDED Viewed

	@@ -0,0 +1,30 @@

+2024-08-12 05:28:53,517 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-12 05:28:53,517 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Configure stats pid to 10460
+2024-08-12 05:28:53,517 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-12 05:28:53,517 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-12 05:28:53,517 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_052853-n84i0o06/logs/debug.log
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_052853-n84i0o06/logs/debug-internal.log
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_init.py:init():566] calling init triggers
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-12-05:28:42', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 5, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_init.py:init():616] starting backend
+2024-08-12 05:28:53,518 INFO    MainThread:10460 [wandb_init.py:init():620] setting up manager
+2024-08-12 05:28:53,523 INFO    MainThread:10460 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-12 05:28:53,523 INFO    MainThread:10460 [wandb_init.py:init():628] backend started and connected
+2024-08-12 05:28:53,528 INFO    MainThread:10460 [wandb_init.py:init():720] updated telemetry
+2024-08-12 05:28:53,540 INFO    MainThread:10460 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-12 05:28:54,037 INFO    MainThread:10460 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-12 05:28:54,121 INFO    MainThread:10460 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-08-12 05:28:54,121 INFO    MainThread:10460 [wandb_init.py:init():804] starting run threads in backend
+2024-08-12 05:28:54,179 INFO    MainThread:10460 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-12 05:28:54,180 INFO    MainThread:10460 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-12 05:28:54,180 INFO    MainThread:10460 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-12 05:28:54,180 INFO    MainThread:10460 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-12 05:28:54,181 INFO    MainThread:10460 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-12 05:29:01,877 INFO    MainThread:10460 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
+2024-08-12 05:29:01,878 INFO    MainThread:10460 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+2024-08-12 05:35:33,240 WARNING MsgRouterThr:10460 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240812_052853-n84i0o06/run-n84i0o06.wandb ADDED Viewed

Binary file (45.7 kB). View file

wandb/run-20240812_063027-j1htzx7q/files/output.log ADDED Viewed

	@@ -0,0 +1,121 @@

+Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+Loading checkpoint shards:  67%|██████▋   | 2/3 [02:31<01:16, 76.44s/it]
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
+--> Model /share/pretrained_lm/google/gemma-2-2b
+--> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
+BFloat16 enabled for mixed precision - using bfSixteen policy
+--> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+    train:      6400000
+    validation: 21334400
+    test:       3200
+Loading checkpoint shards: 100%|██████████| 3/3 [02:40<00:00, 53.37s/it]
+/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+  warnings.warn(
+Let split = None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+Building a BlendedDataset for a single MegatronDataset
+Unable to save the indexes because path_to_cache is None
+> finished creating GPT datasets ...
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
+File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
+model info: FullyShardedDataParallel(
+  (_fsdp_wrapped_module): Gemma2ForCausalLM(
+    (model): Gemma2Model(
+      (embed_tokens): Embedding(256000, 2304, padding_idx=0)
+      (layers): ModuleList(
+        (0-25): 26 x FullyShardedDataParallel(
+          (_fsdp_wrapped_module): CheckpointWrapper(
+            (_checkpoint_wrapped_module): Gemma2DecoderLayer(
+              (self_attn): Gemma2FlashAttention2(
+                (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
+                (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
+                (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
+                (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
+                (rotary_emb): Gemma2RotaryEmbedding()
+              )
+              (mlp): Gemma2MLP(
+                (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
+                (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
+                (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
+                (act_fn): PytorchGELUTanh()
+              )
+              (input_layernorm): Gemma2RMSNorm()
+              (post_attention_layernorm): Gemma2RMSNorm()
+              (pre_feedforward_layernorm): Gemma2RMSNorm()
+              (post_feedforward_layernorm): Gemma2RMSNorm()
+            )
+          )
+        )
+      )
+      (norm): Gemma2RMSNorm()
+    )
+    (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
+  )
+)
+model config: Gemma2Config {
+  "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
+  "architectures": [
+    "Gemma2ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": 50.0,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "eos_token_id": 1,
+  "final_logit_softcapping": 30.0,
+  "head_dim": 256,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2304,
+  "initializer_range": 0.02,
+  "intermediate_size": 9216,
+  "label_smoothing": 0.0,
+  "max_position_embeddings": 4096,
+  "model_type": "gemma2",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 26,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "torch_dtype": "float32",
+  "transformers_version": "4.43.3",
+  "use_cache": false,
+  "vocab_size": 256000
+}
+It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 281, in main
+    train(
+  File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
+    loss.backward()
+  File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
+    torch.autograd.backward(
+  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
+    _engine_run_backward(
+  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
+    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.70 GiB. GPU 0 has a total capacity of 39.39 GiB of which 3.86 GiB is free. Including non-PyTorch memory, this process has 35.52 GiB memory in use. Of the allocated memory 32.71 GiB is allocated by PyTorch, and 1.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

wandb/run-20240812_063027-j1htzx7q/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 167}}

wandb/run-20240823_154448-v9m85jnt/files/config.yaml ADDED Viewed

	@@ -0,0 +1,321 @@

+wandb_version: 1
+sharding_strategy:
+  desc: null
+  value: FULL_SHARD
+checkpoint_type:
+  desc: null
+  value: LOCAL_STATE_DICT
+fsdp_activation_checkpointing:
+  desc: null
+  value: true
+fsdp_cpu_offload:
+  desc: null
+  value: false
+low_cpu_fsdp:
+  desc: null
+  value: false
+no_meta_device:
+  desc: null
+  value: false
+data_path:
+  desc: null
+  value: null
+split:
+  desc: null
+  value: 969, 30, 1
+train_data_path:
+  desc: null
+  value:
+  - '1754785366'
+  - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+  - '28623823675'
+  - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
+valid_data_path:
+  desc: null
+  value:
+  - '1754785366'
+  - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+test_data_path:
+  desc: null
+  value:
+  - '1754785366'
+  - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
+data_cache_path:
+  desc: null
+  value: null
+vocab_size:
+  desc: null
+  value: null
+vocab_file:
+  desc: null
+  value: null
+merge_file:
+  desc: null
+  value: null
+seq_length:
+  desc: null
+  value: 2048
+num_workers:
+  desc: null
+  value: 2
+tokenizer_type:
+  desc: null
+  value: HFPreTrainedTokenizer
+tokenizer_model:
+  desc: null
+  value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+reset_position_ids:
+  desc: null
+  value: false
+reset_attention_mask:
+  desc: null
+  value: false
+eod_mask_loss:
+  desc: null
+  value: false
+retro_return_doc_ids:
+  desc: null
+  value: false
+short_seq_prob:
+  desc: null
+  value: 0.1
+vocab_extra_ids:
+  desc: null
+  value: 0
+seed:
+  desc: null
+  value: 1234
+use_mpi:
+  desc: null
+  value: false
+wandb_entity:
+  desc: null
+  value: iwakawa-koichi-q5-tohoku-nlp6723
+wandb_name:
+  desc: null
+  value: Qwen2-0.5b-0.2_train_2024-08-23-15:44:18
+wandb_project:
+  desc: null
+  value: llm_tutorial-0.2
+quantization:
+  desc: null
+  value: false
+use_freeze_layers:
+  desc: null
+  value: false
+freeze_layers:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+mixed_precision:
+  desc: null
+  value: true
+param_dtype:
+  desc: null
+  value: null
+load:
+  desc: null
+  value: /work/llm_recipes/models/Qwen2-0.5b-0.2
+save:
+  desc: null
+  value: /work/llm_recipes/models/Qwen2-0.5b-0.2
+base_model:
+  desc: null
+  value: /share/pretrained_lm/Qwen/Qwen2-0.5B
+use_better_transformer:
+  desc: null
+  value: false
+grad_clip_norm:
+  desc: null
+  value: 1.0
+eval_interval:
+  desc: null
+  value: 10
+save_interval:
+  desc: null
+  value: 10
+eval_iters:
+  desc: null
+  value: 10
+optimizer:
+  desc: null
+  value: anyprecision
+lr:
+  desc: null
+  value: 2.0e-05
+lr_decay_style:
+  desc: null
+  value: cosine
+lr_decay_iters:
+  desc: null
+  value: 7500
+lr_warmup_iters:
+  desc: null
+  value: 500
+min_lr:
+  desc: null
+  value: 1.0e-06
+train_iters:
+  desc: null
+  value: 7500
+train_samples:
+  desc: null
+  value: null
+global_batch_size:
+  desc: null
+  value: 320
+micro_batch_size:
+  desc: null
+  value: 1
+make_vocab_size_divisible_by:
+  desc: null
+  value: 128
+sliding_window_size:
+  desc: null
+  value: 4096
+skip_batch:
+  desc: null
+  value: null
+no_save_optimizer_state:
+  desc: null
+  value: false
+continual_pretraining:
+  desc: null
+  value: false
+instruction_tuning:
+  desc: null
+  value: false
+direct_preference_optimization:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.1
+hidden_dropout:
+  desc: null
+  value: 0.1
+weight_decay:
+  desc: null
+  value: 0.1
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.95
+adam_eps:
+  desc: null
+  value: 1.0e-06
+hf_transformer_model_dir:
+  desc: null
+  value: null
+instruction_train_data_path:
+  desc: null
+  value: null
+instruction_valid_data_path:
+  desc: null
+  value: null
+epoch:
+  desc: null
+  value: null
+instruction_dataset_size:
+  desc: null
+  value: null
+save_sampler_state:
+  desc: null
+  value: false
+label_smoothing:
+  desc: null
+  value: 0.0
+save_n_checkpoints:
+  desc: null
+  value: 10
+hf_repo_id:
+  desc: null
+  value: koichi12/Qwen2-0.5b-0.2
+create_public_hf_repo:
+  desc: null
+  value: false
+upload_all_checkpoints_to_hf:
+  desc: null
+  value: true
+hf_upload_retry_limit:
+  desc: null
+  value: 2
+exit_duration_in_mins:
+  desc: null
+  value: null
+source_key:
+  desc: null
+  value: null
+target_key:
+  desc: null
+  value: null
+attn_implementation:
+  desc: null
+  value: flash_attention_2
+efficient_instruction_tuning:
+  desc: null
+  value: false
+remove_padding_masking:
+  desc: null
+  value: false
+save_start_iter:
+  desc: null
+  value: null
+valid_micro_batch_size:
+  desc: null
+  value: 1
+rank:
+  desc: null
+  value: 0
+world_size:
+  desc: null
+  value: 1
+padded_vocab_size:
+  desc: null
+  value: 151680
+gradient_accumulation_steps:
+  desc: null
+  value: 320
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: huggingface
+    huggingface_version: 4.43.3
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1724395488.891619
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      - 105
+      2:
+      - 1
+      - 11
+      - 49
+      - 55
+      - 71
+      - 105
+      3:
+      - 13
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      6: 4.43.3
+      8:
+      - 5
+      13: linux-x86_64

wandb/run-20240823_154448-v9m85jnt/files/output.log ADDED Viewed

	@@ -0,0 +1,15 @@

+Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
+Clearing GPU cache for all ranks
+--> Running with torch torch_distributed debug set to detail
+File not found: /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
+File not found: /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
+Unable to read latest iteration from /work/llm_recipes/models/Qwen2-0.5b-0.2/latest_iteration.txt
+Traceback (most recent call last):
+  File "/project/examples/finetuning.py", line 13, in <module>
+    main()
+  File "/project/src/llama_recipes/finetuning.py", line 103, in main
+    model = get_model(
+  File "/project/src/llama_recipes/get_models.py", line 106, in get_model
+    assert sliding_window == 131072
+AssertionError

wandb/run-20240823_154448-v9m85jnt/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,375 @@

+absl-py==2.1.0
+accelerate==0.23.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+apex==0.1
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+astroid==3.2.4
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+beautifulsoup4==4.12.3
+bert-score==0.3.13
+bleach==6.1.0
+blis==0.7.11
+build==1.2.1
+cachecontrol==0.14.0
+cachetools==5.3.2
+catalogue==2.0.10
+certifi==2024.2.2
+cffi==1.16.0
+chardet==5.2.0
+charset-normalizer==3.3.2
+cleo==2.1.0
+click==8.1.7
+cloudpathlib==0.16.0
+cloudpickle==3.0.0
+cmake==3.28.1
+colorama==0.4.6
+comm==0.2.1
+confection==0.1.4
+contourpy==1.2.0
+cramjam==2.8.3
+crashtest==0.4.1
+cryptography==43.0.0
+cubinlinker==0.3.0+2.g405ac64
+cuda-python==12.3.0rc4+9.gdb8c48a.dirty
+cudf==23.12.0
+cugraph-dgl==23.12.0
+cugraph-service-client==23.12.0
+cugraph-service-server==23.12.0
+cugraph==23.12.0
+cuml==23.12.0
+cupy-cuda12x==12.3.0
+cycler==0.12.1
+cymem==2.0.8
+cython==3.0.8
+dask-cuda==23.12.0
+dask-cudf==23.12.0
+dask==2023.11.0
+dataclasses-json==0.6.7
+dataproperty==1.0.1
+datasets==2.20.0
+debugpy==1.8.1
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.8
+distributed==2023.11.0
+distro==1.9.0
+dm-tree==0.1.8
+docker-pycreds==0.4.0
+dulwich==0.21.7
+einops==0.7.0
+emoji==2.12.1
+entmax==1.3
+evaluate==0.4.2
+exceptiongroup==1.2.0
+execnet==2.0.2
+executing==2.0.1
+expecttest==0.1.3
+fastjsonschema==2.19.1
+fastparquet==2023.10.1
+fastrlock==0.8.2
+filelock==3.13.1
+flash-attn==2.4.2
+fonttools==4.48.1
+frozenlist==1.4.1
+fsspec==2023.12.2
+fugashi==1.3.2
+fuzzywuzzy==0.18.0
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.43
+google-auth-oauthlib==0.4.6
+google-auth==2.27.0
+graphsurgeon==0.4.6
+greenlet==3.0.3
+grpcio==1.60.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+huggingface-hub==0.24.5
+hydra-core==1.3.2
+hypothesis==5.35.1
+idna==3.6
+importlib-metadata==7.0.1
+iniconfig==2.0.0
+installer==0.7.0
+intel-openmp==2021.4.0
+ipadic==1.0.0
+ipykernel==6.29.2
+ipython-genutils==0.2.0
+ipython==8.21.0
+isort==5.13.2
+jaraco.classes==3.4.0
+jedi==0.19.1
+jeepney==0.8.0
+jinja2==3.1.3
+jiter==0.5.0
+joblib==1.3.2
+json5==0.9.14
+jsonargparse==3.13.1
+jsonlines==4.0.0
+jsonnet==0.19.1
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-tensorboard==0.2.0
+jupyterlab-pygments==0.3.0
+jupyterlab-server==1.2.0
+jupyterlab==2.3.2
+jupytext==1.16.1
+keyring==24.3.1
+kiwisolver==1.4.5
+langchain-community==0.2.12
+langchain-core==0.2.31
+langchain-huggingface==0.0.2
+langchain-openai==0.1.21
+langchain-text-splitters==0.2.2
+langchain==0.2.13
+langcodes==3.3.0
+langsmith==0.1.99
+lazy-loader==0.3
+levenshtein==0.25.1
+librosa==0.10.1
+lightning-utilities==0.11.6
+llm-jp-eval==1.4.0
+llvmlite==0.40.1
+lm-eval==0.3.0
+locket==1.0.0
+logzero==1.7.0
+lxml==5.2.2
+markdown-it-py==3.0.0
+markdown==3.5.2
+markupsafe==2.1.4
+marshmallow==3.21.3
+matplotlib-inline==0.1.6
+matplotlib==3.8.2
+mbstrdecoder==1.1.3
+mccabe==0.7.0
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mecab-python3==1.0.6
+mistune==3.0.2
+mkl-devel==2021.1.1
+mkl-include==2021.1.1
+mkl==2021.1.1
+mock==5.1.0
+mojimoji==0.0.13
+more-itertools==9.1.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+multiprocess==0.70.16
+murmurhash==1.0.10
+mypy-extensions==1.0.0
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+neologdn==0.5.3
+nest-asyncio==1.6.0
+networkx==2.6.3
+ninja==1.11.1.1
+nltk==3.8.1
+notebook==6.4.10
+numba==0.57.1+1.g1ff679645
+numexpr==2.10.1
+numpy==1.24.4
+nvfuser==0.1.4a0+d0bb811
+nvidia-dali-cuda120==1.34.0
+nvidia-pyindex==1.0.9
+nvtx==0.2.5
+oauthlib==3.2.2
+omegaconf==2.3.0
+onnx==1.15.0rc2
+openai==1.40.6
+opencv==4.7.0
+optree==0.10.0
+orjson==3.10.7
+packaging==23.2
+pandas==2.2.2
+pandocfilters==1.5.1
+parso==0.8.3
+partd==1.4.1
+pathvalidate==3.2.0
+peft==0.5.0
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+pkginfo==1.11.1
+plac==1.4.3
+platformdirs==4.2.0
+pluggy==1.4.0
+ply==3.11
+poetry-core==1.9.0
+poetry-plugin-export==1.8.0
+poetry==1.8.3
+polygraphy==0.49.4
+pooch==1.8.0
+portalocker==2.10.1
+preshed==3.0.9
+prettytable==3.9.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.24.4
+psutil==5.9.4
+ptxcompiler==0.8.1+2.g0d406d6
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow-hotfix==0.6
+pyarrow==15.0.2
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pybind11-global==2.11.1
+pybind11==2.11.1
+pycocotools==2.0+nv0.8.0
+pycountry==24.6.1
+pycparser==2.21
+pydantic-core==2.16.2
+pydantic==2.6.1
+pygments==2.17.2
+pylibcugraph==23.12.0
+pylibcugraphops==23.12.0
+pylibraft==23.12.0
+pylint==3.2.6
+pynvml==11.4.1
+pyparsing==3.1.1
+pyproject-hooks==1.1.0
+pytablewriter==1.2.0
+pytest-flakefinder==1.1.0
+pytest-rerunfailures==13.0
+pytest-shard==0.1.2
+pytest-xdist==3.5.0
+pytest==8.0.0
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-hostlist==1.23.0
+python-levenshtein==0.25.1
+pytorch-lightning==2.4.0
+pytorch-quantization==2.1.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+raft-dask==23.12.0
+rapidfuzz==3.9.6
+rapids-dask-dependency==23.12.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests-toolbelt==1.0.0
+requests==2.32.3
+rhoknp==1.7.0
+rich==13.7.0
+rmm==23.12.0
+rouge-score==0.1.2
+rpds-py==0.17.1
+rsa==4.9
+sacrebleu==2.4.2
+safetensors==0.4.3
+scikit-learn==1.5.1
+scipy==1.12.0
+secretstorage==3.3.3
+send2trash==1.8.2
+sentence-transformers==3.0.1
+sentencepiece==0.1.99
+sentry-sdk==2.12.0
+setproctitle==1.3.3
+setuptools==68.2.2
+shellingham==1.5.4
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+spacy==3.7.2
+sphinx-glpi-theme==0.6
+sqlalchemy==2.0.32
+sqlitedict==2.1.0
+srsly==2.4.8
+stack-data==0.6.3
+sumeval==0.2.2
+sympy==1.12
+tabledata==1.3.3
+tabulate==0.9.0
+tbb==2021.11.0
+tblib==3.0.0
+tcolorpy==0.1.6
+tenacity==8.5.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorboard==2.9.0
+tensorrt==8.6.3
+terminado==0.18.0
+termplotlib==0.3.9
+text-generation==0.7.0
+thinc==8.2.3
+threadpoolctl==3.2.0
+thriftpy2==0.4.17
+tiktoken==0.7.0
+tinycss2==1.2.1
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+tomlkit==0.13.2
+toolz==0.12.1
+torch-tensorrt==2.3.0a0
+torch==2.3.0a0+ebedce2
+torchdata==0.7.1a0
+torchmetrics==0.10.3
+torchtext==0.17.0a0
+torchvision==0.18.0a0
+tornado==6.4
+tqdm-multiprocess==0.0.11
+tqdm==4.66.5
+traitlets==5.9.0
+transformer-engine==1.3.0+5b90b7f
+transformers==4.43.3
+treelite-runtime==3.9.1
+treelite==3.9.1
+triton==2.2.0+e28a256
+trove-classifiers==2024.7.2
+typepy==1.3.2
+typer==0.9.0
+types-dataclasses==0.6.6
+typing-extensions==4.12.2
+typing-inspect==0.9.0
+tzdata==2024.1
+ucx-py==0.35.0
+uff==0.6.9
+ujson==5.8.0
+unbabel-comet==2.2.2
+unidic-lite==1.0.8
+urllib3==1.26.18
+virtualenv==20.26.3
+wandb==0.16.3
+wasabi==1.1.2
+wcwidth==0.2.13
+weasel==0.3.4
+webencodings==0.5.1
+werkzeug==3.0.1
+wheel==0.42.0
+word2number==1.1
+xdoctest==1.0.2
+xgboost==1.7.6
+xmltodict==0.13.0
+xxhash==3.4.1
+yarl==1.9.4
+zict==3.0.0
+zipp==3.17.0
+zstandard==0.23.0

wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,220 @@

+{
+    "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+    "python": "3.10.12",
+    "heartbeatAt": "2024-08-23T06:44:49.486428",
+    "startedAt": "2024-08-23T06:44:48.878270",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "--seq-length",
+        "2048",
+        "--sliding-window-size",
+        "4096",
+        "--micro-batch-size",
+        "1",
+        "--valid_micro_batch_size",
+        "1",
+        "--global-batch-size",
+        "320",
+        "--train-iters",
+        "7500",
+        "--tokenizer-type",
+        "HFPreTrainedTokenizer",
+        "--tokenizer-model",
+        "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+        "--train-data-path",
+        "1754785366",
+        "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
+        "28623823675",
+        "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
+        "--valid-data-path",
+        "1754785366",
+        "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
+        "--test-data-path",
+        "1754785366",
+        "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
+        "--lr",
+        "2e-5",
+        "--min-lr",
+        "1e-6",
+        "--lr-decay-style",
+        "cosine",
+        "--lr-warmup-iters",
+        "500",
+        "--lr-decay-iters",
+        "7500",
+        "--weight-decay",
+        "0.1",
+        "--grad-clip-norm",
+        "1.0",
+        "--optimizer",
+        "anyprecision",
+        "--adam-beta1",
+        "0.9",
+        "--adam-beta2",
+        "0.95",
+        "--adam-eps",
+        "1e-6",
+        "--save-interval",
+        "10",
+        "--eval-interval",
+        "10",
+        "--eval-iters",
+        "10",
+        "--bf16",
+        "--mixed-precision",
+        "--base-model",
+        "/share/pretrained_lm/Qwen/Qwen2-0.5B",
+        "--save",
+        "/work/llm_recipes/models/Qwen2-0.5b-0.2",
+        "--load",
+        "/work/llm_recipes/models/Qwen2-0.5b-0.2",
+        "--fsdp-activation-checkpointing",
+        "--sharding-strategy",
+        "FULL_SHARD",
+        "--checkpoint-type",
+        "LOCAL_STATE_DICT",
+        "--save-n-checkpoints",
+        "10",
+        "--upload-all-checkpoints-to-hf",
+        "--hf-upload-retry-limit",
+        "2",
+        "--hf-repo-id",
+        "koichi12/Qwen2-0.5b-0.2",
+        "--wandb-entity",
+        "iwakawa-koichi-q5-tohoku-nlp6723",
+        "--wandb-project",
+        "llm_tutorial-0.2",
+        "--wandb-name",
+        "Qwen2-0.5b-0.2_train_2024-08-23-15:44:18"
+    ],
+    "state": "running",
+    "program": "/project/examples/finetuning.py",
+    "codePathLocal": "examples/finetuning.py",
+    "codePath": "examples/finetuning.py",
+    "git": {
+        "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+        "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
+    },
+    "email": null,
+    "root": "/project",
+    "host": "gpu-koiwa-00",
+    "username": "koiwa",
+    "executable": "/usr/bin/python",
+    "cpu_count": 18,
+    "cpu_count_logical": 18,
+    "cpu_freq": {
+        "current": 2400.0389999999993,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2400.039,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 0.0625,
+            "used": 1.1444091796875e-05
+        }
+    },
+    "gpu": "NVIDIA A100-SXM4-40GB",
+    "gpu_count": 1,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA A100-SXM4-40GB",
+            "memory_total": 42949672960
+        }
+    ],
+    "memory": {
+        "total": 56.487831115722656
+    }
+}

wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 1}}

wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,189 @@

+2024-08-23 15:44:48,892 INFO    StreamThr :10032 [internal.py:wandb_internal():86] W&B internal server running at pid: 10032, started at: 2024-08-23 15:44:48.891774
+2024-08-23 15:44:48,893 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: status
+2024-08-23 15:44:48,896 INFO    WriterThread:10032 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
+2024-08-23 15:44:48,897 DEBUG   SenderThread:10032 [sender.py:send():382] send: header
+2024-08-23 15:44:48,913 DEBUG   SenderThread:10032 [sender.py:send():382] send: run
+2024-08-23 15:44:49,390 INFO    SenderThread:10032 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_154448-v9m85jnt/files
+2024-08-23 15:44:49,390 INFO    SenderThread:10032 [sender.py:_start_run_threads():1136] run started: v9m85jnt with start time 1724395488.891619
+2024-08-23 15:44:49,395 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: check_version
+2024-08-23 15:44:49,396 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: check_version
+2024-08-23 15:44:49,467 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: run_start
+2024-08-23 15:44:49,473 DEBUG   HandlerThread:10032 [system_info.py:__init__():27] System info init
+2024-08-23 15:44:49,474 DEBUG   HandlerThread:10032 [system_info.py:__init__():42] System info init done
+2024-08-23 15:44:49,474 INFO    HandlerThread:10032 [system_monitor.py:start():194] Starting system monitor
+2024-08-23 15:44:49,474 INFO    SystemMonitor:10032 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-08-23 15:44:49,474 INFO    HandlerThread:10032 [system_monitor.py:probe():214] Collecting system info
+2024-08-23 15:44:49,474 INFO    SystemMonitor:10032 [interfaces.py:start():190] Started cpu monitoring
+2024-08-23 15:44:49,475 INFO    SystemMonitor:10032 [interfaces.py:start():190] Started disk monitoring
+2024-08-23 15:44:49,475 INFO    SystemMonitor:10032 [interfaces.py:start():190] Started gpu monitoring
+2024-08-23 15:44:49,475 INFO    SystemMonitor:10032 [interfaces.py:start():190] Started memory monitoring
+2024-08-23 15:44:49,476 INFO    SystemMonitor:10032 [interfaces.py:start():190] Started network monitoring
+2024-08-23 15:44:49,486 DEBUG   HandlerThread:10032 [system_info.py:probe():151] Probing system
+2024-08-23 15:44:49,488 DEBUG   HandlerThread:10032 [system_info.py:_probe_git():136] Probing git
+2024-08-23 15:44:49,500 DEBUG   HandlerThread:10032 [system_info.py:_probe_git():144] Probing git done
+2024-08-23 15:44:49,500 DEBUG   HandlerThread:10032 [system_info.py:probe():199] Probing system done
+2024-08-23 15:44:49,500 DEBUG   HandlerThread:10032 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T06:44:49.486428', 'startedAt': '2024-08-23T06:44:48.878270', 'docker': None, 'cuda': None, 'args': ('--seq-length', '2048', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--valid_micro_batch_size', '1', '--global-batch-size', '320', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-15:44:18'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
+2024-08-23 15:44:49,500 INFO    HandlerThread:10032 [system_monitor.py:probe():224] Finished collecting system info
+2024-08-23 15:44:49,500 INFO    HandlerThread:10032 [system_monitor.py:probe():227] Publishing system info
+2024-08-23 15:44:49,502 INFO    HandlerThread:10032 [system_monitor.py:probe():229] Finished publishing system info
+2024-08-23 15:44:49,528 DEBUG   SenderThread:10032 [sender.py:send():382] send: files
+2024-08-23 15:44:49,529 INFO    SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
+2024-08-23 15:44:49,540 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: python_packages
+2024-08-23 15:44:49,540 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: stop_status
+2024-08-23 15:44:49,540 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-23 15:44:49,541 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: python_packages
+2024-08-23 15:44:49,543 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: stop_status
+2024-08-23 15:44:49,740 DEBUG   SenderThread:10032 [sender.py:send():382] send: telemetry
+2024-08-23 15:44:50,157 INFO    wandb-upload_0:10032 [upload_job.py:push():131] Uploaded file /tmp/tmp_akktvpmwandb/xbudf9th-wandb-metadata.json
+2024-08-23 15:44:50,392 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json
+2024-08-23 15:44:50,392 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
+2024-08-23 15:44:50,392 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
+2024-08-23 15:44:50,729 DEBUG   SenderThread:10032 [sender.py:send():382] send: exit
+2024-08-23 15:44:50,729 INFO    SenderThread:10032 [sender.py:send_exit():589] handling exit code: 1
+2024-08-23 15:44:50,730 INFO    SenderThread:10032 [sender.py:send_exit():591] handling runtime: 1
+2024-08-23 15:44:50,731 INFO    SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-23 15:44:50,731 INFO    SenderThread:10032 [sender.py:send_exit():597] send defer
+2024-08-23 15:44:50,731 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,731 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 0
+2024-08-23 15:44:50,731 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,732 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-08-23 15:44:50,732 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 1
+2024-08-23 15:44:50,732 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,732 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 1
+2024-08-23 15:44:50,732 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,732 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-08-23 15:44:50,732 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 2
+2024-08-23 15:44:50,732 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,732 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 2
+2024-08-23 15:44:50,732 INFO    HandlerThread:10032 [system_monitor.py:finish():203] Stopping system monitor
+2024-08-23 15:44:50,732 INFO    HandlerThread:10032 [interfaces.py:finish():202] Joined cpu monitor
+2024-08-23 15:44:50,733 DEBUG   SystemMonitor:10032 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-08-23 15:44:50,733 INFO    HandlerThread:10032 [interfaces.py:finish():202] Joined disk monitor
+2024-08-23 15:44:50,733 DEBUG   SystemMonitor:10032 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-08-23 15:44:50,733 DEBUG   SystemMonitor:10032 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-08-23 15:44:50,765 INFO    HandlerThread:10032 [interfaces.py:finish():202] Joined gpu monitor
+2024-08-23 15:44:50,765 INFO    HandlerThread:10032 [interfaces.py:finish():202] Joined memory monitor
+2024-08-23 15:44:50,765 INFO    HandlerThread:10032 [interfaces.py:finish():202] Joined network monitor
+2024-08-23 15:44:50,766 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,766 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-08-23 15:44:50,766 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 3
+2024-08-23 15:44:50,766 DEBUG   SenderThread:10032 [sender.py:send():382] send: stats
+2024-08-23 15:44:50,766 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,766 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 3
+2024-08-23 15:44:50,766 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,766 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-08-23 15:44:50,766 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 4
+2024-08-23 15:44:50,767 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,767 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 4
+2024-08-23 15:44:50,767 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,767 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-08-23 15:44:50,767 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 5
+2024-08-23 15:44:50,767 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,767 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 5
+2024-08-23 15:44:50,767 DEBUG   SenderThread:10032 [sender.py:send():382] send: summary
+2024-08-23 15:44:50,768 INFO    SenderThread:10032 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
+2024-08-23 15:44:50,768 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,768 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-08-23 15:44:50,768 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 6
+2024-08-23 15:44:50,768 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,768 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 6
+2024-08-23 15:44:50,768 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,769 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-08-23 15:44:50,771 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: status_report
+2024-08-23 15:44:50,957 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 7
+2024-08-23 15:44:50,957 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:50,957 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 7
+2024-08-23 15:44:50,958 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:50,958 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-08-23 15:44:51,392 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml
+2024-08-23 15:44:51,392 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
+2024-08-23 15:44:51,729 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-23 15:44:52,393 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
+2024-08-23 15:44:52,721 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 8
+2024-08-23 15:44:52,721 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
+2024-08-23 15:44:52,721 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:52,721 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 8
+2024-08-23 15:44:52,721 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:52,721 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-08-23 15:44:52,721 INFO    SenderThread:10032 [job_builder.py:build():296] Attempting to build job artifact
+2024-08-23 15:44:52,722 INFO    SenderThread:10032 [job_builder.py:_get_source_type():426] is repo sourced job
+2024-08-23 15:44:52,730 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-23 15:44:52,737 INFO    SenderThread:10032 [job_builder.py:build():402] adding wandb-job metadata file
+2024-08-23 15:44:52,746 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 9
+2024-08-23 15:44:52,747 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
+2024-08-23 15:44:52,747 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:52,747 DEBUG   SenderThread:10032 [sender.py:send():382] send: artifact
+2024-08-23 15:44:52,747 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 9
+2024-08-23 15:44:53,393 INFO    Thread-12 :10032 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
+2024-08-23 15:44:53,730 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-23 15:44:54,153 INFO    wandb-upload_1:10032 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmp_o6jbw71
+2024-08-23 15:44:54,878 INFO    wandb-upload_0:10032 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpdgbh2byi
+2024-08-23 15:44:55,934 INFO    SenderThread:10032 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk3MTc1OA==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': None}}
+2024-08-23 15:44:55,934 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:55,934 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: status_report
+2024-08-23 15:44:55,934 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-08-23 15:44:55,934 INFO    SenderThread:10032 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-08-23 15:44:56,394 INFO    SenderThread:10032 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_154448-v9m85jnt/files
+2024-08-23 15:44:56,395 INFO    SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt requirements.txt
+2024-08-23 15:44:56,395 INFO    SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml config.yaml
+2024-08-23 15:44:56,396 INFO    SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-metadata.json wandb-metadata.json
+2024-08-23 15:44:56,396 INFO    SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json wandb-summary.json
+2024-08-23 15:44:56,398 INFO    SenderThread:10032 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_154448-v9m85jnt/files/output.log output.log
+2024-08-23 15:44:56,399 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 10
+2024-08-23 15:44:56,399 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
+2024-08-23 15:44:56,399 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:56,401 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 10
+2024-08-23 15:44:56,401 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:56,401 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-08-23 15:44:56,401 INFO    SenderThread:10032 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-23 15:44:56,731 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-23 15:44:56,731 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
+2024-08-23 15:44:56,790 INFO    wandb-upload_1:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/requirements.txt
+2024-08-23 15:44:56,818 INFO    wandb-upload_0:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/config.yaml
+2024-08-23 15:44:56,848 INFO    wandb-upload_2:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/wandb-summary.json
+2024-08-23 15:44:56,865 INFO    wandb-upload_3:10032 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_154448-v9m85jnt/files/output.log
+2024-08-23 15:44:57,065 INFO    Thread-11 (_thread_body):10032 [sender.py:transition_state():617] send defer: 11
+2024-08-23 15:44:57,065 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:57,065 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 11
+2024-08-23 15:44:57,065 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:57,065 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-08-23 15:44:57,065 INFO    SenderThread:10032 [file_pusher.py:join():178] waiting for file pusher
+2024-08-23 15:44:57,066 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 12
+2024-08-23 15:44:57,066 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:57,066 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 12
+2024-08-23 15:44:57,066 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:57,066 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-08-23 15:44:57,066 INFO    SenderThread:10032 [file_stream.py:finish():595] file stream finish called
+2024-08-23 15:44:57,271 INFO    SenderThread:10032 [file_stream.py:finish():599] file stream finish is done
+2024-08-23 15:44:57,271 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 13
+2024-08-23 15:44:57,271 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:57,271 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 13
+2024-08-23 15:44:57,271 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:57,271 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-08-23 15:44:57,271 INFO    SenderThread:10032 [sender.py:transition_state():617] send defer: 14
+2024-08-23 15:44:57,271 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: defer
+2024-08-23 15:44:57,271 DEBUG   SenderThread:10032 [sender.py:send():382] send: final
+2024-08-23 15:44:57,271 INFO    HandlerThread:10032 [handler.py:handle_request_defer():172] handle defer: 14
+2024-08-23 15:44:57,271 DEBUG   SenderThread:10032 [sender.py:send():382] send: footer
+2024-08-23 15:44:57,272 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: defer
+2024-08-23 15:44:57,272 INFO    SenderThread:10032 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-08-23 15:44:57,272 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-23 15:44:57,272 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: poll_exit
+2024-08-23 15:44:57,272 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: server_info
+2024-08-23 15:44:57,273 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: get_summary
+2024-08-23 15:44:57,273 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: sampled_history
+2024-08-23 15:44:57,273 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
+2024-08-23 15:44:57,273 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: internal_messages
+2024-08-23 15:44:57,273 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: poll_exit
+2024-08-23 15:44:57,274 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: server_info
+2024-08-23 15:44:57,275 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: job_info
+2024-08-23 15:44:57,441 DEBUG   SenderThread:10032 [sender.py:send_request():409] send_request: job_info
+2024-08-23 15:44:57,441 INFO    MainThread:10032 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+2024-08-23 15:44:57,441 INFO    MainThread:10032 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+2024-08-23 15:44:57,441 INFO    MainThread:10032 [wandb_run.py:_footer_sync_info():3825] logging synced files
+2024-08-23 15:44:57,441 DEBUG   HandlerThread:10032 [handler.py:handle_request():146] handle_request: shutdown
+2024-08-23 15:44:57,441 INFO    HandlerThread:10032 [handler.py:finish():869] shutting down handler
+2024-08-23 15:44:58,275 INFO    WriterThread:10032 [datastore.py:close():296] close: /project/wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb
+2024-08-23 15:44:58,441 INFO    SenderThread:10032 [sender.py:finish():1572] shutting down sender
+2024-08-23 15:44:58,441 INFO    SenderThread:10032 [file_pusher.py:finish():172] shutting down file pusher
+2024-08-23 15:44:58,441 INFO    SenderThread:10032 [file_pusher.py:join():178] waiting for file pusher

wandb/run-20240823_154448-v9m85jnt/logs/debug.log ADDED Viewed

	@@ -0,0 +1,28 @@

+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Configure stats pid to 9961
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_154448-v9m85jnt/logs/debug.log
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_154448-v9m85jnt/logs/debug-internal.log
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_init.py:init():566] calling init triggers
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 2048, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-15:44:18', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
+2024-08-23 15:44:48,884 INFO    MainThread:9961 [wandb_init.py:init():616] starting backend
+2024-08-23 15:44:48,885 INFO    MainThread:9961 [wandb_init.py:init():620] setting up manager
+2024-08-23 15:44:48,889 INFO    MainThread:9961 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-23 15:44:48,891 INFO    MainThread:9961 [wandb_init.py:init():628] backend started and connected
+2024-08-23 15:44:48,896 INFO    MainThread:9961 [wandb_init.py:init():720] updated telemetry
+2024-08-23 15:44:48,909 INFO    MainThread:9961 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+2024-08-23 15:44:49,395 INFO    MainThread:9961 [wandb_run.py:_on_init():2262] communicating current version
+2024-08-23 15:44:49,418 INFO    MainThread:9961 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-08-23 15:44:49,418 INFO    MainThread:9961 [wandb_init.py:init():804] starting run threads in backend
+2024-08-23 15:44:49,539 INFO    MainThread:9961 [wandb_run.py:_console_start():2241] atexit reg
+2024-08-23 15:44:49,539 INFO    MainThread:9961 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+2024-08-23 15:44:49,539 INFO    MainThread:9961 [wandb_run.py:_redirect():2161] Wrapping output streams.
+2024-08-23 15:44:49,539 INFO    MainThread:9961 [wandb_run.py:_redirect():2186] Redirects installed.
+2024-08-23 15:44:49,540 INFO    MainThread:9961 [wandb_init.py:init():847] run started, returning control to user process
+2024-08-23 15:44:58,442 WARNING MsgRouterThr:9961 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240823_154448-v9m85jnt/run-v9m85jnt.wandb ADDED Viewed

Binary file (8.01 kB). View file