diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..18490970cdb55ea7c9b6d683513bd222d1e5e295 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +global_step_0/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/global_step_0/README.md b/global_step_0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7711c5fdbc3259b38a56eb7bb022bd66baa0142c --- /dev/null +++ b/global_step_0/README.md @@ -0,0 +1,60 @@ +--- +library_name: transformers +license: other +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: think_sft_nopack_lr1.5e5_ep3 + results: [] +--- + + + +# think_sft_nopack_lr1.5e5_ep3 + +This model is a fine-tuned version of a custom Llama 3B model pretrained on 52B tokens on the open_thoughts_43k_think_format dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1.5e-05 +- train_batch_size: 2 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 32 +- total_train_batch_size: 256 +- total_eval_batch_size: 32 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 3.0 + +### Training results + + + +### Framework versions + +- Transformers 4.57.1 +- Pytorch 2.6.0+cu124 +- Datasets 4.0.0 +- Tokenizers 0.22.1 diff --git a/global_step_0/all_results.json b/global_step_0/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..361e4e132f240d363d25c252389f9999a100c486 --- /dev/null +++ b/global_step_0/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 1.1980638081930756e+19, + "train_loss": 0.49363853406255476, + "train_runtime": 40041.2675, + "train_samples_per_second": 3.261, + "train_steps_per_second": 0.013 +} \ No newline at end of file diff --git a/global_step_0/chat_template.jinja b/global_step_0/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..2413626ec1ea4485c29369803d71586d2a5ed64b --- /dev/null +++ b/global_step_0/chat_template.jinja @@ -0,0 +1,21 @@ +{{- bos_token }} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content'] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if system_message %} +{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }} +{%- endif %} +{%- for message in loop_messages %} + {%- if message['role'] == 'user' %} +{{- '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }} + {%- elif message['role'] == 'assistant' %} +{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} +{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/global_step_0/config.json b/global_step_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..44616e007e75e54816a9f0899ea06b59549777bf --- /dev/null +++ b/global_step_0/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 128001, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/global_step_0/generation_config.json b/global_step_0/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28e1ef5d319e1b40e14cae21d3ec6ee69cc033e8 --- /dev/null +++ b/global_step_0/generation_config.json @@ -0,0 +1,13 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001 + ], + "pad_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.1" +} diff --git a/global_step_0/logs/sft_train_20260305_150038.log b/global_step_0/logs/sft_train_20260305_150038.log new file mode 100644 index 0000000000000000000000000000000000000000..797680cf615a2eda7d5294c18694c6d678edd4c8 --- /dev/null +++ b/global_step_0/logs/sft_train_20260305_150038.log @@ -0,0 +1,1176 @@ +[INFO|2026-03-05 15:00:44] llamafactory.launcher:143 >> Initializing 4 distributed tasks at: 127.0.0.1:53151 +W0305 15:00:45.406000 1741551 site-packages/torch/distributed/run.py:792] +W0305 15:00:45.406000 1741551 site-packages/torch/distributed/run.py:792] ***************************************** +W0305 15:00:45.406000 1741551 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0305 15:00:45.406000 1741551 site-packages/torch/distributed/run.py:792] ***************************************** +[2026-03-05 15:00:54,415] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2026-03-05 15:00:56,464] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2026-03-05 15:00:56,464] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2026-03-05 15:00:56,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/home/salman/anaconda3/envs/reward-signal/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + import pkg_resources +[2026-03-05 15:00:57,141] [INFO] [comm.py:669:init_distributed] cdb=None +[INFO|2026-03-05 15:00:58] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:3, distributed training: True, compute dtype: torch.bfloat16 +/home/salman/anaconda3/envs/reward-signal/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + import pkg_resources +/home/salman/anaconda3/envs/reward-signal/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + import pkg_resources +[2026-03-05 15:00:58,934] [INFO] [comm.py:669:init_distributed] cdb=None +[2026-03-05 15:00:58,996] [INFO] [comm.py:669:init_distributed] cdb=None +[2026-03-05 15:00:58,996] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +/home/salman/anaconda3/envs/reward-signal/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + import pkg_resources +[2026-03-05 15:00:59,183] [INFO] [comm.py:669:init_distributed] cdb=None +[rank3]:[W305 15:00:59.155040498 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[INFO|2026-03-05 15:01:01] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.bfloat16 +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:01,764 >> loading file tokenizer.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:01,764 >> loading file tokenizer.model +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:01,764 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:01,764 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:01,764 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:01,764 >> loading file chat_template.jinja +[INFO|tokenization_utils_base.py:2364] 2026-03-05 15:01:02,134 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|configuration_utils.py:763] 2026-03-05 15:01:02,134 >> loading configuration file /local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b/config.json +[INFO|configuration_utils.py:839] 2026-03-05 15:01:02,136 >> Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 128001, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "vocab_size": 128256 +} + +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:02,137 >> loading file tokenizer.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:02,137 >> loading file tokenizer.model +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:02,137 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:02,137 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:02,137 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2093] 2026-03-05 15:01:02,137 >> loading file chat_template.jinja +[INFO|tokenization_utils_base.py:2364] 2026-03-05 15:01:02,481 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|2026-03-05 15:01:02] llamafactory.data.template:143 >> Replace eos token: <|eot_id|>. +[INFO|2026-03-05 15:01:02] llamafactory.data.template:143 >> Add <|eom_id|> to stop words. +[INFO|2026-03-05 15:01:02] llamafactory.data.loader:143 >> Loading dataset /home/salman/reward-signal-analysis/data/open_thought_sft_data/think_format/open-thoughts114k_math_think_format.jsonl... + Converting format of dataset (num_proc=16): 0%| | 0/43525 [00:00> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.bfloat16 + Converting format of dataset (num_proc=16): 1%| | 336/43525 [00:00<00:47, 905.93 examples/s] Converting format of dataset (num_proc=16): 24%|██▍ | 10554/43525 [00:00<00:01, 25617.28 examples/s] Converting format of dataset (num_proc=16): 46%|████▌ | 19979/43525 [00:00<00:00, 43413.40 examples/s] Converting format of dataset (num_proc=16): 72%|███████▏ | 31463/43525 [00:00<00:00, 38741.76 examples/s][INFO|2026-03-05 15:01:03] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:2, distributed training: True, compute dtype: torch.bfloat16 +[rank1]:[W305 15:01:03.435588757 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. + Converting format of dataset (num_proc=16): 85%|████████▍ | 36924/43525 [00:01<00:00, 20118.93 examples/s] Converting format of dataset (num_proc=16): 93%|█████████▎| 40644/43525 [00:01<00:00, 18445.98 examples/s] Converting format of dataset (num_proc=16): 100%|██████████| 43525/43525 [00:02<00:00, 18189.21 examples/s] +[rank2]:[W305 15:01:05.313540243 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank0]:[W305 15:01:06.696130699 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +NCCL version 2.21.5+cuda12.4 + Running tokenizer on dataset (num_proc=16): 0%| | 0/43525 [00:00<|start_header_id|>system<|end_header_id|> + +Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, first think step by step inside ... tags. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion. Now, try to solve the following question through the above guidelines: Please enclose your final answer in the box: \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|> + +Given real numbers \( a, b, c \) and a positive number \( \lambda \) such that the polynomial \( f(x) = x^3 + a x^2 + b x + c \) has three real roots \( x_1, x_2, x_3 \), and the conditions \( x_2 - x_1 = \lambda \) and \( x_3 > \frac{1}{2}(x_1 + x_2) \) are satisfied, find the maximum value of \( \frac{2 a^3 + 27 c - 9 a b}{\lambda^3} \).<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + +Okay, let's try to tackle this problem step by step. The problem is about finding the maximum value of the expression (2a³ + 27c - 9ab)/λ³ given some conditions on the roots of the cubic polynomial f(x) = x³ + ax² + bx + c. The roots are real, with x₂ - x₁ = λ and x₃ > (x₁ + x₂)/2. + +First, I need to recall that for a cubic polynomial with roots x₁, x₂, x₃, the coefficients can be expressed using Vieta's formulas. So, let's write those down. + +Vieta's formulas for f(x) = x³ + ax² + bx + c: +1. Sum of roots: x₁ + x₂ + x₃ = -a +2. Sum of products: x₁x₂ + x₁x₃ + x₂x₃ = b +3. Product of roots: x₁x₂x₃ = -c + +Our target expression is (2a³ + 27c - 9ab)/λ³. Let's see if we can express this in terms of the roots using Vieta's formulas. + +First, let's substitute a, b, c in terms of the roots. + +From Vieta: +a = -(x₁ + x₂ + x₃) +b = x₁x₂ + x₁x₃ + x₂x₃ +c = -x₁x₂x₃ + +Plugging these into the expression: + +2a³ + 27c - 9ab += 2(-(x₁ + x₂ + x₃))³ + 27(-x₁x₂x₃) - 9(-(x₁ + x₂ + x₃))(x₁x₂ + x₁x₃ + x₂x₃) + +Let me compute each term step by step. + +First term: 2(-(x₁ + x₂ + x₃))³ = -2(x₁ + x₂ + x₃)³ + +Second term: 27(-x₁x₂x₃) = -27x₁x₂x₃ + +Third term: -9(-(x₁ + x₂ + x₃))(x₁x₂ + x₁x₃ + x₂x₃) = 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +So combining all three: + +-2(x₁ + x₂ + x₃)³ -27x₁x₂x₃ + 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +Hmm, this seems complex. Maybe there is a known identity that relates these terms? Let me think. + +Recall that for a cubic polynomial, the discriminant D is given by D = 18abcd - 4b³d + b²c² - 4ac³ - 27a²d² for ax³ + bx² + cx + d. But in our case, the polynomial is monic (leading coefficient 1), so a=1, but here the coefficients are a, b, c as given. Wait, actually, in the standard form, the discriminant for x³ + a x² + b x + c would be D = 18abc - 4a³c + a²b² - 4b³ - 27c². But I might be mixing up the formula. Let me verify: + +Wait, the discriminant of a cubic x³ + px² + qx + r is given by: + +Δ = 18pqr - 4p³r + p²q² - 4q³ - 27r² + +Yes, so in our case, with p = a, q = b, r = c, so Δ = 18abc - 4a³c + a²b² - 4b³ - 27c² + +But I'm not sure if the discriminant is directly related here, but maybe. Since the polynomial has three real roots, the discriminant must be non-negative. However, the problem states that all roots are real, so Δ ≥ 0. But maybe the expression we're dealing with is related to the discriminant? + +Wait, let's check the expression given: 2a³ + 27c - 9ab. If we compare with the discriminant formula: + +Δ = 18abc - 4a³c + a²b² - 4b³ - 27c² + +Not directly obvious. Maybe not. Let's try another approach. + +Alternatively, perhaps the expression (2a³ + 27c - 9ab) can be rewritten in terms of the roots. Let's try substituting the Vieta expressions into it. + +So let's substitute a, b, c: + +2(-(x₁ + x₂ + x₃))³ + 27(-x₁x₂x₃) - 9(-(x₁ + x₂ + x₃))(x₁x₂ + x₁x₃ + x₂x₃) + +Compute each term: + +First term: 2*(-1)^3*(x₁ + x₂ + x₃)^3 = -2(x₁ + x₂ + x₃)^3 + +Second term: 27*(-x₁x₂x₃) = -27x₁x₂x₃ + +Third term: -9*(-1)*(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) = 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +So putting it all together: + +-2(x₁ + x₂ + x₃)^3 - 27x₁x₂x₃ + 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +Hmm. Let's see if we can factor this or simplify it. Let's denote S = x₁ + x₂ + x₃, P = x₁x₂ + x₁x₃ + x₂x₃, Q = x₁x₂x₃. Then our expression becomes: + +-2S³ -27Q + 9S P + +But for a cubic polynomial, the relationship between S, P, Q is given by Vieta's formulas. But perhaps we can relate this expression to something else. + +Alternatively, maybe using symmetric sums. Let's compute this expression for specific roots. Let's suppose that x₁, x₂, x₃ are variables with x₂ - x₁ = λ and x₃ > (x₁ + x₂)/2. Maybe we can parametrize the roots in terms of variables that capture the given conditions. + +Given that x₂ - x₁ = λ, let's set x₁ = t - λ/2 and x₂ = t + λ/2 for some t. Then the midpoint of x₁ and x₂ is t, and the condition x₃ > (x₁ + x₂)/2 becomes x₃ > t. + +Therefore, let me set: + +x₁ = t - λ/2 + +x₂ = t + λ/2 + +x₃ = t + s, where s > 0 (since x₃ > t) + +So now, our roots are expressed in terms of t, λ, and s > 0. + +Now, let's compute S, P, Q in terms of t, λ, s. + +First, S = x₁ + x₂ + x₃ = (t - λ/2) + (t + λ/2) + (t + s) = 3t + s + +Second, P = x₁x₂ + x₁x₃ + x₂x₃ + +Compute each term: + +x₁x₂ = (t - λ/2)(t + λ/2) = t² - (λ/2)² = t² - λ²/4 + +x₁x₃ = (t - λ/2)(t + s) = t(t + s) - (λ/2)(t + s) = t² + ts - (λ t)/2 - (λ s)/2 + +x₂x₃ = (t + λ/2)(t + s) = t(t + s) + (λ/2)(t + s) = t² + ts + (λ t)/2 + (λ s)/2 + +Adding these together: + +P = [t² - λ²/4] + [t² + ts - (λ t)/2 - (λ s)/2] + [t² + ts + (λ t)/2 + (λ s)/2] + +Let's combine terms: + +First term: t² - λ²/4 + +Second term: t² + ts - (λ t)/2 - (λ s)/2 + +Third term: t² + ts + (λ t)/2 + (λ s)/2 + +Adding them: + +t² - λ²/4 + t² + ts - (λ t)/2 - (λ s)/2 + t² + ts + (λ t)/2 + (λ s)/2 + +Combine like terms: + +t² + t² + t² = 3t² + +ts + ts = 2ts + +-λ²/4 + +For the terms with λ t/2: - (λ t)/2 + (λ t)/2 = 0 + +Similarly, for λ s/2: - (λ s)/2 + (λ s)/2 = 0 + +So P = 3t² + 2ts - λ²/4 + +Now Q = x₁x₂x₃ = (t - λ/2)(t + λ/2)(t + s) = [t² - (λ/2)^2](t + s) = (t² - λ²/4)(t + s) + +Multiply this out: + += t³ + t² s - (λ²/4) t - (λ²/4) s + +Now, let's plug S, P, Q into the expression: + +-2S³ -27Q + 9S P + +First, compute S³: + +S = 3t + s + +S³ = (3t + s)^3 = 27t³ + 27t² s + 9t s² + s³ + +Multiply by -2: -2*27t³ -2*27t² s -2*9t s² -2*s³ = -54t³ -54t² s -18t s² -2s³ + +Next, compute -27Q: + +Q = t³ + t² s - (λ²/4) t - (λ²/4) s + +Multiply by -27: -27t³ -27t² s + (27λ²/4) t + (27λ²/4)s + +Third term: 9S P + +S = 3t + s + +P = 3t² + 2ts - λ²/4 + +So 9S P = 9*(3t + s)*(3t² + 2ts - λ²/4) + +Let's expand this product step by step. + +First, multiply (3t + s) with (3t² + 2ts - λ²/4): + += 3t*(3t²) + 3t*(2ts) + 3t*(-λ²/4) + s*(3t²) + s*(2ts) + s*(-λ²/4) + += 9t³ + 6t² s - (3t λ²)/4 + 3s t² + 2t s² - (s λ²)/4 + +Combine like terms: + +9t³ + (6t² s + 3t² s) + (2t s²) + (-3t λ²/4 - s λ²/4) + += 9t³ + 9t² s + 2t s² - (λ²/4)(3t + s) + +Multiply this by 9: + +9*(9t³ + 9t² s + 2t s² - (λ²/4)(3t + s)) = 81t³ + 81t² s + 18t s² - (9λ²/4)(3t + s) + +Now, combining all three parts: + +First part: -54t³ -54t² s -18t s² -2s³ + +Second part: -27t³ -27t² s + (27λ²/4) t + (27λ²/4)s + +Third part: 81t³ + 81t² s + 18t s² - (9λ²/4)(3t + s) + +Let's add them term by term. + +For t³ terms: + +-54t³ -27t³ +81t³ = 0 + +For t² s terms: + +-54t² s -27t² s +81t² s = 0 + +For t s² terms: + +-18t s² +18t s² = 0 + +For s³ terms: + +-2s³ (from first part) +... (second part has no s³ term, third part also none) so total -2s³ + +Now the terms involving λ²: + +From second part: (27λ²/4) t + (27λ²/4)s + +From third part: - (9λ²/4)(3t + s) = -27λ²/4 t -9λ²/4 s + +Adding these together: + +(27λ²/4 t -27λ²/4 t) + (27λ²/4 s -9λ²/4 s) = 0 + (18λ²/4)s = (9λ²/2)s + +So overall, combining all terms: + +-2s³ + (9λ²/2)s + +So the entire expression simplifies to: + +-2s³ + (9λ²/2)s + +Therefore, the original expression (2a³ + 27c -9ab) equals -2s³ + (9λ²/2)s + +Therefore, the expression we need to maximize is: + +(2a³ + 27c -9ab)/λ³ = (-2s³ + (9λ²/2)s)/λ³ = (-2s³)/λ³ + (9λ²/2 s)/λ³ = -2(s/λ)^3 + (9/2)(s/λ) + +Let me set k = s/λ. Since s > 0 and λ > 0, k > 0. Then our expression becomes: + +-2k³ + (9/2)k + +So now, the problem reduces to finding the maximum value of the function g(k) = -2k³ + (9/2)k for k > 0. + +This is a calculus problem. To find the maximum, take the derivative of g(k) with respect to k, set it to zero, and solve for k. + +Compute g'(k): + +g'(k) = -6k² + 9/2 + +Set g'(k) = 0: + +-6k² + 9/2 = 0 + +=> 6k² = 9/2 + +=> k² = (9/2)/6 = 9/12 = 3/4 + +=> k = sqrt(3/4) = sqrt(3)/2 ≈ 0.866 + +Since k > 0, we take the positive root. Now, check if this is a maximum by second derivative test. + +Second derivative: + +g''(k) = -12k + +At k = sqrt(3)/2, g''(k) = -12*(sqrt(3)/2) = -6sqrt(3) < 0, which means it's a local maximum. + +Therefore, the maximum value of g(k) is achieved at k = sqrt(3)/2. + +Compute g(sqrt(3)/2): + +g(sqrt(3)/2) = -2*(sqrt(3)/2)^3 + (9/2)*(sqrt(3)/2) + +Compute each term: + +First term: -2*( (3^(1/2))/2 )^3 = -2*( (3^(3/2))/8 ) = -2*(3√3)/8 = - (3√3)/4 + +Second term: (9/2)*(sqrt(3)/2) = (9√3)/4 + +Adding both terms: + +- (3√3)/4 + (9√3)/4 = (6√3)/4 = (3√3)/2 + +Therefore, the maximum value of g(k) is (3√3)/2. + +Hence, the maximum value of (2a³ + 27c -9ab)/λ³ is (3√3)/2. + +But wait, let me check if there are any constraints on k. Remember that we had s = kλ. But we have to ensure that the roots are real and satisfy x₃ > (x₁ + x₂)/2. However, in our parametrization, we set x₃ = t + s, and since s > 0, x₃ > t = (x₁ + x₂)/2, which is already satisfied. So as long as s > 0, the condition holds. But since k = s/λ > 0, and λ > 0, s > 0 is equivalent to k > 0, which we already considered. Therefore, the maximum is indeed (3√3)/2. + +But let me verify this result by considering specific cases. For example, take λ = 1. Then the expression (2a³ +27c -9ab)/1³ should be maximized at 3√3/2. Let's check with k = sqrt(3)/2, so s = sqrt(3)/2. + +Then the roots are x₁ = t - 1/2, x₂ = t + 1/2, x₃ = t + sqrt(3)/2. Then S = 3t + sqrt(3)/2, P = 3t² + 2t*(sqrt(3)/2) - 1/4 = 3t² + t*sqrt(3) - 1/4, Q = (t² - 1/4)(t + sqrt(3)/2). Then compute a, b, c from Vieta's formulas. Then compute 2a³ +27c -9ab. It might be tedious, but perhaps we can check. + +Alternatively, note that our derivation led us to an expression dependent only on k, which is s/λ, and through substitution, we found the maximum. Therefore, the maximum value is (3√3)/2. However, let me recall that the problem states x₃ > (x₁ + x₂)/2, which in our parametrization is s > 0. Therefore, as long as s is positive, which it is, we are okay. So there are no further constraints, and the maximum is indeed (3√3)/2. + +Wait, but just to be thorough, let's see if when k = sqrt(3)/2, the cubic polynomial indeed has three real roots. Since we are given that all roots are real, but when we parametrize, we need to ensure that the cubic with these roots is valid. However, since we are given that all roots are real and satisfy the given conditions, our parametrization already takes that into account. Therefore, as long as s > 0, the roots are real and satisfy the required conditions. Therefore, the maximum value is indeed (3√3)/2. + +Thus, the maximum value of (2a³ +27c -9ab)/λ³ is (3√3)/2. + +**Final Answer** +\boxed{\dfrac{3\sqrt{3}}{2}} + + +Given real numbers \(a, b, c\) and a positive number \(\lambda\) such that the polynomial \(f(x) = x^3 + ax^2 + bx + c\) has three real roots \(x_1, x_2, x_3\) with the conditions \(x_2 - x_1 = \lambda\) and \(x_3 > \frac{1}{2}(x_1 + x_2)\), we need to find the maximum value of \(\frac{2a^3 + 27c - 9ab}{\lambda^3}\). + +Using Vieta's formulas for the polynomial \(f(x)\): +1. \(x_1 + x_2 + x_3 = -a\) +2. \(x_1x_2 + x_1x_3 + x_2x_3 = b\) +3. \(x_1x_2x_3 = -c\) + +We parametrize the roots as follows: +- \(x_1 = t - \frac{\lambda}{2}\) +- \(x_2 = t + \frac{\lambda}{2}\) +- \(x_3 = t + s\) where \(s > 0\) + +Substituting these into Vieta's formulas, we get: +- \(S = x_1 + x_2 + x_3 = 3t + s\) +- \(P = x_1x_2 + x_1x_3 + x_2x_3 = 3t^2 + 2ts - \frac{\lambda^2}{4}\) +- \(Q = x_1x_2x_3 = (t^2 - \frac{\lambda^2}{4})(t + s)\) + +Substituting \(a = -S\), \(b = P\), and \(c = -Q\) into the expression \(2a^3 + 27c - 9ab\), we simplify it to: +\[ +-2s^3 + \frac{9}{2}\lambda^2 s +\] + +Introducing \(k = \frac{s}{\lambda}\), the expression becomes: +\[ +\frac{-2s^3 + \frac{9}{2}\lambda^2 s}{\lambda^3} = -2k^3 + \frac{9}{2}k +\] + +To maximize \(-2k^3 + \frac{9}{2}k\), we take the derivative and set it to zero: +\[ +g'(k) = -6k^2 + \frac{9}{2} = 0 \implies k^2 = \frac{3}{4} \implies k = \frac{\sqrt{3}}{2} +\] + +Checking the second derivative confirms a maximum. Substituting \(k = \frac{\sqrt{3}}{2}\) back into the expression gives: +\[ +-2\left(\frac{\sqrt{3}}{2}\right)^3 + \frac{9}{2}\left(\frac{\sqrt{3}}{2}\right) = \frac{3\sqrt{3}}{2} +\] + +Thus, the maximum value of \(\frac{2a^3 + 27c - 9ab}{\lambda^3}\) is \(\boxed{\dfrac{3\sqrt{3}}{2}}\). + +The final answer is \boxed{\dfrac{3\sqrt{3}}{2}}.<|eot_id|> +label_ids: +[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 14023, 771, 397, 33413, 11, 1095, 596, 1456, 311, 22118, 420, 3575, 3094, 555, 3094, 13, 578, 3575, 374, 922, 9455, 279, 7340, 907, 315, 279, 7645, 320, 17, 64, 44301, 489, 220, 1544, 66, 482, 220, 24, 370, 5738, 34586, 44301, 2728, 1063, 4787, 389, 279, 20282, 315, 279, 41999, 48411, 282, 2120, 8, 284, 865, 44301, 489, 3944, 30556, 489, 45341, 489, 272, 13, 578, 20282, 527, 1972, 11, 449, 865, 32907, 482, 865, 32086, 284, 49438, 323, 865, 121412, 871, 320, 87, 32086, 489, 865, 32907, 5738, 17, 13, 4815, 5451, 11, 358, 1205, 311, 19635, 430, 369, 264, 41999, 48411, 449, 20282, 865, 32086, 11, 865, 32907, 11, 865, 121412, 11, 279, 37929, 649, 387, 13605, 1701, 11655, 1955, 596, 55179, 13, 2100, 11, 1095, 596, 3350, 1884, 1523, 13, 4815, 36644, 1955, 596, 55179, 369, 282, 2120, 8, 284, 865, 44301, 489, 3944, 30556, 489, 45341, 489, 272, 512, 16, 13, 8279, 315, 20282, 25, 865, 32086, 489, 865, 32907, 489, 865, 121412, 284, 482, 64, 198, 17, 13, 8279, 315, 3956, 25, 865, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 284, 293, 198, 18, 13, 5761, 315, 20282, 25, 865, 32086, 87, 32907, 87, 121412, 284, 482, 66, 271, 8140, 2218, 7645, 374, 320, 17, 64, 44301, 489, 220, 1544, 66, 482, 220, 24, 370, 5738, 34586, 44301, 13, 6914, 596, 1518, 422, 584, 649, 3237, 420, 304, 3878, 315, 279, 20282, 1701, 11655, 1955, 596, 55179, 13, 4815, 5451, 11, 1095, 596, 28779, 264, 11, 293, 11, 272, 304, 3878, 315, 279, 20282, 13, 4815, 3915, 11655, 1955, 512, 64, 284, 29506, 87, 32086, 489, 865, 32907, 489, 865, 121412, 340, 65, 284, 865, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 198, 66, 284, 482, 87, 32086, 87, 32907, 87, 121412, 271, 2169, 36368, 1521, 1139, 279, 7645, 1473, 17, 64, 44301, 489, 220, 1544, 66, 482, 220, 24, 370, 720, 28, 220, 17, 66767, 87, 32086, 489, 865, 32907, 489, 865, 121412, 595, 44301, 489, 220, 1544, 4172, 87, 32086, 87, 32907, 87, 121412, 8, 482, 220, 24, 66767, 87, 32086, 489, 865, 32907, 489, 865, 121412, 29254, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 696, 10267, 757, 12849, 1855, 4751, 3094, 555, 3094, 382, 5451, 4751, 25, 220, 17, 66767, 87, 32086, 489, 865, 32907, 489, 865, 121412, 595, 44301, 284, 482, 17, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 8, 44301, 271, 16041, 4751, 25, 220, 1544, 4172, 87, 32086, 87, 32907, 87, 121412, 8, 284, 482, 1544, 87, 32086, 87, 32907, 87, 121412, 271, 38075, 4751, 25, 482, 24, 66767, 87, 32086, 489, 865, 32907, 489, 865, 121412, 29254, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 8, 284, 220, 24, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 2432, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 696, 4516, 35271, 682, 2380, 1473, 12, 17, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 8, 44301, 482, 1544, 87, 32086, 87, 32907, 87, 121412, 489, 220, 24, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 2432, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 696, 81122, 11, 420, 5084, 6485, 13, 10926, 1070, 374, 264, 3967, 9764, 430, 36716, 1521, 3878, 30, 6914, 757, 1781, 13, 4815, 3905, 543, 430, 369, 264, 41999, 48411, 11, 279, 50419, 519, 423, 374, 2728, 555, 423, 284, 220, 972, 69744, 482, 220, 19, 65, 44301, 67, 489, 293, 30556, 66, 30556, 482, 220, 19, 582, 44301, 482, 220, 1544, 64, 30556, 67, 30556, 369, 3944, 44301, 489, 45341, 30556, 489, 21375, 489, 294, 13, 2030, 304, 1057, 1162, 11, 279, 48411, 374, 1647, 292, 320, 21307, 36706, 220, 16, 705, 779, 264, 28, 16, 11, 719, 1618, 279, 37929, 527, 264, 11, 293, 11, 272, 439, 2728, 13, 14144, 11, 3604, 11, 304, 279, 5410, 1376, 11, 279, 50419, 519, 369, 865, 44301, 489, 264, 865, 30556, 489, 293, 865, 489, 272, 1053, 387, 423, 284, 220, 972, 13997, 482, 220, 19, 64, 44301, 66, 489, 264, 30556, 65, 30556, 482, 220, 19, 65, 44301, 482, 220, 1544, 66, 30556, 13, 2030, 358, 2643, 387, 27890, 709, 279, 15150, 13, 6914, 757, 10356, 1473, 14524, 11, 279, 50419, 519, 315, 264, 41999, 865, 44301, 489, 17585, 30556, 489, 97130, 489, 436, 374, 2728, 555, 1473, 101561, 284, 220, 972, 79, 23866, 482, 220, 19, 79, 44301, 81, 489, 281, 30556, 80, 30556, 482, 220, 19, 80, 44301, 482, 220, 1544, 81, 30556, 271, 9642, 11, 779, 304, 1057, 1162, 11, 449, 281, 284, 264, 11, 2874, 284, 293, 11, 436, 284, 272, 11, 779, 82263, 284, 220, 972, 13997, 482, 220, 19, 64, 44301, 66, 489, 264, 30556, 65, 30556, 482, 220, 19, 65, 44301, 482, 220, 1544, 66, 30556, 271, 4071, 358, 2846, 539, 2771, 422, 279, 50419, 519, 374, 6089, 5552, 1618, 11, 719, 7344, 13, 8876, 279, 48411, 706, 2380, 1972, 20282, 11, 279, 50419, 519, 2011, 387, 2536, 62035, 13, 4452, 11, 279, 3575, 5415, 430, 682, 20282, 527, 1972, 11, 779, 82263, 63247, 220, 15, 13, 2030, 7344, 279, 7645, 584, 2351, 14892, 449, 374, 5552, 311, 279, 50419, 519, 1980, 14524, 11, 1095, 596, 1817, 279, 7645, 2728, 25, 220, 17, 64, 44301, 489, 220, 1544, 66, 482, 220, 24, 370, 13, 1442, 584, 9616, 449, 279, 50419, 519, 15150, 1473, 101561, 284, 220, 972, 13997, 482, 220, 19, 64, 44301, 66, 489, 264, 30556, 65, 30556, 482, 220, 19, 65, 44301, 482, 220, 1544, 66, 30556, 271, 2688, 6089, 8196, 13, 10926, 539, 13, 6914, 596, 1456, 2500, 5603, 382, 93114, 11, 8530, 279, 7645, 320, 17, 64, 44301, 489, 220, 1544, 66, 482, 220, 24, 370, 8, 649, 387, 59624, 304, 3878, 315, 279, 20282, 13, 6914, 596, 1456, 32434, 10831, 279, 11655, 1955, 24282, 1139, 433, 382, 4516, 1095, 596, 28779, 264, 11, 293, 11, 272, 1473, 17, 66767, 87, 32086, 489, 865, 32907, 489, 865, 121412, 595, 44301, 489, 220, 1544, 4172, 87, 32086, 87, 32907, 87, 121412, 8, 482, 220, 24, 66767, 87, 32086, 489, 865, 32907, 489, 865, 121412, 29254, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 696, 47354, 1855, 4751, 1473, 5451, 4751, 25, 220, 17, 35399, 16, 30876, 18, 6737, 87, 32086, 489, 865, 32907, 489, 865, 121412, 30876, 18, 284, 482, 17, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 30876, 18, 271, 16041, 4751, 25, 220, 1544, 35399, 87, 32086, 87, 32907, 87, 121412, 8, 284, 482, 1544, 87, 32086, 87, 32907, 87, 121412, 271, 38075, 4751, 25, 482, 24, 35399, 16, 18201, 87, 32086, 489, 865, 32907, 489, 865, 121412, 2432, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 8, 284, 220, 24, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 2432, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 696, 4516, 10917, 433, 682, 3871, 1473, 12, 17, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 30876, 18, 482, 220, 1544, 87, 32086, 87, 32907, 87, 121412, 489, 220, 24, 2120, 32086, 489, 865, 32907, 489, 865, 121412, 2432, 87, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 696, 81122, 13, 6914, 596, 1518, 422, 584, 649, 8331, 420, 477, 40821, 433, 13, 6914, 596, 79164, 328, 284, 865, 32086, 489, 865, 32907, 489, 865, 121412, 11, 393, 284, 865, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 11, 1229, 284, 865, 32086, 87, 32907, 87, 121412, 13, 5112, 1057, 7645, 9221, 1473, 12, 17, 50, 44301, 482, 1544, 48, 489, 220, 24, 50, 393, 271, 4071, 369, 264, 41999, 48411, 11, 279, 5133, 1990, 328, 11, 393, 11, 1229, 374, 2728, 555, 11655, 1955, 596, 55179, 13, 2030, 8530, 584, 649, 29243, 420, 7645, 311, 2555, 775, 382, 93114, 11, 7344, 1701, 55443, 37498, 13, 6914, 596, 12849, 420, 7645, 369, 3230, 20282, 13, 6914, 596, 23289, 430, 865, 32086, 11, 865, 32907, 11, 865, 121412, 527, 7482, 449, 865, 32907, 482, 865, 32086, 284, 49438, 323, 865, 121412, 871, 320, 87, 32086, 489, 865, 32907, 5738, 17, 13, 10926, 584, 649, 1719, 58053, 279, 20282, 304, 3878, 315, 7482, 430, 12602, 279, 2728, 4787, 382, 22818, 430, 865, 32907, 482, 865, 32086, 284, 49438, 11, 1095, 596, 743, 865, 32086, 284, 259, 482, 49438, 14, 17, 323, 865, 32907, 284, 259, 489, 49438, 14, 17, 369, 1063, 259, 13, 5112, 279, 83063, 315, 865, 32086, 323, 865, 32907, 374, 259, 11, 323, 279, 3044, 865, 121412, 871, 320, 87, 32086, 489, 865, 32907, 5738, 17, 9221, 865, 121412, 871, 259, 13, 4815, 55915, 11, 1095, 757, 743, 1473, 87, 32086, 284, 259, 482, 49438, 14, 17, 271, 87, 32907, 284, 259, 489, 49438, 14, 17, 271, 87, 121412, 284, 259, 489, 274, 11, 1405, 274, 871, 220, 15, 320, 11536, 865, 121412, 871, 259, 696, 4516, 1457, 11, 1057, 20282, 527, 13605, 304, 3878, 315, 259, 11, 49438, 11, 323, 274, 871, 220, 15, 382, 7184, 11, 1095, 596, 12849, 328, 11, 393, 11, 1229, 304, 3878, 315, 259, 11, 49438, 11, 274, 382, 5451, 11, 328, 284, 865, 32086, 489, 865, 32907, 489, 865, 121412, 284, 320, 83, 482, 49438, 14, 17, 8, 489, 320, 83, 489, 49438, 14, 17, 8, 489, 320, 83, 489, 274, 8, 284, 220, 18, 83, 489, 274, 271, 16041, 11, 393, 284, 865, 32086, 87, 32907, 489, 865, 32086, 87, 121412, 489, 865, 32907, 87, 121412, 271, 47354, 1855, 4751, 1473, 87, 32086, 87, 32907, 284, 320, 83, 482, 49438, 14, 17, 2432, 83, 489, 49438, 14, 17, 8, 284, 259, 30556, 482, 320, 34586, 14, 17, 8, 30556, 284, 259, 30556, 482, 49438, 30556, 14, 19, 271, 87, 32086, 87, 121412, 284, 320, 83, 482, 49438, 14, 17, 2432, 83, 489, 274, 8, 284, 259, 1175, 489, 274, 8, 482, 320, 34586, 14, 17, 2432, 83, 489, 274, 8, 284, 259, 30556, 489, 10814, 482, 320, 34586, 259, 5738, 17, 482, 320, 34586, 274, 5738, 17, 271, 87, 32907, 87, 121412, 284, 320, 83, 489, 49438, 14, 17, 2432, 83, 489, 274, 8, 284, 259, 1175, 489, 274, 8, 489, 320, 34586, 14, 17, 2432, 83, 489, 274, 8, 284, 259, 30556, 489, 10814, 489, 320, 34586, 259, 5738, 17, 489, 320, 34586, 274, 5738, 17, 271, 33408, 1521, 3871, 1473, 47, 284, 510, 83, 30556, 482, 49438, 30556, 14, 19, 60, 489, 510, 83, 30556, 489, 10814, 482, 320, 34586, 259, 5738, 17, 482, 320, 34586, 274, 5738, 17, 60, 489, 510, 83, 30556, 489, 10814, 489, 320, 34586, 259, 5738, 17, 489, 320, 34586, 274, 5738, 17, 2595, 10267, 596, 16343, 3878, 1473, 5451, 4751, 25, 259, 30556, 482, 49438, 30556, 14, 19, 271, 16041, 4751, 25, 259, 30556, 489, 10814, 482, 320, 34586, 259, 5738, 17, 482, 320, 34586, 274, 5738, 17, 271, 38075, 4751, 25, 259, 30556, 489, 10814, 489, 320, 34586, 259, 5738, 17, 489, 320, 34586, 274, 5738, 17, 271, 33408, 1124, 1473, 83, 30556, 482, 49438, 30556, 14, 19, 489, 259, 30556, 489, 10814, 482, 320, 34586, 259, 5738, 17, 482, 320, 34586, 274, 5738, 17, 489, 259, 30556, 489, 10814, 489, 320, 34586, 259, 5738, 17, 489, 320, 34586, 274, 5738, 17, 271, 82214, 1093, 3878, 1473, 83, 30556, 489, 259, 30556, 489, 259, 30556, 284, 220, 18, 83, 30556, 271, 2641, 489, 10814, 284, 220, 17, 2641, 271, 12, 34586, 30556, 14, 19, 271, 2520, 279, 3878, 449, 49438, 259, 14, 17, 25, 482, 320, 34586, 259, 5738, 17, 489, 320, 34586, 259, 5738, 17, 284, 220, 15, 271, 68791, 11, 369, 49438, 274, 14, 17, 25, 482, 320, 34586, 274, 5738, 17, 489, 320, 34586, 274, 5738, 17, 284, 220, 15, 271, 4516, 393, 284, 220, 18, 83, 30556, 489, 220, 17, 2641, 482, 49438, 30556, 14, 19, 271, 7184, 1229, 284, 865, 32086, 87, 32907, 87, 121412, 284, 320, 83, 482, 49438, 14, 17, 2432, 83, 489, 49438, 14, 17, 2432, 83, 489, 274, 8, 284, 510, 83, 30556, 482, 320, 34586, 14, 17, 30876, 17, 9725, 83, 489, 274, 8, 284, 320, 83, 30556, 482, 49438, 30556, 14, 19, 2432, 83, 489, 274, 696, 96255, 420, 704, 1473, 28, 259, 44301, 489, 259, 30556, 274, 482, 320, 34586, 30556, 14, 19, 8, 259, 482, 320, 34586, 30556, 14, 19, 8, 274, 271, 7184, 11, 1095, 596, 20206, 328, 11, 393, 11, 1229, 1139, 279, 7645, 1473, 12, 17, 50, 44301, 482, 1544, 48, 489, 220, 24, 50, 393, 271, 5451, 11, 12849, 328, 44301, 1473, 50, 284, 220, 18, 83, 489, 274, 271, 50, 44301, 284, 320, 18, 83, 489, 274, 30876, 18, 284, 220, 1544, 83, 44301, 489, 220, 1544, 83, 30556, 274, 489, 220, 24, 83, 274, 30556, 489, 274, 44301, 271, 96255, 555, 482, 17, 25, 482, 17, 9, 1544, 83, 44301, 482, 17, 9, 1544, 83, 30556, 274, 482, 17, 9, 24, 83, 274, 30556, 482, 17, 34554, 44301, 284, 482, 4370, 83, 44301, 482, 4370, 83, 30556, 274, 482, 972, 83, 274, 30556, 482, 17, 82, 44301, 271, 5971, 11, 12849, 482, 1544, 48, 1473, 48, 284, 259, 44301, 489, 259, 30556, 274, 482, 320, 34586, 30556, 14, 19, 8, 259, 482, 320, 34586, 30556, 14, 19, 8, 274, 271, 96255, 555, 482, 1544, 25, 482, 1544, 83, 44301, 482, 1544, 83, 30556, 274, 489, 320, 1544, 34586, 30556, 14, 19, 8, 259, 489, 320, 1544, 34586, 30556, 14, 19, 16871, 271, 38075, 4751, 25, 220, 24, 50, 393, 271, 50, 284, 220, 18, 83, 489, 274, 271, 47, 284, 220, 18, 83, 30556, 489, 220, 17, 2641, 482, 49438, 30556, 14, 19, 271, 4516, 220, 24, 50, 393, 284, 220, 24, 6737, 18, 83, 489, 274, 18201, 18, 83, 30556, 489, 220, 17, 2641, 482, 49438, 30556, 14, 19, 696, 10267, 596, 9407, 420, 2027, 3094, 555, 3094, 382, 5451, 11, 31370, 320, 18, 83, 489, 274, 8, 449, 320, 18, 83, 30556, 489, 220, 17, 2641, 482, 49438, 30556, 14, 19, 7887, 28, 220, 18, 83, 6737, 18, 83, 30556, 8, 489, 220, 18, 83, 6737, 17, 2641, 8, 489, 220, 18, 83, 35399, 34586, 30556, 14, 19, 8, 489, 274, 6737, 18, 83, 30556, 8, 489, 274, 6737, 17, 2641, 8, 489, 274, 35399, 34586, 30556, 14, 19, 696, 28, 220, 24, 83, 44301, 489, 220, 21, 83, 30556, 274, 482, 320, 18, 83, 49438, 30556, 5738, 19, 489, 220, 18, 82, 259, 30556, 489, 220, 17, 83, 274, 30556, 482, 320, 82, 49438, 30556, 5738, 19, 271, 82214, 1093, 3878, 1473, 24, 83, 44301, 489, 320, 21, 83, 30556, 274, 489, 220, 18, 83, 30556, 274, 8, 489, 320, 17, 83, 274, 30556, 8, 489, 10505, 18, 83, 49438, 30556, 14, 19, 482, 274, 49438, 30556, 14, 19, 696, 28, 220, 24, 83, 44301, 489, 220, 24, 83, 30556, 274, 489, 220, 17, 83, 274, 30556, 482, 320, 34586, 30556, 14, 19, 2432, 18, 83, 489, 274, 696, 96255, 420, 555, 220, 24, 1473, 24, 6737, 24, 83, 44301, 489, 220, 24, 83, 30556, 274, 489, 220, 17, 83, 274, 30556, 482, 320, 34586, 30556, 14, 19, 2432, 18, 83, 489, 274, 595, 284, 220, 5932, 83, 44301, 489, 220, 5932, 83, 30556, 274, 489, 220, 972, 83, 274, 30556, 482, 320, 24, 34586, 30556, 14, 19, 2432, 18, 83, 489, 274, 696, 7184, 11, 35271, 682, 2380, 5596, 1473, 5451, 961, 25, 482, 4370, 83, 44301, 482, 4370, 83, 30556, 274, 482, 972, 83, 274, 30556, 482, 17, 82, 44301, 271, 16041, 961, 25, 482, 1544, 83, 44301, 482, 1544, 83, 30556, 274, 489, 320, 1544, 34586, 30556, 14, 19, 8, 259, 489, 320, 1544, 34586, 30556, 14, 19, 16871, 271, 38075, 961, 25, 220, 5932, 83, 44301, 489, 220, 5932, 83, 30556, 274, 489, 220, 972, 83, 274, 30556, 482, 320, 24, 34586, 30556, 14, 19, 2432, 18, 83, 489, 274, 696, 10267, 596, 923, 1124, 4751, 555, 4751, 382, 2520, 259, 44301, 3878, 1473, 12, 4370, 83, 44301, 482, 1544, 83, 44301, 489, 5932, 83, 44301, 284, 220, 15, 271, 2520, 259, 30556, 274, 3878, 1473, 12, 4370, 83, 30556, 274, 482, 1544, 83, 30556, 274, 489, 5932, 83, 30556, 274, 284, 220, 15, 271, 2520, 259, 274, 30556, 3878, 1473, 12, 972, 83, 274, 30556, 489, 972, 83, 274, 30556, 284, 220, 15, 271, 2520, 274, 44301, 3878, 1473, 12, 17, 82, 44301, 320, 1527, 1176, 961, 8, 489, 2564, 320, 5686, 961, 706, 912, 274, 44301, 4751, 11, 4948, 961, 1101, 7000, 8, 779, 2860, 482, 17, 82, 44301, 271, 7184, 279, 3878, 16239, 49438, 30556, 1473, 3915, 2132, 961, 25, 320, 1544, 34586, 30556, 14, 19, 8, 259, 489, 320, 1544, 34586, 30556, 14, 19, 16871, 271, 3915, 4948, 961, 25, 482, 320, 24, 34586, 30556, 14, 19, 2432, 18, 83, 489, 274, 8, 284, 482, 1544, 34586, 30556, 14, 19, 259, 482, 24, 34586, 30556, 14, 19, 274, 271, 33408, 1521, 3871, 1473, 7, 1544, 34586, 30556, 14, 19, 259, 482, 1544, 34586, 30556, 14, 19, 259, 8, 489, 320, 1544, 34586, 30556, 14, 19, 274, 482, 24, 34586, 30556, 14, 19, 274, 8, 284, 220, 15, 489, 320, 972, 34586, 30556, 14, 19, 16871, 284, 320, 24, 34586, 30556, 14, 17, 16871, 271, 4516, 8244, 11, 35271, 682, 3878, 1473, 12, 17, 82, 44301, 489, 320, 24, 34586, 30556, 14, 17, 16871, 271, 4516, 279, 4553, 7645, 15858, 9803, 311, 1473, 12, 17, 82, 44301, 489, 320, 24, 34586, 30556, 14, 17, 16871, 271, 55915, 11, 279, 4113, 7645, 320, 17, 64, 44301, 489, 220, 1544, 66, 482, 24, 370, 8, 17239, 482, 17, 82, 44301, 489, 320, 24, 34586, 30556, 14, 17, 16871, 271, 55915, 11, 279, 7645, 584, 1205, 311, 35608, 374, 1473, 7, 17, 64, 44301, 489, 220, 1544, 66, 482, 24, 370, 5738, 34586, 44301, 284, 10505, 17, 82, 44301, 489, 320, 24, 34586, 30556, 14, 17, 16871, 5738, 34586, 44301, 284, 10505, 17, 82, 44301, 5738, 34586, 44301, 489, 320, 24, 34586, 30556, 14, 17, 274, 5738, 34586, 44301, 284, 482, 17, 1161, 14, 34586, 30876, 18, 489, 320, 24, 14, 17, 2432, 82, 14, 34586, 696, 10267, 757, 743, 597, 284, 274, 14, 34586, 13, 8876, 274, 871, 220, 15, 323, 49438, 871, 220, 15, 11, 597, 871, 220, 15, 13, 5112, 1057, 7645, 9221, 1473, 12, 17, 74, 44301, 489, 320, 24, 14, 17, 8, 74, 271, 4516, 1457, 11, 279, 3575, 26338, 311, 9455, 279, 7340, 907, 315, 279, 734, 342, 6097, 8, 284, 482, 17, 74, 44301, 489, 320, 24, 14, 17, 8, 74, 369, 597, 871, 220, 15, 382, 2028, 374, 264, 83768, 3575, 13, 2057, 1505, 279, 7340, 11, 1935, 279, 32905, 315, 342, 6097, 8, 449, 5201, 311, 597, 11, 743, 433, 311, 7315, 11, 323, 11886, 369, 597, 382, 47354, 342, 59436, 74, 7887, 70, 59436, 74, 8, 284, 482, 21, 74, 30556, 489, 220, 24, 14, 17, 271, 1681, 342, 59436, 74, 8, 284, 220, 15, 1473, 12, 21, 74, 30556, 489, 220, 24, 14, 17, 284, 220, 15, 271, 2228, 220, 21, 74, 30556, 284, 220, 24, 14, 17, 271, 2228, 597, 30556, 284, 320, 24, 14, 17, 5738, 21, 284, 220, 24, 14, 717, 284, 220, 18, 14, 19, 271, 2228, 597, 284, 18430, 7, 18, 14, 19, 8, 284, 18430, 7, 18, 5738, 17, 118792, 220, 15, 13, 22455, 271, 12834, 597, 871, 220, 15, 11, 584, 1935, 279, 6928, 3789, 13, 4800, 11, 1817, 422, 420, 374, 264, 7340, 555, 2132, 32905, 1296, 382, 16041, 32905, 1473, 70, 4708, 7, 74, 8, 284, 482, 717, 74, 271, 1688, 597, 284, 18430, 7, 18, 5738, 17, 11, 342, 4708, 7, 74, 8, 284, 482, 717, 6737, 27986, 7, 18, 5738, 17, 8, 284, 482, 21, 27986, 7, 18, 8, 366, 220, 15, 11, 902, 3445, 433, 596, 264, 2254, 7340, 382, 55915, 11, 279, 7340, 907, 315, 342, 6097, 8, 374, 17427, 520, 597, 284, 18430, 7, 18, 5738, 17, 382, 47354, 342, 84173, 7, 18, 5738, 17, 7887, 70, 84173, 7, 18, 5738, 17, 8, 284, 482, 17, 6737, 27986, 7, 18, 5738, 17, 30876, 18, 489, 320, 24, 14, 17, 18201, 27986, 7, 18, 5738, 17, 696, 47354, 1855, 4751, 1473, 5451, 4751, 25, 482, 17, 6737, 320, 18, 13571, 16, 14, 17, 31185, 17, 883, 61, 18, 284, 482, 17, 6737, 320, 18, 13571, 18, 14, 17, 31185, 23, 883, 284, 482, 17, 6737, 18, 110682, 18, 5738, 23, 284, 482, 320, 18, 110682, 18, 5738, 19, 271, 16041, 4751, 25, 320, 24, 14, 17, 18201, 27986, 7, 18, 5738, 17, 8, 284, 320, 24, 110682, 18, 5738, 19, 271, 33408, 2225, 3878, 1473, 12, 320, 18, 110682, 18, 5738, 19, 489, 320, 24, 110682, 18, 5738, 19, 284, 320, 21, 110682, 18, 5738, 19, 284, 320, 18, 110682, 18, 5738, 17, 271, 55915, 11, 279, 7340, 907, 315, 342, 6097, 8, 374, 320, 18, 110682, 18, 5738, 17, 382, 39, 768, 11, 279, 7340, 907, 315, 320, 17, 64, 44301, 489, 220, 1544, 66, 482, 24, 370, 5738, 34586, 44301, 374, 320, 18, 110682, 18, 5738, 17, 382, 4071, 3868, 11, 1095, 757, 1817, 422, 1070, 527, 904, 17413, 389, 597, 13, 20474, 430, 584, 1047, 274, 284, 597, 34586, 13, 2030, 584, 617, 311, 6106, 430, 279, 20282, 527, 1972, 323, 27651, 865, 121412, 871, 320, 87, 32086, 489, 865, 32907, 5738, 17, 13, 4452, 11, 304, 1057, 1719, 17820, 2065, 11, 584, 743, 865, 121412, 284, 259, 489, 274, 11, 323, 2533, 274, 871, 220, 15, 11, 865, 121412, 871, 259, 284, 320, 87, 32086, 489, 865, 32907, 5738, 17, 11, 902, 374, 2736, 20097, 13, 2100, 439, 1317, 439, 274, 871, 220, 15, 11, 279, 3044, 10187, 13, 2030, 2533, 597, 284, 274, 14, 34586, 871, 220, 15, 11, 323, 49438, 871, 220, 15, 11, 274, 871, 220, 15, 374, 13890, 311, 597, 871, 220, 15, 11, 902, 584, 2736, 6646, 13, 15636, 11, 279, 7340, 374, 13118, 320, 18, 110682, 18, 5738, 17, 382, 4071, 1095, 757, 10356, 420, 1121, 555, 13126, 3230, 5157, 13, 1789, 3187, 11, 1935, 49438, 284, 220, 16, 13, 5112, 279, 7645, 320, 17, 64, 44301, 489, 1544, 66, 482, 24, 370, 5738, 16, 44301, 1288, 387, 31127, 1534, 520, 220, 18, 110682, 18, 14, 17, 13, 6914, 596, 1817, 449, 597, 284, 18430, 7, 18, 5738, 17, 11, 779, 274, 284, 18430, 7, 18, 5738, 17, 382, 12487, 279, 20282, 527, 865, 32086, 284, 259, 482, 220, 16, 14, 17, 11, 865, 32907, 284, 259, 489, 220, 16, 14, 17, 11, 865, 121412, 284, 259, 489, 18430, 7, 18, 5738, 17, 13, 5112, 328, 284, 220, 18, 83, 489, 18430, 7, 18, 5738, 17, 11, 393, 284, 220, 18, 83, 30556, 489, 220, 17, 83, 6737, 27986, 7, 18, 5738, 17, 8, 482, 220, 16, 14, 19, 284, 220, 18, 83, 30556, 489, 259, 9, 27986, 7, 18, 8, 482, 220, 16, 14, 19, 11, 1229, 284, 320, 83, 30556, 482, 220, 16, 14, 19, 2432, 83, 489, 18430, 7, 18, 5738, 17, 570, 5112, 12849, 264, 11, 293, 11, 272, 505, 11655, 1955, 596, 55179, 13, 5112, 12849, 220, 17, 64, 44301, 489, 1544, 66, 482, 24, 370, 13, 1102, 2643, 387, 66838, 11, 719, 8530, 584, 649, 1817, 382, 93114, 11, 5296, 430, 1057, 96354, 6197, 603, 311, 459, 7645, 18222, 1193, 389, 597, 11, 902, 374, 274, 14, 34586, 11, 323, 1555, 50068, 11, 584, 1766, 279, 7340, 13, 15636, 11, 279, 7340, 907, 374, 320, 18, 110682, 18, 5738, 17, 13, 4452, 11, 1095, 757, 19635, 430, 279, 3575, 5415, 865, 121412, 871, 320, 87, 32086, 489, 865, 32907, 5738, 17, 11, 902, 304, 1057, 1719, 17820, 2065, 374, 274, 871, 220, 15, 13, 15636, 11, 439, 1317, 439, 274, 374, 6928, 11, 902, 433, 374, 11, 584, 527, 17339, 13, 2100, 1070, 527, 912, 4726, 17413, 11, 323, 279, 7340, 374, 13118, 320, 18, 110682, 18, 5738, 17, 382, 14524, 11, 719, 1120, 311, 387, 17879, 11, 1095, 596, 1518, 422, 994, 597, 284, 18430, 7, 18, 5738, 17, 11, 279, 41999, 48411, 13118, 706, 2380, 1972, 20282, 13, 8876, 584, 527, 2728, 430, 682, 20282, 527, 1972, 11, 719, 994, 584, 1719, 58053, 11, 584, 1205, 311, 6106, 430, 279, 41999, 449, 1521, 20282, 374, 2764, 13, 4452, 11, 2533, 584, 527, 2728, 430, 682, 20282, 527, 1972, 323, 27651, 279, 2728, 4787, 11, 1057, 1719, 17820, 2065, 2736, 5097, 430, 1139, 2759, 13, 15636, 11, 439, 1317, 439, 274, 871, 220, 15, 11, 279, 20282, 527, 1972, 323, 27651, 279, 2631, 4787, 13, 15636, 11, 279, 7340, 907, 374, 13118, 320, 18, 110682, 18, 5738, 17, 382, 45600, 11, 279, 7340, 907, 315, 320, 17, 64, 44301, 489, 1544, 66, 482, 24, 370, 5738, 34586, 44301, 374, 320, 18, 110682, 18, 5738, 17, 382, 334, 19918, 22559, 1035, 59, 80175, 36802, 67, 38118, 90, 18, 59, 27986, 90, 18, 3500, 90, 17, 11498, 524, 27963, 1363, 22818, 1972, 5219, 18240, 64, 11, 293, 11, 272, 58858, 323, 264, 6928, 1396, 1144, 11781, 13231, 58858, 1778, 430, 279, 48411, 18240, 69, 2120, 8, 284, 865, 61, 18, 489, 3944, 61, 17, 489, 45341, 489, 272, 58858, 706, 2380, 1972, 20282, 18240, 87, 62, 16, 11, 865, 62, 17, 11, 865, 62, 18, 58858, 449, 279, 4787, 18240, 87, 62, 17, 482, 865, 62, 16, 284, 1144, 13231, 58858, 323, 18240, 87, 62, 18, 871, 1144, 38118, 90, 16, 15523, 17, 26628, 87, 62, 16, 489, 865, 62, 17, 10929, 705, 584, 1205, 311, 1505, 279, 7340, 907, 315, 1144, 11781, 38118, 90, 17, 64, 61, 18, 489, 220, 1544, 66, 482, 220, 24, 370, 15523, 59, 13231, 61, 18, 11281, 3677, 16834, 11655, 1955, 596, 55179, 369, 279, 48411, 18240, 69, 2120, 10929, 997, 16, 13, 18240, 87, 62, 16, 489, 865, 62, 17, 489, 865, 62, 18, 284, 482, 64, 59, 340, 17, 13, 18240, 87, 62, 16, 87, 62, 17, 489, 865, 62, 16, 87, 62, 18, 489, 865, 62, 17, 87, 62, 18, 284, 293, 59, 340, 18, 13, 18240, 87, 62, 16, 87, 62, 17, 87, 62, 18, 284, 482, 66, 59, 696, 1687, 1719, 58053, 279, 20282, 439, 11263, 512, 12, 18240, 87, 62, 16, 284, 259, 482, 1144, 38118, 36802, 13231, 15523, 17, 11281, 340, 12, 18240, 87, 62, 17, 284, 259, 489, 1144, 38118, 36802, 13231, 15523, 17, 11281, 340, 12, 18240, 87, 62, 18, 284, 259, 489, 274, 58858, 1405, 18240, 82, 871, 220, 15, 59, 696, 3214, 3781, 10831, 1521, 1139, 11655, 1955, 596, 55179, 11, 584, 636, 512, 12, 18240, 50, 284, 865, 62, 16, 489, 865, 62, 17, 489, 865, 62, 18, 284, 220, 18, 83, 489, 274, 59, 340, 12, 18240, 47, 284, 865, 62, 16, 87, 62, 17, 489, 865, 62, 16, 87, 62, 18, 489, 865, 62, 17, 87, 62, 18, 284, 220, 18, 83, 61, 17, 489, 220, 17, 2641, 482, 1144, 38118, 36802, 13231, 61, 17, 15523, 19, 11281, 340, 12, 18240, 48, 284, 865, 62, 16, 87, 62, 17, 87, 62, 18, 284, 320, 83, 61, 17, 482, 1144, 38118, 36802, 13231, 61, 17, 15523, 19, 32988, 83, 489, 274, 10929, 696, 3214, 3781, 10831, 18240, 64, 284, 482, 50, 59, 705, 18240, 65, 284, 393, 59, 705, 323, 18240, 66, 284, 482, 48, 58858, 1139, 279, 7645, 18240, 17, 64, 61, 18, 489, 220, 1544, 66, 482, 220, 24, 370, 59, 705, 584, 40821, 433, 311, 512, 59, 9837, 12, 17, 82, 61, 18, 489, 1144, 38118, 90, 24, 15523, 17, 11281, 13231, 61, 17, 274, 198, 59, 2595, 1090, 60637, 18240, 74, 284, 1144, 38118, 85486, 15523, 59, 13231, 11281, 705, 279, 7645, 9221, 512, 59, 9837, 59, 38118, 20597, 17, 82, 61, 18, 489, 1144, 38118, 90, 24, 15523, 17, 11281, 13231, 61, 17, 274, 15523, 59, 13231, 61, 18, 92, 284, 482, 17, 74, 61, 18, 489, 1144, 38118, 90, 24, 15523, 17, 92, 74, 198, 59, 2595, 1271, 35608, 1144, 4172, 17, 74, 61, 18, 489, 1144, 38118, 90, 24, 15523, 17, 92, 74, 59, 705, 584, 1935, 279, 32905, 323, 743, 433, 311, 7315, 512, 59, 9837, 70, 59436, 74, 8, 284, 482, 21, 74, 61, 17, 489, 1144, 38118, 90, 24, 15523, 17, 92, 284, 220, 15, 1144, 6517, 552, 597, 61, 17, 284, 1144, 38118, 90, 18, 15523, 19, 92, 1144, 6517, 552, 597, 284, 1144, 38118, 36802, 27986, 90, 18, 3500, 90, 17, 534, 59, 2595, 41229, 279, 2132, 32905, 43496, 264, 7340, 13, 3804, 3781, 10831, 18240, 74, 284, 1144, 38118, 36802, 27986, 90, 18, 3500, 90, 17, 11281, 8, 1203, 1139, 279, 7645, 6835, 512, 59, 9837, 12, 17, 59, 2414, 11781, 38118, 36802, 27986, 90, 18, 3500, 90, 17, 11281, 1315, 30876, 18, 489, 1144, 38118, 90, 24, 15523, 17, 11281, 2414, 11781, 38118, 36802, 27986, 90, 18, 3500, 90, 17, 11281, 1315, 8, 284, 1144, 38118, 90, 18, 59, 27986, 90, 18, 3500, 90, 17, 534, 59, 2595, 45600, 11, 279, 7340, 907, 315, 1144, 11781, 38118, 90, 17, 64, 61, 18, 489, 220, 1544, 66, 482, 220, 24, 370, 15523, 59, 13231, 61, 18, 11281, 8, 374, 1144, 11781, 80175, 36802, 67, 38118, 90, 18, 59, 27986, 90, 18, 3500, 90, 17, 3500, 59, 3677, 791, 1620, 4320, 374, 1144, 80175, 36802, 67, 38118, 90, 18, 59, 27986, 90, 18, 3500, 90, 17, 3500, 13, 128009] +labels: + +Okay, let's try to tackle this problem step by step. The problem is about finding the maximum value of the expression (2a³ + 27c - 9ab)/λ³ given some conditions on the roots of the cubic polynomial f(x) = x³ + ax² + bx + c. The roots are real, with x₂ - x₁ = λ and x₃ > (x₁ + x₂)/2. + +First, I need to recall that for a cubic polynomial with roots x₁, x₂, x₃, the coefficients can be expressed using Vieta's formulas. So, let's write those down. + +Vieta's formulas for f(x) = x³ + ax² + bx + c: +1. Sum of roots: x₁ + x₂ + x₃ = -a +2. Sum of products: x₁x₂ + x₁x₃ + x₂x₃ = b +3. Product of roots: x₁x₂x₃ = -c + +Our target expression is (2a³ + 27c - 9ab)/λ³. Let's see if we can express this in terms of the roots using Vieta's formulas. + +First, let's substitute a, b, c in terms of the roots. + +From Vieta: +a = -(x₁ + x₂ + x₃) +b = x₁x₂ + x₁x₃ + x₂x₃ +c = -x₁x₂x₃ + +Plugging these into the expression: + +2a³ + 27c - 9ab += 2(-(x₁ + x₂ + x₃))³ + 27(-x₁x₂x₃) - 9(-(x₁ + x₂ + x₃))(x₁x₂ + x₁x₃ + x₂x₃) + +Let me compute each term step by step. + +First term: 2(-(x₁ + x₂ + x₃))³ = -2(x₁ + x₂ + x₃)³ + +Second term: 27(-x₁x₂x₃) = -27x₁x₂x₃ + +Third term: -9(-(x₁ + x₂ + x₃))(x₁x₂ + x₁x₃ + x₂x₃) = 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +So combining all three: + +-2(x₁ + x₂ + x₃)³ -27x₁x₂x₃ + 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +Hmm, this seems complex. Maybe there is a known identity that relates these terms? Let me think. + +Recall that for a cubic polynomial, the discriminant D is given by D = 18abcd - 4b³d + b²c² - 4ac³ - 27a²d² for ax³ + bx² + cx + d. But in our case, the polynomial is monic (leading coefficient 1), so a=1, but here the coefficients are a, b, c as given. Wait, actually, in the standard form, the discriminant for x³ + a x² + b x + c would be D = 18abc - 4a³c + a²b² - 4b³ - 27c². But I might be mixing up the formula. Let me verify: + +Wait, the discriminant of a cubic x³ + px² + qx + r is given by: + +Δ = 18pqr - 4p³r + p²q² - 4q³ - 27r² + +Yes, so in our case, with p = a, q = b, r = c, so Δ = 18abc - 4a³c + a²b² - 4b³ - 27c² + +But I'm not sure if the discriminant is directly related here, but maybe. Since the polynomial has three real roots, the discriminant must be non-negative. However, the problem states that all roots are real, so Δ ≥ 0. But maybe the expression we're dealing with is related to the discriminant? + +Wait, let's check the expression given: 2a³ + 27c - 9ab. If we compare with the discriminant formula: + +Δ = 18abc - 4a³c + a²b² - 4b³ - 27c² + +Not directly obvious. Maybe not. Let's try another approach. + +Alternatively, perhaps the expression (2a³ + 27c - 9ab) can be rewritten in terms of the roots. Let's try substituting the Vieta expressions into it. + +So let's substitute a, b, c: + +2(-(x₁ + x₂ + x₃))³ + 27(-x₁x₂x₃) - 9(-(x₁ + x₂ + x₃))(x₁x₂ + x₁x₃ + x₂x₃) + +Compute each term: + +First term: 2*(-1)^3*(x₁ + x₂ + x₃)^3 = -2(x₁ + x₂ + x₃)^3 + +Second term: 27*(-x₁x₂x₃) = -27x₁x₂x₃ + +Third term: -9*(-1)*(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) = 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +So putting it all together: + +-2(x₁ + x₂ + x₃)^3 - 27x₁x₂x₃ + 9(x₁ + x₂ + x₃)(x₁x₂ + x₁x₃ + x₂x₃) + +Hmm. Let's see if we can factor this or simplify it. Let's denote S = x₁ + x₂ + x₃, P = x₁x₂ + x₁x₃ + x₂x₃, Q = x₁x₂x₃. Then our expression becomes: + +-2S³ -27Q + 9S P + +But for a cubic polynomial, the relationship between S, P, Q is given by Vieta's formulas. But perhaps we can relate this expression to something else. + +Alternatively, maybe using symmetric sums. Let's compute this expression for specific roots. Let's suppose that x₁, x₂, x₃ are variables with x₂ - x₁ = λ and x₃ > (x₁ + x₂)/2. Maybe we can parametrize the roots in terms of variables that capture the given conditions. + +Given that x₂ - x₁ = λ, let's set x₁ = t - λ/2 and x₂ = t + λ/2 for some t. Then the midpoint of x₁ and x₂ is t, and the condition x₃ > (x₁ + x₂)/2 becomes x₃ > t. + +Therefore, let me set: + +x₁ = t - λ/2 + +x₂ = t + λ/2 + +x₃ = t + s, where s > 0 (since x₃ > t) + +So now, our roots are expressed in terms of t, λ, and s > 0. + +Now, let's compute S, P, Q in terms of t, λ, s. + +First, S = x₁ + x₂ + x₃ = (t - λ/2) + (t + λ/2) + (t + s) = 3t + s + +Second, P = x₁x₂ + x₁x₃ + x₂x₃ + +Compute each term: + +x₁x₂ = (t - λ/2)(t + λ/2) = t² - (λ/2)² = t² - λ²/4 + +x₁x₃ = (t - λ/2)(t + s) = t(t + s) - (λ/2)(t + s) = t² + ts - (λ t)/2 - (λ s)/2 + +x₂x₃ = (t + λ/2)(t + s) = t(t + s) + (λ/2)(t + s) = t² + ts + (λ t)/2 + (λ s)/2 + +Adding these together: + +P = [t² - λ²/4] + [t² + ts - (λ t)/2 - (λ s)/2] + [t² + ts + (λ t)/2 + (λ s)/2] + +Let's combine terms: + +First term: t² - λ²/4 + +Second term: t² + ts - (λ t)/2 - (λ s)/2 + +Third term: t² + ts + (λ t)/2 + (λ s)/2 + +Adding them: + +t² - λ²/4 + t² + ts - (λ t)/2 - (λ s)/2 + t² + ts + (λ t)/2 + (λ s)/2 + +Combine like terms: + +t² + t² + t² = 3t² + +ts + ts = 2ts + +-λ²/4 + +For the terms with λ t/2: - (λ t)/2 + (λ t)/2 = 0 + +Similarly, for λ s/2: - (λ s)/2 + (λ s)/2 = 0 + +So P = 3t² + 2ts - λ²/4 + +Now Q = x₁x₂x₃ = (t - λ/2)(t + λ/2)(t + s) = [t² - (λ/2)^2](t + s) = (t² - λ²/4)(t + s) + +Multiply this out: + += t³ + t² s - (λ²/4) t - (λ²/4) s + +Now, let's plug S, P, Q into the expression: + +-2S³ -27Q + 9S P + +First, compute S³: + +S = 3t + s + +S³ = (3t + s)^3 = 27t³ + 27t² s + 9t s² + s³ + +Multiply by -2: -2*27t³ -2*27t² s -2*9t s² -2*s³ = -54t³ -54t² s -18t s² -2s³ + +Next, compute -27Q: + +Q = t³ + t² s - (λ²/4) t - (λ²/4) s + +Multiply by -27: -27t³ -27t² s + (27λ²/4) t + (27λ²/4)s + +Third term: 9S P + +S = 3t + s + +P = 3t² + 2ts - λ²/4 + +So 9S P = 9*(3t + s)*(3t² + 2ts - λ²/4) + +Let's expand this product step by step. + +First, multiply (3t + s) with (3t² + 2ts - λ²/4): + += 3t*(3t²) + 3t*(2ts) + 3t*(-λ²/4) + s*(3t²) + s*(2ts) + s*(-λ²/4) + += 9t³ + 6t² s - (3t λ²)/4 + 3s t² + 2t s² - (s λ²)/4 + +Combine like terms: + +9t³ + (6t² s + 3t² s) + (2t s²) + (-3t λ²/4 - s λ²/4) + += 9t³ + 9t² s + 2t s² - (λ²/4)(3t + s) + +Multiply this by 9: + +9*(9t³ + 9t² s + 2t s² - (λ²/4)(3t + s)) = 81t³ + 81t² s + 18t s² - (9λ²/4)(3t + s) + +Now, combining all three parts: + +First part: -54t³ -54t² s -18t s² -2s³ + +Second part: -27t³ -27t² s + (27λ²/4) t + (27λ²/4)s + +Third part: 81t³ + 81t² s + 18t s² - (9λ²/4)(3t + s) + +Let's add them term by term. + +For t³ terms: + +-54t³ -27t³ +81t³ = 0 + +For t² s terms: + +-54t² s -27t² s +81t² s = 0 + +For t s² terms: + +-18t s² +18t s² = 0 + +For s³ terms: + +-2s³ (from first part) +... (second part has no s³ term, third part also none) so total -2s³ + +Now the terms involving λ²: + +From second part: (27λ²/4) t + (27λ²/4)s + +From third part: - (9λ²/4)(3t + s) = -27λ²/4 t -9λ²/4 s + +Adding these together: + +(27λ²/4 t -27λ²/4 t) + (27λ²/4 s -9λ²/4 s) = 0 + (18λ²/4)s = (9λ²/2)s + +So overall, combining all terms: + +-2s³ + (9λ²/2)s + +So the entire expression simplifies to: + +-2s³ + (9λ²/2)s + +Therefore, the original expression (2a³ + 27c -9ab) equals -2s³ + (9λ²/2)s + +Therefore, the expression we need to maximize is: + +(2a³ + 27c -9ab)/λ³ = (-2s³ + (9λ²/2)s)/λ³ = (-2s³)/λ³ + (9λ²/2 s)/λ³ = -2(s/λ)^3 + (9/2)(s/λ) + +Let me set k = s/λ. Since s > 0 and λ > 0, k > 0. Then our expression becomes: + +-2k³ + (9/2)k + +So now, the problem reduces to finding the maximum value of the function g(k) = -2k³ + (9/2)k for k > 0. + +This is a calculus problem. To find the maximum, take the derivative of g(k) with respect to k, set it to zero, and solve for k. + +Compute g'(k): + +g'(k) = -6k² + 9/2 + +Set g'(k) = 0: + +-6k² + 9/2 = 0 + +=> 6k² = 9/2 + +=> k² = (9/2)/6 = 9/12 = 3/4 + +=> k = sqrt(3/4) = sqrt(3)/2 ≈ 0.866 + +Since k > 0, we take the positive root. Now, check if this is a maximum by second derivative test. + +Second derivative: + +g''(k) = -12k + +At k = sqrt(3)/2, g''(k) = -12*(sqrt(3)/2) = -6sqrt(3) < 0, which means it's a local maximum. + +Therefore, the maximum value of g(k) is achieved at k = sqrt(3)/2. + +Compute g(sqrt(3)/2): + +g(sqrt(3)/2) = -2*(sqrt(3)/2)^3 + (9/2)*(sqrt(3)/2) + +Compute each term: + +First term: -2*( (3^(1/2))/2 )^3 = -2*( (3^(3/2))/8 ) = -2*(3√3)/8 = - (3√3)/4 + +Second term: (9/2)*(sqrt(3)/2) = (9√3)/4 + +Adding both terms: + +- (3√3)/4 + (9√3)/4 = (6√3)/4 = (3√3)/2 + +Therefore, the maximum value of g(k) is (3√3)/2. + +Hence, the maximum value of (2a³ + 27c -9ab)/λ³ is (3√3)/2. + +But wait, let me check if there are any constraints on k. Remember that we had s = kλ. But we have to ensure that the roots are real and satisfy x₃ > (x₁ + x₂)/2. However, in our parametrization, we set x₃ = t + s, and since s > 0, x₃ > t = (x₁ + x₂)/2, which is already satisfied. So as long as s > 0, the condition holds. But since k = s/λ > 0, and λ > 0, s > 0 is equivalent to k > 0, which we already considered. Therefore, the maximum is indeed (3√3)/2. + +But let me verify this result by considering specific cases. For example, take λ = 1. Then the expression (2a³ +27c -9ab)/1³ should be maximized at 3√3/2. Let's check with k = sqrt(3)/2, so s = sqrt(3)/2. + +Then the roots are x₁ = t - 1/2, x₂ = t + 1/2, x₃ = t + sqrt(3)/2. Then S = 3t + sqrt(3)/2, P = 3t² + 2t*(sqrt(3)/2) - 1/4 = 3t² + t*sqrt(3) - 1/4, Q = (t² - 1/4)(t + sqrt(3)/2). Then compute a, b, c from Vieta's formulas. Then compute 2a³ +27c -9ab. It might be tedious, but perhaps we can check. + +Alternatively, note that our derivation led us to an expression dependent only on k, which is s/λ, and through substitution, we found the maximum. Therefore, the maximum value is (3√3)/2. However, let me recall that the problem states x₃ > (x₁ + x₂)/2, which in our parametrization is s > 0. Therefore, as long as s is positive, which it is, we are okay. So there are no further constraints, and the maximum is indeed (3√3)/2. + +Wait, but just to be thorough, let's see if when k = sqrt(3)/2, the cubic polynomial indeed has three real roots. Since we are given that all roots are real, but when we parametrize, we need to ensure that the cubic with these roots is valid. However, since we are given that all roots are real and satisfy the given conditions, our parametrization already takes that into account. Therefore, as long as s > 0, the roots are real and satisfy the required conditions. Therefore, the maximum value is indeed (3√3)/2. + +Thus, the maximum value of (2a³ +27c -9ab)/λ³ is (3√3)/2. + +**Final Answer** +\boxed{\dfrac{3\sqrt{3}}{2}} + + +Given real numbers \(a, b, c\) and a positive number \(\lambda\) such that the polynomial \(f(x) = x^3 + ax^2 + bx + c\) has three real roots \(x_1, x_2, x_3\) with the conditions \(x_2 - x_1 = \lambda\) and \(x_3 > \frac{1}{2}(x_1 + x_2)\), we need to find the maximum value of \(\frac{2a^3 + 27c - 9ab}{\lambda^3}\). + +Using Vieta's formulas for the polynomial \(f(x)\): +1. \(x_1 + x_2 + x_3 = -a\) +2. \(x_1x_2 + x_1x_3 + x_2x_3 = b\) +3. \(x_1x_2x_3 = -c\) + +We parametrize the roots as follows: +- \(x_1 = t - \frac{\lambda}{2}\) +- \(x_2 = t + \frac{\lambda}{2}\) +- \(x_3 = t + s\) where \(s > 0\) + +Substituting these into Vieta's formulas, we get: +- \(S = x_1 + x_2 + x_3 = 3t + s\) +- \(P = x_1x_2 + x_1x_3 + x_2x_3 = 3t^2 + 2ts - \frac{\lambda^2}{4}\) +- \(Q = x_1x_2x_3 = (t^2 - \frac{\lambda^2}{4})(t + s)\) + +Substituting \(a = -S\), \(b = P\), and \(c = -Q\) into the expression \(2a^3 + 27c - 9ab\), we simplify it to: +\[ +-2s^3 + \frac{9}{2}\lambda^2 s +\] + +Introducing \(k = \frac{s}{\lambda}\), the expression becomes: +\[ +\frac{-2s^3 + \frac{9}{2}\lambda^2 s}{\lambda^3} = -2k^3 + \frac{9}{2}k +\] + +To maximize \(-2k^3 + \frac{9}{2}k\), we take the derivative and set it to zero: +\[ +g'(k) = -6k^2 + \frac{9}{2} = 0 \implies k^2 = \frac{3}{4} \implies k = \frac{\sqrt{3}}{2} +\] + +Checking the second derivative confirms a maximum. Substituting \(k = \frac{\sqrt{3}}{2}\) back into the expression gives: +\[ +-2\left(\frac{\sqrt{3}}{2}\right)^3 + \frac{9}{2}\left(\frac{\sqrt{3}}{2}\right) = \frac{3\sqrt{3}}{2} +\] + +Thus, the maximum value of \(\frac{2a^3 + 27c - 9ab}{\lambda^3}\) is \(\boxed{\dfrac{3\sqrt{3}}{2}}\). + +The final answer is \boxed{\dfrac{3\sqrt{3}}{2}}.<|eot_id|> +[INFO|configuration_utils.py:763] 2026-03-05 15:01:44,389 >> loading configuration file /local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b/config.json +[INFO|configuration_utils.py:839] 2026-03-05 15:01:44,391 >> Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 128001, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": false, + "vocab_size": 128256 +} + +[INFO|2026-03-05 15:01:44] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training. +[INFO|2026-03-05 15:01:44] llamafactory.model.model_utils.liger_kernel:143 >> Liger kernel has been applied to the model. +[WARNING|logging.py:328] 2026-03-05 15:01:44,600 >> `torch_dtype` is deprecated! Use `dtype` instead! +[INFO|modeling_utils.py:1169] 2026-03-05 15:01:44,600 >> loading weights file /local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b/model.safetensors.index.json +[INFO|modeling_utils.py:2341] 2026-03-05 15:01:44,600 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:986] 2026-03-05 15:01:44,604 >> Generate config GenerationConfig { + "bos_token_id": 128000, + "eos_token_id": 128001, + "pad_token_id": 128001, + "use_cache": false +} + + Loading checkpoint shards: 0%| | 0/2 [00:00> loading configuration file /local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b/generation_config.json +[INFO|configuration_utils.py:986] 2026-03-05 15:01:45,151 >> Generate config GenerationConfig { + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001 + ], + "pad_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9 +} + +[INFO|dynamic_module_utils.py:423] 2026-03-05 15:01:45,151 >> Could not locate the custom_generate/generate.py inside /local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b. +[INFO|2026-03-05 15:01:45] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled. +[INFO|2026-03-05 15:01:45] llamafactory.model.model_utils.attention:143 >> Using FlashAttention-2 for faster training and inference. +[INFO|2026-03-05 15:01:45] llamafactory.model.adapter:143 >> Upcasting trainable params to float32. +[INFO|2026-03-05 15:01:45] llamafactory.model.adapter:143 >> Fine-tuning method: Full +[INFO|2026-03-05 15:01:45] llamafactory.model.loader:143 >> trainable params: 3,212,749,824 || all params: 3,212,749,824 || trainable%: 100.0000 +[INFO|trainer.py:749] 2026-03-05 15:01:45,205 >> Using auto half precision backend +[WARNING|trainer.py:982] 2026-03-05 15:01:45,206 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}. + Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 4.62it/s] Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 4.28it/s] + Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 4.31it/s] Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 4.01it/s] +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}. + Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 4.21it/s] Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 3.91it/s] +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}. +Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 32. Using DeepSpeed's value. +[2026-03-05 15:01:45,555] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed info: version=0.16.9, git-hash=unknown, git-branch=unknown +[2026-03-05 15:01:45,555] [INFO] [config.py:735:__init__] Config mesh_device None world_size = 4 +[2026-03-05 15:01:46,373] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2026-03-05 15:01:46,374] [INFO] [logging.py:107:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2026-03-05 15:01:46,374] [INFO] [logging.py:107:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2026-03-05 15:01:46,383] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW +[2026-03-05 15:01:46,384] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type= +[2026-03-05 15:01:46,384] [INFO] [logging.py:107:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer +[2026-03-05 15:01:46,384] [INFO] [stage_1_and_2.py:150:__init__] Reduce bucket size 500000000 +[2026-03-05 15:01:46,384] [INFO] [stage_1_and_2.py:151:__init__] Allgather bucket size 500000000 +[2026-03-05 15:01:46,384] [INFO] [stage_1_and_2.py:152:__init__] CPU Offload: False +[2026-03-05 15:01:46,384] [INFO] [stage_1_and_2.py:153:__init__] Round robin gradient partitioning: True +[2026-03-05 15:01:56,186] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2026-03-05 15:01:56,187] [INFO] [utils.py:782:see_memory_usage] MA 8.98 GB Max_MA 8.98 GB CA 9.03 GB Max_CA 9 GB +[2026-03-05 15:01:56,187] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 82.93 GB, percent = 5.5% +[2026-03-05 15:01:56,445] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2026-03-05 15:01:56,445] [INFO] [utils.py:782:see_memory_usage] MA 8.98 GB Max_MA 11.97 GB CA 12.01 GB Max_CA 12 GB +[2026-03-05 15:01:56,445] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 82.97 GB, percent = 5.5% +[2026-03-05 15:01:56,445] [INFO] [stage_1_and_2.py:557:__init__] optimizer state initialized +[2026-03-05 15:01:56,681] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2026-03-05 15:01:56,682] [INFO] [utils.py:782:see_memory_usage] MA 8.98 GB Max_MA 8.98 GB CA 12.01 GB Max_CA 12 GB +[2026-03-05 15:01:56,682] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 82.96 GB, percent = 5.5% +[2026-03-05 15:01:56,683] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2026-03-05 15:01:56,684] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = None +[2026-03-05 15:01:56,684] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2026-03-05 15:01:56,684] [INFO] [logging.py:107:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999)] +[2026-03-05 15:01:56,684] [INFO] [config.py:1003:print] DeepSpeedEngine configuration: +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'intra_op_parallelism': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] amp_enabled .................. False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] amp_params ................... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] bfloat16_enabled ............. True +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] bfloat16_immediate_grad_update True +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] checkpoint_parallel_write_pipeline False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] checkpoint_tag_validation_enabled True +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] checkpoint_tag_validation_fail False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] comms_config ................. +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] communication_data_type ...... None +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] compile_config ............... deepcompile=False free_activation=False offload_activation=False offload_opt_states=False double_buffer=True symmetric_memory=False debug_log=False offload_parameters=False sync_before_reduce=False sync_after_reduce=False sync_before_allgather=False sync_after_allgather=False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] curriculum_enabled_legacy .... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] curriculum_params_legacy ..... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'pin_memory': False, 'curriculum_learning': {'enabled': False}, 'dynamic_batching': {'enabled': False, 'lr_scaling_method': 'linear', 'min_batch_size': 1, 'max_batch_size': None, 'sequence_picking_order': 'dataloader', 'verbose': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] data_efficiency_enabled ...... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] dataloader_drop_last ......... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] disable_allgather ............ False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] dump_state ................... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] dynamic_loss_scale_args ...... None +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_enabled ........... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_gas_boundary_resolution 1 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_layer_name ........ bert.encoder.layer +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_layer_num ......... 0 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_max_iter .......... 100 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_stability ......... 1e-06 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_tol ............... 0.01 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] eigenvalue_verbose ........... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] elasticity_enabled ........... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] fp16_auto_cast ............... None +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] fp16_enabled ................. False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] fp16_master_weights_and_gradients False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] global_rank .................. 0 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] grad_accum_dtype ............. None +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] gradient_accumulation_steps .. 32 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] gradient_clipping ............ 1.0 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] gradient_predivide_factor .... 1.0 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] graph_harvesting ............. False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] initial_dynamic_scale ........ 1 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] load_universal_checkpoint .... False +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] loss_scale ................... 1.0 +[2026-03-05 15:01:56,685] [INFO] [config.py:1007:print] memory_breakdown ............. False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] mics_hierarchial_params_gather False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] mics_shard_size .............. -1 +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] optimizer_legacy_fusion ...... False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] optimizer_name ............... None +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] optimizer_params ............. None +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] pld_enabled .................. False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] pld_params ................... False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] prescale_gradients ........... False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] scheduler_name ............... None +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] scheduler_params ............. None +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] seq_parallel_communication_data_type torch.float32 +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] sparse_attention ............. None +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] sparse_gradients_enabled ..... False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] steps_per_print .............. inf +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tp_overlap_comm=False tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] timers_config ................ enabled=True synchronized=True +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] train_batch_size ............. 256 +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] train_micro_batch_size_per_gpu 2 +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] use_data_before_expert_parallel_ False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] use_node_local_storage ....... False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] wall_clock_breakdown ......... False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] weight_quantization_config ... None +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] world_size ................... 4 +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] zero_allow_untested_optimizer True +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=True zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] zero_enabled ................. True +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] zero_force_ds_cpu_optimizer .. True +[2026-03-05 15:01:56,686] [INFO] [config.py:1007:print] zero_optimization_stage ...... 2 +[2026-03-05 15:01:56,686] [INFO] [config.py:993:print_user_config] json = { + "train_batch_size": 256, + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 32, + "gradient_clipping": 1.0, + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "contiguous_gradients": true, + "round_robin_gradients": true + }, + "steps_per_print": inf +} +[INFO|trainer.py:2519] 2026-03-05 15:01:56,687 >> ***** Running training ***** +[INFO|trainer.py:2520] 2026-03-05 15:01:56,687 >> Num examples = 43,525 +[INFO|trainer.py:2521] 2026-03-05 15:01:56,687 >> Num Epochs = 3 +[INFO|trainer.py:2522] 2026-03-05 15:01:56,687 >> Instantaneous batch size per device = 2 +[INFO|trainer.py:2525] 2026-03-05 15:01:56,687 >> Total train batch size (w. parallel, distributed & accumulation) = 256 +[INFO|trainer.py:2526] 2026-03-05 15:01:56,687 >> Gradient Accumulation steps = 32 +[INFO|trainer.py:2527] 2026-03-05 15:01:56,687 >> Total optimization steps = 513 +[INFO|trainer.py:2528] 2026-03-05 15:01:56,688 >> Number of trainable parameters = 3,212,749,824 +2026/03/05 15:01:56 INFO mlflow.tracking.fluent: Experiment with name 'llama3b_think_sft_nopack_lr1.5e5_ep3' does not exist. Creating a new experiment. + 0%| | 0/513 [00:00> Saving model checkpoint to /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200 +[INFO|configuration_utils.py:491] 2026-03-05 18:39:42,675 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/config.json +[INFO|configuration_utils.py:757] 2026-03-05 18:39:42,676 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/generation_config.json +[INFO|modeling_utils.py:4189] 2026-03-05 18:39:47,807 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/model.safetensors.index.json. +[INFO|tokenization_utils_base.py:2421] 2026-03-05 18:39:47,807 >> chat template saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/chat_template.jinja +[INFO|tokenization_utils_base.py:2590] 2026-03-05 18:39:47,809 >> tokenizer config file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/tokenizer_config.json +[INFO|tokenization_utils_base.py:2599] 2026-03-05 18:39:47,809 >> Special tokens file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/special_tokens_map.json +[2026-03-05 18:39:48,359] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved! +[2026-03-05 18:39:48,364] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/global_step200/mp_rank_00_model_states.pt +[2026-03-05 18:39:48,365] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/global_step200/mp_rank_00_model_states.pt... +[2026-03-05 18:39:54,671] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/global_step200/mp_rank_00_model_states.pt. +[2026-03-05 18:39:54,676] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2026-03-05 18:40:04,317] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2026-03-05 18:40:04,318] [INFO] [engine.py:3701:_save_zero_checkpoint] zero checkpoint saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2026-03-05 18:40:04,318] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now! + 39%|███▉ | 201/513 [3:39:24<7:00:57, 80.95s/it] 39%|███▉ | 202/513 [3:40:38<6:49:29, 79.00s/it] 40%|███▉ | 203/513 [3:41:57<6:47:34, 78.89s/it] 40%|███▉ | 204/513 [3:43:10<6:37:12, 77.13s/it] 40%|███▉ | 205/513 [3:44:19<6:23:12, 74.65s/it] 40%|████ | 206/513 [3:45:25<6:09:24, 72.20s/it] 40%|████ | 207/513 [3:46:32<6:00:47, 70.74s/it] 41%|████ | 208/513 [3:47:42<5:57:31, 70.33s/it] 41%|████ | 209/513 [3:48:59<6:06:21, 72.31s/it] 41%|████ | 210/513 [3:50:20<6:18:10, 74.89s/it] {'loss': 0.4765, 'grad_norm': 0.23004528880119324, 'learning_rate': 1.1101501885274894e-05, 'epoch': 1.23} + 41%|████ | 210/513 [3:50:20<6:18:10, 74.89s/it] 41%|████ | 211/513 [3:51:38<6:22:25, 75.98s/it] 41%|████▏ | 212/513 [3:52:58<6:26:32, 77.05s/it] 42%|████▏ | 213/513 [3:54:16<6:26:36, 77.32s/it] 42%|████▏ | 214/513 [3:55:36<6:29:09, 78.09s/it] 42%|████▏ | 215/513 [3:56:51<6:23:41, 77.25s/it] 42%|████▏ | 216/513 [3:57:58<6:06:39, 74.07s/it] 42%|████▏ | 217/513 [3:59:10<6:03:28, 73.68s/it] 42%|████▏ | 218/513 [4:00:26<6:05:07, 74.26s/it] 43%|████▎ | 219/513 [4:01:41<6:05:34, 74.61s/it] 43%|████▎ | 220/513 [4:02:56<6:05:02, 74.75s/it] {'loss': 0.4706, 'grad_norm': 0.23046620190143585, 'learning_rate': 1.0645168870035313e-05, 'epoch': 1.29} + 43%|████▎ | 220/513 [4:02:56<6:05:02, 74.75s/it] 43%|████▎ | 221/513 [4:04:12<6:04:21, 74.87s/it] 43%|████▎ | 222/513 [4:05:30<6:08:27, 75.97s/it] 43%|████▎ | 223/513 [4:06:44<6:04:43, 75.46s/it] 44%|████▎ | 224/513 [4:08:02<6:06:25, 76.08s/it] 44%|████▍ | 225/513 [4:09:23<6:12:25, 77.59s/it] 44%|████▍ | 226/513 [4:10:40<6:10:17, 77.41s/it] 44%|████▍ | 227/513 [4:11:49<5:56:40, 74.83s/it] 44%|████▍ | 228/513 [4:12:59<5:48:44, 73.42s/it] 45%|████▍ | 229/513 [4:14:13<5:48:23, 73.60s/it] 45%|████▍ | 230/513 [4:15:23<5:42:11, 72.55s/it] {'loss': 0.4681, 'grad_norm': 0.243893101811409, 'learning_rate': 1.0174235151272025e-05, 'epoch': 1.35} + 45%|████▍ | 230/513 [4:15:23<5:42:11, 72.55s/it] 45%|████▌ | 231/513 [4:16:34<5:38:50, 72.09s/it] 45%|████▌ | 232/513 [4:17:50<5:42:18, 73.09s/it] 45%|████▌ | 233/513 [4:19:03<5:41:07, 73.10s/it] 46%|████▌ | 234/513 [4:20:20<5:45:50, 74.37s/it] 46%|████▌ | 235/513 [4:21:38<5:50:15, 75.59s/it] 46%|████▌ | 236/513 [4:22:55<5:50:19, 75.88s/it] 46%|████▌ | 237/513 [4:24:18<5:59:04, 78.06s/it] 46%|████▋ | 238/513 [4:25:38<6:00:05, 78.57s/it] 47%|████▋ | 239/513 [4:26:55<5:57:23, 78.26s/it] 47%|████▋ | 240/513 [4:28:11<5:51:55, 77.34s/it] {'loss': 0.4715, 'grad_norm': 0.2657492160797119, 'learning_rate': 9.690886927529886e-06, 'epoch': 1.41} + 47%|████▋ | 240/513 [4:28:11<5:51:55, 77.34s/it] 47%|████▋ | 241/513 [4:29:29<5:51:53, 77.62s/it] 47%|████▋ | 242/513 [4:30:46<5:49:27, 77.37s/it] 47%|████▋ | 243/513 [4:32:00<5:43:59, 76.44s/it] 48%|████▊ | 244/513 [4:33:17<5:43:03, 76.52s/it] 48%|████▊ | 245/513 [4:34:34<5:42:26, 76.67s/it] 48%|████▊ | 246/513 [4:35:47<5:36:16, 75.57s/it] 48%|████▊ | 247/513 [4:37:01<5:33:14, 75.17s/it] 48%|████▊ | 248/513 [4:38:13<5:28:04, 74.28s/it] 49%|████▊ | 249/513 [4:39:31<5:31:48, 75.41s/it] 49%|████▊ | 250/513 [4:40:41<5:22:51, 73.66s/it] {'loss': 0.4711, 'grad_norm': 0.24003422260284424, 'learning_rate': 9.197368028760536e-06, 'epoch': 1.46} + 49%|████▊ | 250/513 [4:40:41<5:22:51, 73.66s/it] 49%|████▉ | 251/513 [4:41:55<5:22:39, 73.89s/it] 49%|████▉ | 252/513 [4:43:05<5:15:36, 72.55s/it] 49%|████▉ | 253/513 [4:44:25<5:24:30, 74.89s/it] 50%|████▉ | 254/513 [4:45:42<5:25:41, 75.45s/it] 50%|████▉ | 255/513 [4:46:59<5:27:25, 76.15s/it] 50%|████▉ | 256/513 [4:48:16<5:27:05, 76.36s/it] 50%|█████ | 257/513 [4:49:31<5:24:14, 76.00s/it] 50%|█████ | 258/513 [4:50:48<5:24:10, 76.28s/it] 50%|█████ | 259/513 [4:52:10<5:29:57, 77.94s/it] 51%|█████ | 260/513 [4:53:35<5:37:18, 80.00s/it] {'loss': 0.4685, 'grad_norm': 0.238833948969841, 'learning_rate': 8.695969499871911e-06, 'epoch': 1.52} + 51%|█████ | 260/513 [4:53:35<5:37:18, 80.00s/it] 51%|█████ | 261/513 [4:54:58<5:39:18, 80.79s/it] 51%|█████ | 262/513 [4:56:22<5:42:59, 81.99s/it] 51%|█████▏ | 263/513 [4:57:49<5:47:14, 83.34s/it] 51%|█████▏ | 264/513 [4:59:13<5:46:35, 83.52s/it] 52%|█████▏ | 265/513 [5:00:35<5:43:30, 83.10s/it] 52%|█████▏ | 266/513 [5:01:49<5:30:51, 80.37s/it] 52%|█████▏ | 267/513 [5:03:09<5:29:11, 80.29s/it] 52%|█████▏ | 268/513 [5:04:28<5:26:09, 79.88s/it] 52%|█████▏ | 269/513 [5:05:46<5:22:13, 79.24s/it] 53%|█████▎ | 270/513 [5:07:00<5:14:23, 77.63s/it] {'loss': 0.4688, 'grad_norm': 0.237404927611351, 'learning_rate': 8.18901896509343e-06, 'epoch': 1.58} + 53%|█████▎ | 270/513 [5:07:00<5:14:23, 77.63s/it] 53%|█████▎ | 271/513 [5:08:13<5:08:19, 76.44s/it] 53%|█████▎ | 272/513 [5:09:26<5:03:01, 75.44s/it] 53%|█████▎ | 273/513 [5:10:37<4:56:30, 74.13s/it] 53%|█████▎ | 274/513 [5:11:55<4:59:53, 75.29s/it] 54%|█████▎ | 275/513 [5:13:09<4:56:13, 74.68s/it] 54%|█████▍ | 276/513 [5:14:21<4:51:50, 73.89s/it] 54%|█████▍ | 277/513 [5:15:31<4:45:59, 72.71s/it] 54%|█████▍ | 278/513 [5:16:42<4:43:26, 72.37s/it] 54%|█████▍ | 279/513 [5:17:51<4:37:24, 71.13s/it] 55%|█████▍ | 280/513 [5:18:58<4:31:44, 69.97s/it] {'loss': 0.4722, 'grad_norm': 0.22758300602436066, 'learning_rate': 7.678869822530362e-06, 'epoch': 1.64} + 55%|█████▍ | 280/513 [5:18:58<4:31:44, 69.97s/it] 55%|█████▍ | 281/513 [5:20:12<4:35:18, 71.20s/it] 55%|█████▍ | 282/513 [5:21:22<4:33:14, 70.97s/it] 55%|█████▌ | 283/513 [5:22:33<4:31:39, 70.87s/it] 55%|█████▌ | 284/513 [5:23:36<4:21:30, 68.52s/it] 56%|█████▌ | 285/513 [5:24:35<4:09:22, 65.63s/it] 56%|█████▌ | 286/513 [5:25:51<4:20:45, 68.92s/it] 56%|█████▌ | 287/513 [5:27:07<4:27:15, 70.95s/it] 56%|█████▌ | 288/513 [5:28:24<4:33:03, 72.81s/it] 56%|█████▋ | 289/513 [5:29:39<4:33:49, 73.35s/it] 57%|█████▋ | 290/513 [5:30:51<4:31:33, 73.07s/it] {'loss': 0.4649, 'grad_norm': 0.22680319845676422, 'learning_rate': 7.167890319069035e-06, 'epoch': 1.7} + 57%|█████▋ | 290/513 [5:30:51<4:31:33, 73.07s/it] 57%|█████▋ | 291/513 [5:32:04<4:29:32, 72.85s/it] 57%|█████▋ | 292/513 [5:33:11<4:22:30, 71.27s/it] 57%|█████▋ | 293/513 [5:34:27<4:25:53, 72.52s/it] 57%|█████▋ | 294/513 [5:35:35<4:20:19, 71.32s/it] 58%|█████▊ | 295/513 [5:36:51<4:24:13, 72.72s/it] 58%|█████▊ | 296/513 [5:38:02<4:20:26, 72.01s/it] 58%|█████▊ | 297/513 [5:39:10<4:14:54, 70.81s/it] 58%|█████▊ | 298/513 [5:40:24<4:17:11, 71.77s/it] 58%|█████▊ | 299/513 [5:41:36<4:16:37, 71.95s/it] 58%|█████▊ | 300/513 [5:42:50<4:17:46, 72.61s/it] {'loss': 0.4692, 'grad_norm': 0.2401188611984253, 'learning_rate': 6.658452556350092e-06, 'epoch': 1.76} + 58%|█████▊ | 300/513 [5:42:50<4:17:46, 72.61s/it] 59%|█████▊ | 301/513 [5:44:01<4:14:16, 71.96s/it] 59%|█████▉ | 302/513 [5:45:14<4:14:21, 72.33s/it] 59%|█████▉ | 303/513 [5:46:22<4:09:20, 71.24s/it] 59%|█████▉ | 304/513 [5:47:33<4:06:58, 70.90s/it] 59%|█████▉ | 305/513 [5:48:46<4:08:04, 71.56s/it] 60%|█████▉ | 306/513 [5:49:56<4:05:37, 71.20s/it] 60%|█████▉ | 307/513 [5:51:07<4:04:30, 71.22s/it] 60%|██████ | 308/513 [5:52:18<4:02:48, 71.07s/it] 60%|██████ | 309/513 [5:53:27<3:59:22, 70.41s/it] 60%|██████ | 310/513 [5:54:40<4:01:06, 71.26s/it] {'loss': 0.4653, 'grad_norm': 0.2211555689573288, 'learning_rate': 6.152921478846986e-06, 'epoch': 1.82} + 60%|██████ | 310/513 [5:54:40<4:01:06, 71.26s/it] 61%|██████ | 311/513 [5:55:52<4:00:23, 71.40s/it] 61%|██████ | 312/513 [5:57:06<4:01:45, 72.17s/it] 61%|██████ | 313/513 [5:58:28<4:10:55, 75.28s/it] 61%|██████ | 314/513 [5:59:44<4:10:04, 75.40s/it] 61%|██████▏ | 315/513 [6:00:55<4:04:29, 74.09s/it] 62%|██████▏ | 316/513 [6:02:12<4:06:28, 75.07s/it] 62%|██████▏ | 317/513 [6:03:24<4:01:30, 73.93s/it] 62%|██████▏ | 318/513 [6:04:31<3:54:07, 72.04s/it] 62%|██████▏ | 319/513 [6:05:40<3:49:51, 71.09s/it] 62%|██████▏ | 320/513 [6:06:49<3:46:01, 70.27s/it] {'loss': 0.4664, 'grad_norm': 0.24088308215141296, 'learning_rate': 5.65364389516988e-06, 'epoch': 1.88} + 62%|██████▏ | 320/513 [6:06:49<3:46:01, 70.27s/it] 63%|██████▎ | 321/513 [6:07:58<3:44:27, 70.15s/it] 63%|██████▎ | 322/513 [6:09:07<3:42:09, 69.79s/it] 63%|██████▎ | 323/513 [6:10:18<3:41:37, 69.99s/it] 63%|██████▎ | 324/513 [6:11:27<3:40:09, 69.89s/it] 63%|██████▎ | 325/513 [6:12:39<3:41:00, 70.53s/it] 64%|██████▎ | 326/513 [6:13:49<3:39:18, 70.37s/it] 64%|██████▎ | 327/513 [6:15:01<3:39:05, 70.67s/it] 64%|██████▍ | 328/513 [6:16:11<3:37:20, 70.49s/it] 64%|██████▍ | 329/513 [6:17:22<3:36:42, 70.67s/it] 64%|██████▍ | 330/513 [6:18:35<3:37:31, 71.32s/it] {'loss': 0.4621, 'grad_norm': 0.21008798480033875, 'learning_rate': 5.162937583561072e-06, 'epoch': 1.94} + 64%|██████▍ | 330/513 [6:18:35<3:37:31, 71.32s/it] 65%|██████▍ | 331/513 [6:19:46<3:36:22, 71.33s/it] 65%|██████▍ | 332/513 [6:20:56<3:33:23, 70.74s/it] 65%|██████▍ | 333/513 [6:22:02<3:28:02, 69.35s/it] 65%|██████▌ | 334/513 [6:23:15<3:30:45, 70.65s/it] 65%|██████▌ | 335/513 [6:24:24<3:27:55, 70.09s/it] 65%|██████▌ | 336/513 [6:25:35<3:27:27, 70.33s/it] 66%|██████▌ | 337/513 [6:26:47<3:28:04, 70.94s/it] 66%|██████▌ | 338/513 [6:28:00<3:28:34, 71.51s/it] 66%|██████▌ | 339/513 [6:29:20<3:34:32, 73.98s/it] 66%|██████▋ | 340/513 [6:30:39<3:37:52, 75.57s/it] {'loss': 0.4675, 'grad_norm': 0.2156449556350708, 'learning_rate': 4.683080532156986e-06, 'epoch': 1.99} + 66%|██████▋ | 340/513 [6:30:39<3:37:52, 75.57s/it] 66%|██████▋ | 341/513 [6:31:58<3:39:38, 76.62s/it] 67%|██████▋ | 342/513 [6:32:01<2:35:06, 54.43s/it] 67%|██████▋ | 343/513 [6:33:14<2:50:14, 60.09s/it] 67%|██████▋ | 344/513 [6:34:23<2:56:50, 62.79s/it] 67%|██████▋ | 345/513 [6:35:36<3:03:46, 65.64s/it] 67%|██████▋ | 346/513 [6:36:48<3:08:42, 67.80s/it] 68%|██████▊ | 347/513 [6:37:59<3:09:30, 68.50s/it] 68%|██████▊ | 348/513 [6:39:07<3:08:06, 68.41s/it] 68%|██████▊ | 349/513 [6:40:22<3:12:57, 70.59s/it] 68%|██████▊ | 350/513 [6:41:31<3:10:29, 70.12s/it] {'loss': 0.44, 'grad_norm': 0.2731837034225464, 'learning_rate': 4.216300363966383e-06, 'epoch': 2.05} + 68%|██████▊ | 350/513 [6:41:31<3:10:29, 70.12s/it] 68%|██████▊ | 351/513 [6:42:42<3:09:28, 70.17s/it] 69%|██████▊ | 352/513 [6:43:55<3:10:41, 71.07s/it] 69%|██████▉ | 353/513 [6:45:04<3:08:03, 70.52s/it] 69%|██████▉ | 354/513 [6:46:20<3:11:00, 72.08s/it] 69%|██████▉ | 355/513 [6:48:06<3:36:42, 82.29s/it] 69%|██████▉ | 356/513 [6:49:51<3:53:13, 89.13s/it] 70%|██████▉ | 357/513 [6:51:36<4:04:24, 94.00s/it] 70%|██████▉ | 358/513 [6:53:13<4:04:47, 94.76s/it] 70%|██████▉ | 359/513 [6:54:52<4:06:36, 96.08s/it] 70%|███████ | 360/513 [6:56:27<4:04:18, 95.80s/it] {'loss': 0.4447, 'grad_norm': 0.2207324057817459, 'learning_rate': 3.7647639956567304e-06, 'epoch': 2.11} + 70%|███████ | 360/513 [6:56:27<4:04:18, 95.80s/it] 70%|███████ | 361/513 [6:58:07<4:05:31, 96.92s/it] 71%|███████ | 362/513 [6:59:46<4:05:43, 97.64s/it] 71%|███████ | 363/513 [7:01:26<4:05:39, 98.27s/it] 71%|███████ | 364/513 [7:03:06<4:05:30, 98.86s/it] 71%|███████ | 365/513 [7:04:49<4:06:41, 100.01s/it] 71%|███████▏ | 366/513 [7:06:27<4:03:31, 99.40s/it] 72%|███████▏ | 367/513 [7:08:02<3:58:36, 98.05s/it] 72%|███████▏ | 368/513 [7:09:39<3:56:06, 97.70s/it] 72%|███████▏ | 369/513 [7:11:17<3:55:01, 97.93s/it] 72%|███████▏ | 370/513 [7:12:51<3:50:49, 96.85s/it] {'loss': 0.4391, 'grad_norm': 0.21577142179012299, 'learning_rate': 3.3305675781554655e-06, 'epoch': 2.16} + 72%|███████▏ | 370/513 [7:12:51<3:50:49, 96.85s/it] 72%|███████▏ | 371/513 [7:14:26<3:47:24, 96.09s/it] 73%|███████▎ | 372/513 [7:16:03<3:46:28, 96.37s/it] 73%|███████▎ | 373/513 [7:17:40<3:45:18, 96.56s/it] 73%|███████▎ | 374/513 [7:19:19<3:45:54, 97.51s/it] 73%|███████▎ | 375/513 [7:21:03<3:48:29, 99.34s/it] 73%|███████▎ | 376/513 [7:22:47<3:50:01, 100.74s/it] 73%|███████▎ | 377/513 [7:24:27<3:47:46, 100.49s/it] 74%|███████▎ | 378/513 [7:26:02<3:42:32, 98.91s/it] 74%|███████▍ | 379/513 [7:27:39<3:39:13, 98.16s/it] 74%|███████▍ | 380/513 [7:29:12<3:34:33, 96.79s/it] {'loss': 0.4419, 'grad_norm': 0.22381627559661865, 'learning_rate': 2.915726765764453e-06, 'epoch': 2.22} + 74%|███████▍ | 380/513 [7:29:12<3:34:33, 96.79s/it] 74%|███████▍ | 381/513 [7:30:56<3:37:20, 98.79s/it] 74%|███████▍ | 382/513 [7:32:34<3:35:42, 98.80s/it] 75%|███████▍ | 383/513 [7:34:13<3:34:07, 98.83s/it] 75%|███████▍ | 384/513 [7:35:52<3:32:39, 98.91s/it] 75%|███████▌ | 385/513 [7:37:30<3:29:50, 98.37s/it] 75%|███████▌ | 386/513 [7:39:05<3:26:32, 97.58s/it] 75%|███████▌ | 387/513 [7:40:43<3:25:14, 97.73s/it] 76%|███████▌ | 388/513 [7:42:22<3:24:05, 97.96s/it] 76%|███████▌ | 389/513 [7:44:03<3:24:06, 98.77s/it] 76%|███████▌ | 390/513 [7:45:42<3:22:50, 98.95s/it] {'loss': 0.4377, 'grad_norm': 0.2167045623064041, 'learning_rate': 2.522167358961046e-06, 'epoch': 2.28} + 76%|███████▌ | 390/513 [7:45:42<3:22:50, 98.95s/it] 76%|███████▌ | 391/513 [7:47:18<3:19:30, 98.12s/it] 76%|███████▋ | 392/513 [7:48:59<3:19:23, 98.87s/it] 77%|███████▋ | 393/513 [7:50:44<3:21:25, 100.71s/it] 77%|███████▋ | 394/513 [7:52:26<3:20:24, 101.04s/it] 77%|███████▋ | 395/513 [7:54:08<3:19:38, 101.51s/it] 77%|███████▋ | 396/513 [7:55:48<3:16:44, 100.90s/it] 77%|███████▋ | 397/513 [7:57:21<3:10:55, 98.75s/it] 78%|███████▊ | 398/513 [7:58:55<3:06:12, 97.15s/it] 78%|███████▊ | 399/513 [8:00:31<3:04:07, 96.91s/it] 78%|███████▊ | 400/513 [8:02:07<3:01:47, 96.52s/it] {'loss': 0.4387, 'grad_norm': 0.2239835262298584, 'learning_rate': 2.151716364324264e-06, 'epoch': 2.34} + 78%|███████▊ | 400/513 [8:02:07<3:01:47, 96.52s/it][INFO|trainer.py:4309] 2026-03-05 23:04:07,534 >> Saving model checkpoint to /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400 +[INFO|configuration_utils.py:491] 2026-03-05 23:04:07,536 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/config.json +[INFO|configuration_utils.py:757] 2026-03-05 23:04:07,536 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/generation_config.json +[INFO|modeling_utils.py:4189] 2026-03-05 23:04:11,928 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/model.safetensors.index.json. +[INFO|tokenization_utils_base.py:2421] 2026-03-05 23:04:11,928 >> chat template saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/chat_template.jinja +[INFO|tokenization_utils_base.py:2590] 2026-03-05 23:04:11,930 >> tokenizer config file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/tokenizer_config.json +[INFO|tokenization_utils_base.py:2599] 2026-03-05 23:04:11,930 >> Special tokens file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/special_tokens_map.json +[2026-03-05 23:04:12,412] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved! +[2026-03-05 23:04:12,421] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/global_step400/mp_rank_00_model_states.pt +[2026-03-05 23:04:12,421] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/global_step400/mp_rank_00_model_states.pt... +[2026-03-05 23:04:18,139] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/global_step400/mp_rank_00_model_states.pt. +[2026-03-05 23:04:18,156] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2026-03-05 23:04:27,597] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2026-03-05 23:04:27,598] [INFO] [engine.py:3701:_save_zero_checkpoint] zero checkpoint saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2026-03-05 23:04:27,598] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now! + 78%|███████▊ | 401/513 [8:04:13<3:17:04, 105.58s/it] 78%|███████▊ | 402/513 [8:05:50<3:10:30, 102.98s/it] 79%|███████▊ | 403/513 [8:07:31<3:07:34, 102.31s/it] 79%|███████▉ | 404/513 [8:09:05<3:01:13, 99.75s/it] 79%|███████▉ | 405/513 [8:10:39<2:56:36, 98.11s/it] 79%|███████▉ | 406/513 [8:12:16<2:54:03, 97.60s/it] 79%|███████▉ | 407/513 [8:13:54<2:52:58, 97.91s/it] 80%|███████▉ | 408/513 [8:15:33<2:52:00, 98.29s/it] 80%|███████▉ | 409/513 [8:17:12<2:50:37, 98.44s/it] 80%|███████▉ | 410/513 [8:18:53<2:50:01, 99.05s/it] {'loss': 0.4426, 'grad_norm': 0.2177765816450119, 'learning_rate': 1.806093513088348e-06, 'epoch': 2.4} + 80%|███████▉ | 410/513 [8:18:53<2:50:01, 99.05s/it] 80%|████████ | 411/513 [8:20:36<2:50:30, 100.29s/it] 80%|████████ | 412/513 [8:22:19<2:50:16, 101.16s/it] 81%|████████ | 413/513 [8:24:01<2:49:09, 101.50s/it] 81%|████████ | 414/513 [8:25:42<2:47:04, 101.25s/it] 81%|████████ | 415/513 [8:27:22<2:44:35, 100.77s/it] 81%|████████ | 416/513 [8:28:58<2:40:49, 99.47s/it] 81%|████████▏ | 417/513 [8:30:39<2:39:41, 99.81s/it] 81%|████████▏ | 418/513 [8:32:15<2:36:23, 98.78s/it] 82%|████████▏ | 419/513 [8:33:58<2:36:53, 100.14s/it] 82%|████████▏ | 420/513 [8:35:37<2:34:34, 99.72s/it] {'loss': 0.442, 'grad_norm': 0.21108600497245789, 'learning_rate': 1.486903277696733e-06, 'epoch': 2.46} + 82%|████████▏ | 420/513 [8:35:37<2:34:34, 99.72s/it] 82%|████████▏ | 421/513 [8:37:15<2:32:06, 99.20s/it] 82%|████████▏ | 422/513 [8:38:53<2:29:38, 98.67s/it] 82%|████████▏ | 423/513 [8:40:29<2:26:47, 97.86s/it] 83%|████████▎ | 424/513 [8:42:08<2:25:41, 98.22s/it] 83%|████████▎ | 425/513 [8:43:43<2:22:43, 97.31s/it] 83%|████████▎ | 426/513 [8:45:17<2:19:56, 96.51s/it] 83%|████████▎ | 427/513 [8:46:56<2:19:22, 97.23s/it] 83%|████████▎ | 428/513 [8:48:38<2:19:39, 98.58s/it] 84%|████████▎ | 429/513 [8:50:21<2:19:38, 99.75s/it] 84%|████████▍ | 430/513 [8:51:59<2:17:23, 99.32s/it] {'loss': 0.4449, 'grad_norm': 0.20833276212215424, 'learning_rate': 1.1956274234177322e-06, 'epoch': 2.52} + 84%|████████▍ | 430/513 [8:51:59<2:17:23, 99.32s/it] 84%|████████▍ | 431/513 [8:53:40<2:16:20, 99.76s/it] 84%|████████▍ | 432/513 [8:55:21<2:15:23, 100.30s/it] 84%|████████▍ | 433/513 [8:57:04<2:14:44, 101.05s/it] 85%|████████▍ | 434/513 [8:58:41<2:11:28, 99.85s/it] 85%|████████▍ | 435/513 [9:00:21<2:09:56, 99.96s/it] 85%|████████▍ | 436/513 [9:01:55<2:05:51, 98.08s/it] 85%|████████▌ | 437/513 [9:03:30<2:02:56, 97.07s/it] 85%|████████▌ | 438/513 [9:05:10<2:02:39, 98.13s/it] 86%|████████▌ | 439/513 [9:06:52<2:02:14, 99.12s/it] 86%|████████▌ | 440/513 [9:08:26<1:59:00, 97.81s/it] {'loss': 0.4391, 'grad_norm': 0.20782434940338135, 'learning_rate': 9.336181295993204e-07, 'epoch': 2.58} + 86%|████████▌ | 440/513 [9:08:26<1:59:00, 97.81s/it] 86%|████████▌ | 441/513 [9:10:05<1:57:27, 97.89s/it] 86%|████████▌ | 442/513 [9:11:42<1:55:36, 97.70s/it] 86%|████████▋ | 443/513 [9:13:22<1:54:48, 98.41s/it] 87%|████████▋ | 444/513 [9:14:56<1:51:50, 97.25s/it] 87%|████████▋ | 445/513 [9:16:34<1:50:23, 97.40s/it] 87%|████████▋ | 446/513 [9:18:11<1:48:42, 97.35s/it] 87%|████████▋ | 447/513 [9:19:53<1:48:35, 98.72s/it] 87%|████████▋ | 448/513 [9:21:30<1:46:20, 98.17s/it] 88%|████████▊ | 449/513 [9:23:11<1:45:34, 98.97s/it] 88%|████████▊ | 450/513 [9:24:54<1:45:01, 100.03s/it] {'loss': 0.4374, 'grad_norm': 0.20101866126060486, 'learning_rate': 7.02091712495907e-07, 'epoch': 2.64} + 88%|████████▊ | 450/513 [9:24:54<1:45:01, 100.03s/it] 88%|████████▊ | 451/513 [9:26:33<1:43:13, 99.90s/it] 88%|████████▊ | 452/513 [9:28:08<1:39:56, 98.30s/it] 88%|████████▊ | 453/513 [9:29:46<1:38:21, 98.36s/it] 88%|████████▊ | 454/513 [9:31:28<1:37:36, 99.27s/it] 89%|████████▊ | 455/513 [9:33:02<1:34:30, 97.76s/it] 89%|████████▉ | 456/513 [9:34:28<1:29:28, 94.19s/it] 89%|████████▉ | 457/513 [9:36:01<1:27:46, 94.04s/it] 89%|████████▉ | 458/513 [9:37:43<1:28:13, 96.24s/it] 89%|████████▉ | 459/513 [9:39:25<1:28:08, 97.93s/it] 90%|████████▉ | 460/513 [9:41:06<1:27:17, 98.83s/it] {'loss': 0.4403, 'grad_norm': 0.1978382021188736, 'learning_rate': 5.021229788074589e-07, 'epoch': 2.69} + 90%|████████▉ | 460/513 [9:41:06<1:27:17, 98.83s/it] 90%|████████▉ | 461/513 [9:42:44<1:25:31, 98.69s/it] 90%|█████████ | 462/513 [9:44:22<1:23:49, 98.62s/it] 90%|█████████ | 463/513 [9:45:58<1:21:23, 97.67s/it] 90%|█████████ | 464/513 [9:47:35<1:19:37, 97.49s/it] 91%|█████████ | 465/513 [9:49:13<1:18:01, 97.53s/it] 91%|█████████ | 466/513 [9:50:51<1:16:38, 97.85s/it] 91%|█████████ | 467/513 [9:52:31<1:15:31, 98.52s/it] 91%|█████████ | 468/513 [9:54:11<1:14:09, 98.88s/it] 91%|█████████▏| 469/513 [9:55:46<1:11:35, 97.63s/it] 92%|█████████▏| 470/513 [9:57:23<1:10:00, 97.68s/it] {'loss': 0.44, 'grad_norm': 0.20072239637374878, 'learning_rate': 3.3464023614327683e-07, 'epoch': 2.75} + 92%|█████████▏| 470/513 [9:57:23<1:10:00, 97.68s/it] 92%|█████████▏| 471/513 [9:59:03<1:08:42, 98.16s/it] 92%|█████████▏| 472/513 [10:00:40<1:06:58, 98.01s/it] 92%|█████████▏| 473/513 [10:02:19<1:05:29, 98.23s/it] 92%|█████████▏| 474/513 [10:03:57<1:03:47, 98.15s/it] 93%|█████████▎| 475/513 [10:05:34<1:01:55, 97.78s/it] 93%|█████████▎| 476/513 [10:07:12<1:00:19, 97.82s/it] 93%|█████████▎| 477/513 [10:08:55<59:42, 99.51s/it] 93%|█████████▎| 478/513 [10:10:33<57:40, 98.88s/it] 93%|█████████▎| 479/513 [10:12:12<56:09, 99.10s/it] 94%|█████████▎| 480/513 [10:13:50<54:13, 98.58s/it] {'loss': 0.4358, 'grad_norm': 0.2036609798669815, 'learning_rate': 2.0042098357321209e-07, 'epoch': 2.81} + 94%|█████████▎| 480/513 [10:13:50<54:13, 98.58s/it] 94%|█████████▍| 481/513 [10:15:30<52:54, 99.20s/it] 94%|█████████▍| 482/513 [10:17:07<50:52, 98.48s/it] 94%|█████████▍| 483/513 [10:18:44<49:01, 98.05s/it] 94%|█████████▍| 484/513 [10:20:21<47:15, 97.79s/it] 95%|█████████▍| 485/513 [10:22:03<46:08, 98.87s/it] 95%|█████████▍| 486/513 [10:23:44<44:46, 99.49s/it] 95%|█████████▍| 487/513 [10:25:24<43:16, 99.85s/it] 95%|█████████▌| 488/513 [10:27:05<41:39, 99.99s/it] 95%|█████████▌| 489/513 [10:28:48<40:21, 100.90s/it] 96%|█████████▌| 490/513 [10:30:28<38:36, 100.70s/it] {'loss': 0.4384, 'grad_norm': 0.20166757702827454, 'learning_rate': 1.0008830227189431e-07, 'epoch': 2.87} + 96%|█████████▌| 490/513 [10:30:28<38:36, 100.70s/it] 96%|█████████▌| 491/513 [10:32:11<37:09, 101.36s/it] 96%|█████████▌| 492/513 [10:33:51<35:18, 100.89s/it] 96%|█████████▌| 493/513 [10:35:28<33:16, 99.81s/it] 96%|█████████▋| 494/513 [10:37:07<31:32, 99.60s/it] 96%|█████████▋| 495/513 [10:38:47<29:53, 99.66s/it] 97%|█████████▋| 496/513 [10:40:29<28:26, 100.40s/it] 97%|█████████▋| 497/513 [10:42:11<26:55, 100.98s/it] 97%|█████████▋| 498/513 [10:43:54<25:21, 101.40s/it] 97%|█████████▋| 499/513 [10:45:33<23:31, 100.84s/it] 97%|█████████▋| 500/513 [10:47:13<21:45, 100.40s/it] {'loss': 0.4438, 'grad_norm': 0.20334972441196442, 'learning_rate': 3.410796301156205e-08, 'epoch': 2.93} + 97%|█████████▋| 500/513 [10:47:13<21:45, 100.40s/it] 98%|█████████▊| 501/513 [10:48:51<19:57, 99.77s/it] 98%|█████████▊| 502/513 [10:50:27<18:05, 98.71s/it] 98%|█████████▊| 503/513 [10:51:57<16:01, 96.15s/it] 98%|█████████▊| 504/513 [10:53:36<14:32, 96.93s/it] 98%|█████████▊| 505/513 [10:55:18<13:08, 98.51s/it] 99%|█████████▊| 506/513 [10:56:58<11:33, 99.01s/it] 99%|█████████▉| 507/513 [10:58:40<09:58, 99.72s/it] 99%|█████████▉| 508/513 [11:00:08<08:01, 96.33s/it] 99%|█████████▉| 509/513 [11:01:53<06:35, 98.91s/it] 99%|█████████▉| 510/513 [11:03:34<04:58, 99.46s/it] {'loss': 0.4378, 'grad_norm': 0.20352092385292053, 'learning_rate': 2.7862639312792317e-09, 'epoch': 2.99} + 99%|█████████▉| 510/513 [11:03:34<04:58, 99.46s/it] 100%|█████████▉| 511/513 [11:05:14<03:19, 99.72s/it] 100%|█████████▉| 512/513 [11:06:53<01:39, 99.55s/it] 100%|██████████| 513/513 [11:06:57<00:00, 70.84s/it][INFO|trainer.py:4309] 2026-03-06 02:08:58,011 >> Saving model checkpoint to /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513 +[INFO|configuration_utils.py:491] 2026-03-06 02:08:58,013 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/config.json +[INFO|configuration_utils.py:757] 2026-03-06 02:08:58,014 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/generation_config.json +[INFO|modeling_utils.py:4189] 2026-03-06 02:09:02,393 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/model.safetensors.index.json. +[INFO|tokenization_utils_base.py:2421] 2026-03-06 02:09:02,393 >> chat template saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/chat_template.jinja +[INFO|tokenization_utils_base.py:2590] 2026-03-06 02:09:02,395 >> tokenizer config file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/tokenizer_config.json +[INFO|tokenization_utils_base.py:2599] 2026-03-06 02:09:02,395 >> Special tokens file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/special_tokens_map.json +[2026-03-06 02:09:02,887] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step513 is about to be saved! +[2026-03-06 02:09:02,900] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/global_step513/mp_rank_00_model_states.pt +[2026-03-06 02:09:02,900] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/global_step513/mp_rank_00_model_states.pt... +[2026-03-06 02:09:08,710] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/global_step513/mp_rank_00_model_states.pt. +[2026-03-06 02:09:08,719] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/global_step513/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2026-03-06 02:09:17,788] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/global_step513/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2026-03-06 02:09:17,788] [INFO] [engine.py:3701:_save_zero_checkpoint] zero checkpoint saved /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/checkpoint-513/global_step513/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2026-03-06 02:09:17,788] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step513 is ready now! +[INFO|trainer.py:2810] 2026-03-06 02:09:17,955 >> + +Training completed. Do not forget to share your model on huggingface.co/models =) + + + {'train_runtime': 40041.2675, 'train_samples_per_second': 3.261, 'train_steps_per_second': 0.013, 'train_loss': 0.49363853406255476, 'epoch': 3.0} + 100%|██████████| 513/513 [11:07:21<00:00, 70.84s/it] 100%|██████████| 513/513 [11:07:22<00:00, 78.06s/it] +[INFO|trainer.py:4309] 2026-03-06 02:09:21,898 >> Saving model checkpoint to /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3 +[INFO|configuration_utils.py:491] 2026-03-06 02:09:21,900 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/config.json +[INFO|configuration_utils.py:757] 2026-03-06 02:09:21,900 >> Configuration saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/generation_config.json +[INFO|modeling_utils.py:4189] 2026-03-06 02:09:26,695 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/model.safetensors.index.json. +[INFO|tokenization_utils_base.py:2421] 2026-03-06 02:09:26,695 >> chat template saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/chat_template.jinja +[INFO|tokenization_utils_base.py:2590] 2026-03-06 02:09:26,697 >> tokenizer config file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/tokenizer_config.json +[INFO|tokenization_utils_base.py:2599] 2026-03-06 02:09:26,697 >> Special tokens file saved in /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/special_tokens_map.json +***** train metrics ***** + epoch = 3.0 + total_flos = 11157838704GF + train_loss = 0.4936 + train_runtime = 11:07:21.26 + train_samples_per_second = 3.261 + train_steps_per_second = 0.013 +Figure saved at: /local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/training_loss.png +[WARNING|2026-03-06 02:09:27] llamafactory.extras.ploting:148 >> No metric eval_loss to plot. +[WARNING|2026-03-06 02:09:27] llamafactory.extras.ploting:148 >> No metric eval_accuracy to plot. +[INFO|modelcard.py:456] 2026-03-06 02:09:27,263 >> Dropping the following result as it does not have all the necessary fields: +{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}} diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c2e35c74b79744fd7ec39024868f294cce1123f8 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml @@ -0,0 +1,14 @@ +artifact_uri: file:///local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/artifacts +end_time: 1772791757971 +entry_point_name: '' +experiment_id: '356092632336622637' +lifecycle_stage: active +run_id: c370ae36b3594e5b8e4483476b3515b7 +run_name: llama3b_think_sft_nopack_lr1.5e5_ep3 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1772751716706 +status: 3 +tags: [] +user_id: salman diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch new file mode 100644 index 0000000000000000000000000000000000000000..97af6927422df272d54f9427901853e6fae9eacf --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch @@ -0,0 +1,52 @@ +1772752246047 0.05881271825032163 10 +1772752764661 0.11762543650064326 20 +1772753281010 0.1764381547509649 30 +1772753804852 0.23525087300128653 40 +1772754322796 0.29406359125160814 50 +1772754836358 0.3528763095019298 60 +1772755352360 0.4116890277522514 70 +1772755869425 0.47050174600257305 80 +1772756578289 0.5293144642528946 90 +1772757484272 0.5881271825032163 100 +1772758410313 0.6469399007535379 110 +1772759124014 0.7057526190038595 120 +1772759815988 0.7645653372541812 130 +1772760543838 0.8233780555045028 140 +1772761248573 0.8821907737548245 150 +1772761990136 0.9410034920051461 160 +1772762702603 0.9998162102554677 170 +1772763342537 1.0529314464252895 180 +1772764073344 1.1117441646756112 190 +1772764778282 1.1705568829259327 200 +1772765536908 1.2293696011762543 210 +1772766293667 1.288182319426576 220 +1772767040309 1.3469950376768978 230 +1772767807864 1.4058077559272193 240 +1772768557956 1.4646204741775408 250 +1772769332250 1.5234331924278626 260 +1772770136860 1.5822459106781843 270 +1772770855053 1.6410586289285058 280 +1772771568547 1.6998713471788274 290 +1772772287333 1.758684065429149 300 +1772772997323 1.8174967836794707 310 +1772773725742 1.8763095019297924 320 +1772774432045 1.935122220180114 330 +1772775156419 1.9939349384304355 340 +1772775808686 2.0470501746002574 350 +1772776704525 2.105862892850579 360 +1772777688582 2.1646756111009005 370 +1772778669427 2.2234883293512224 380 +1772779659140 2.282301047601544 390 +1772780643974 2.3411137658518655 400 +1772781649888 2.399926484102187 410 +1772782654359 2.4587392023525085 420 +1772783636091 2.5175519206028305 430 +1772784623680 2.576364638853152 440 +1772785610745 2.6351773571034736 450 +1772786582766 2.6939900753537955 460 +1772787560653 2.7528027936041166 470 +1772788546976 2.8116155118544386 480 +1772789545218 2.87042823010476 490 +1772790549864 2.9292409483550816 500 +1772791531194 2.9880536666054036 510 +1772791757957 3.0 513 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm new file mode 100644 index 0000000000000000000000000000000000000000..df72ee82e7695b2415f4cf800da3ff756244a06f --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm @@ -0,0 +1,51 @@ +1772752246047 1.3511555194854736 10 +1772752764661 0.7383383512496948 20 +1772753281010 0.47219017148017883 30 +1772753804852 0.30038249492645264 40 +1772754322796 0.2751595377922058 50 +1772754836358 0.26936954259872437 60 +1772755352360 0.25376981496810913 70 +1772755869425 0.2703434228897095 80 +1772756578289 0.3386951684951782 90 +1772757484272 0.30952027440071106 100 +1772758410313 0.2706937789916992 110 +1772759124014 0.286222368478775 120 +1772759815988 0.2553636431694031 130 +1772760543838 0.2975357472896576 140 +1772761248573 0.24958086013793945 150 +1772761990136 0.302441269159317 160 +1772762702603 0.24974007904529572 170 +1772763342537 0.35062289237976074 180 +1772764073344 0.28535276651382446 190 +1772764778282 0.2474713921546936 200 +1772765536908 0.23004528880119324 210 +1772766293667 0.23046620190143585 220 +1772767040309 0.243893101811409 230 +1772767807864 0.2657492160797119 240 +1772768557956 0.24003422260284424 250 +1772769332250 0.238833948969841 260 +1772770136860 0.237404927611351 270 +1772770855053 0.22758300602436066 280 +1772771568547 0.22680319845676422 290 +1772772287333 0.2401188611984253 300 +1772772997323 0.2211555689573288 310 +1772773725742 0.24088308215141296 320 +1772774432045 0.21008798480033875 330 +1772775156419 0.2156449556350708 340 +1772775808686 0.2731837034225464 350 +1772776704525 0.2207324057817459 360 +1772777688582 0.21577142179012299 370 +1772778669427 0.22381627559661865 380 +1772779659140 0.2167045623064041 390 +1772780643974 0.2239835262298584 400 +1772781649888 0.2177765816450119 410 +1772782654359 0.21108600497245789 420 +1772783636091 0.20833276212215424 430 +1772784623680 0.20782434940338135 440 +1772785610745 0.20101866126060486 450 +1772786582766 0.1978382021188736 460 +1772787560653 0.20072239637374878 470 +1772788546976 0.2036609798669815 480 +1772789545218 0.20166757702827454 490 +1772790549864 0.20334972441196442 500 +1772791531194 0.20352092385292053 510 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate new file mode 100644 index 0000000000000000000000000000000000000000..378f0b61caa000a20eff691f4b0d44b9a032ccd6 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate @@ -0,0 +1,51 @@ +1772752246047 2.596153846153846e-06 10 +1772752764661 5.480769230769231e-06 20 +1772753281010 8.365384615384616e-06 30 +1772753804852 1.125e-05 40 +1772754322796 1.4134615384615384e-05 50 +1772754836358 1.4991468156423456e-05 60 +1772755352360 1.494972625749433e-05 70 +1772755869425 1.4873400764197756e-05 80 +1772756578289 1.4762845999606666e-05 90 +1772757484272 1.4618575188100301e-05 100 +1772758410313 1.4441258072841264e-05 110 +1772759124014 1.4231717806651086e-05 120 +1772759815988 1.3990927130717711e-05 130 +1772760543838 1.3720003858874311e-05 140 +1772761248573 1.3420205688412603e-05 150 +1772761990136 1.3092924361520291e-05 160 +1772762702603 1.2739679204446694e-05 170 +1772763342537 1.236211007438955e-05 180 +1772764073344 1.1961969746845325e-05 190 +1772764778282 1.1541115778763038e-05 200 +1772765536908 1.1101501885274894e-05 210 +1772766293667 1.0645168870035313e-05 220 +1772767040309 1.0174235151272025e-05 230 +1772767807864 9.690886927529886e-06 240 +1772768557956 9.197368028760536e-06 250 +1772769332250 8.695969499871911e-06 260 +1772770136860 8.18901896509343e-06 270 +1772770855053 7.678869822530362e-06 280 +1772771568547 7.167890319069035e-06 290 +1772772287333 6.658452556350092e-06 300 +1772772997323 6.152921478846986e-06 310 +1772773725742 5.65364389516988e-06 320 +1772774432045 5.162937583561072e-06 330 +1772775156419 4.683080532156986e-06 340 +1772775808686 4.216300363966383e-06 350 +1772776704525 3.7647639956567304e-06 360 +1772777688582 3.3305675781554655e-06 370 +1772778669427 2.915726765764453e-06 380 +1772779659140 2.522167358961046e-06 390 +1772780643974 2.151716364324264e-06 400 +1772781649888 1.806093513088348e-06 410 +1772782654359 1.486903277696733e-06 420 +1772783636091 1.1956274234177322e-06 430 +1772784623680 9.336181295993204e-07 440 +1772785610745 7.02091712495907e-07 450 +1772786582766 5.021229788074589e-07 460 +1772787560653 3.3464023614327683e-07 470 +1772788546976 2.0042098357321209e-07 480 +1772789545218 1.0008830227189431e-07 490 +1772790549864 3.410796301156205e-08 500 +1772791531194 2.7862639312792317e-09 510 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss new file mode 100644 index 0000000000000000000000000000000000000000..71f1daff0141a49f6122575354fb5e356362a096 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss @@ -0,0 +1,51 @@ +1772752246047 0.8486 10 +1772752764661 0.7412 20 +1772753281010 0.6532 30 +1772753804852 0.6102 40 +1772754322796 0.5784 50 +1772754836358 0.5641 60 +1772755352360 0.5469 70 +1772755869425 0.5424 80 +1772756578289 0.5293 90 +1772757484272 0.5266 100 +1772758410313 0.522 110 +1772759124014 0.5222 120 +1772759815988 0.5106 130 +1772760543838 0.5114 140 +1772761248573 0.5099 150 +1772761990136 0.5086 160 +1772762702603 0.5061 170 +1772763342537 0.4746 180 +1772764073344 0.478 190 +1772764778282 0.4755 200 +1772765536908 0.4765 210 +1772766293667 0.4706 220 +1772767040309 0.4681 230 +1772767807864 0.4715 240 +1772768557956 0.4711 250 +1772769332250 0.4685 260 +1772770136860 0.4688 270 +1772770855053 0.4722 280 +1772771568547 0.4649 290 +1772772287333 0.4692 300 +1772772997323 0.4653 310 +1772773725742 0.4664 320 +1772774432045 0.4621 330 +1772775156419 0.4675 340 +1772775808686 0.44 350 +1772776704525 0.4447 360 +1772777688582 0.4391 370 +1772778669427 0.4419 380 +1772779659140 0.4377 390 +1772780643974 0.4387 400 +1772781649888 0.4426 410 +1772782654359 0.442 420 +1772783636091 0.4449 430 +1772784623680 0.4391 440 +1772785610745 0.4374 450 +1772786582766 0.4403 460 +1772787560653 0.44 470 +1772788546976 0.4358 480 +1772789545218 0.4384 490 +1772790549864 0.4438 500 +1772791531194 0.4378 510 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos new file mode 100644 index 0000000000000000000000000000000000000000..9a802873e27ec4d9646414f92508dca5f1d7f0e3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos @@ -0,0 +1 @@ +1772791757957 1.1980638081930756e+19 513 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss new file mode 100644 index 0000000000000000000000000000000000000000..3bbb73714175b21eef7cfbdb6eae39fbfdd30604 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss @@ -0,0 +1 @@ +1772791757957 0.49363853406255476 513 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime new file mode 100644 index 0000000000000000000000000000000000000000..ddd75666b9e8558533c6490b184e7157dda0bd77 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime @@ -0,0 +1 @@ +1772791757957 40041.2675 513 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second new file mode 100644 index 0000000000000000000000000000000000000000..dbc59d8a207474395bbdf872cc548d3c14a08285 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second @@ -0,0 +1 @@ +1772791757957 3.261 513 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second new file mode 100644 index 0000000000000000000000000000000000000000..d9172f42961505aefb9b26fd11e2fec900f727e5 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second @@ -0,0 +1 @@ +1772791757957 0.013 513 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path new file mode 100644 index 0000000000000000000000000000000000000000..d8e3ea0bc188cdd969fb8d4e7cf8f2f2c4523f88 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path @@ -0,0 +1 @@ +/local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config new file mode 100644 index 0000000000000000000000000000000000000000..b1e7502bd7285c87efc171b09bce733551c8d179 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config @@ -0,0 +1 @@ +{'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1 b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1 new file mode 100644 index 0000000000000000000000000000000000000000..9a7d84f2a96bb56f53bfc3a42ac10d06459e55c3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1 @@ -0,0 +1 @@ +0.9 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2 b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2 new file mode 100644 index 0000000000000000000000000000000000000000..79cbfdf0652c46b13ed8946e54aa94ff7bdd44ab --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2 @@ -0,0 +1 @@ +0.999 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon new file mode 100644 index 0000000000000000000000000000000000000000..851199be9c9a0b8c721d7f305f5af1759637102d --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon @@ -0,0 +1 @@ +1e-08 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures new file mode 100644 index 0000000000000000000000000000000000000000..fe06827e8a90199228b9e7009f79062405f3d52f --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures @@ -0,0 +1 @@ +['LlamaForCausalLM'] \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout new file mode 100644 index 0000000000000000000000000000000000000000..171538eb0b00f4eddffa17929796de55b838f34b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout @@ -0,0 +1 @@ +0.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16 b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16 new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16 @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id new file mode 100644 index 0000000000000000000000000000000000000000..5499007cbac38bc897e3c2766b82a647ac28c735 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id @@ -0,0 +1 @@ +128000 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers new file mode 100644 index 0000000000000000000000000000000000000000..bf0d87ab1b2b0ec1a11a3973d2845b42413d9767 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers @@ -0,0 +1 @@ +4 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout new file mode 100644 index 0000000000000000000000000000000000000000..ea953a778190bc7131bea1a3f79bd88ff34fc5d1 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout @@ -0,0 +1 @@ +180000000 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug new file mode 100644 index 0000000000000000000000000000000000000000..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/deepspeed b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/deepspeed new file mode 100644 index 0000000000000000000000000000000000000000..c76c24a25f92f79e456f4367b44d5089cf94aac6 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/deepspeed @@ -0,0 +1 @@ +examples/deepspeed/ds_z2_config.json \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/disable_tqdm b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/disable_tqdm new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/disable_tqdm @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/diversity_penalty b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/diversity_penalty new file mode 100644 index 0000000000000000000000000000000000000000..171538eb0b00f4eddffa17929796de55b838f34b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/diversity_penalty @@ -0,0 +1 @@ +0.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_eval b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_eval new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_eval @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_predict b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_predict new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_predict @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_sample b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_sample new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_sample @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_train b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_train new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/do_train @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dtype b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dtype new file mode 100644 index 0000000000000000000000000000000000000000..8481ec0098496c454d11e66437510c620f01aa78 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dtype @@ -0,0 +1 @@ +bfloat16 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/early_stopping b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/early_stopping new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/early_stopping @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/encoder_no_repeat_ngram_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/encoder_no_repeat_ngram_size new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/encoder_no_repeat_ngram_size @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eos_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eos_token_id new file mode 100644 index 0000000000000000000000000000000000000000..1fc3affa551d3f978dc4cd9d6dd031cb9031bcd5 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eos_token_id @@ -0,0 +1 @@ +128009 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_accumulation_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_accumulation_steps new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_accumulation_steps @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_delay b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_delay new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_delay @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_do_concat_batches b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_do_concat_batches new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_do_concat_batches @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_on_start b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_on_start new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_on_start @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_steps new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_steps @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_strategy b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_strategy new file mode 100644 index 0000000000000000000000000000000000000000..54299a48fb3ae76c848b3acc12248574d05d81b8 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_strategy @@ -0,0 +1 @@ +no \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_use_gather_object b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_use_gather_object new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/eval_use_gather_object @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/exponential_decay_length_penalty b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/exponential_decay_length_penalty new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/exponential_decay_length_penalty @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/finetuning_task b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/finetuning_task new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/finetuning_task @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/forced_bos_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/forced_bos_token_id new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/forced_bos_token_id @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/forced_eos_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/forced_eos_token_id new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/forced_eos_token_id @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16 b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16 new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16 @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_backend b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_backend new file mode 100644 index 0000000000000000000000000000000000000000..4d18c3e59ecf5c28b46b06ce26f2406b2d449870 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_backend @@ -0,0 +1 @@ +auto \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_full_eval b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_full_eval new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_full_eval @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_opt_level b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_opt_level new file mode 100644 index 0000000000000000000000000000000000000000..a9ada426ac8819467c6dc392dcbea40183a3e16e --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fp16_opt_level @@ -0,0 +1 @@ +O1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp new file mode 100644 index 0000000000000000000000000000000000000000..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_config b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_config new file mode 100644 index 0000000000000000000000000000000000000000..9d33480169a14dfac929530aefc3cd1f5776a983 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_config @@ -0,0 +1 @@ +{'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_min_num_params b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_min_num_params new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_min_num_params @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_transformer_layer_cls_to_wrap b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_transformer_layer_cls_to_wrap new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/fsdp_transformer_layer_cls_to_wrap @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/full_determinism b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/full_determinism new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/full_determinism @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_config b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_config new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_config @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_max_length b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_max_length new file mode 100644 index 0000000000000000000000000000000000000000..e0c3f84a6747696c58b1a32f81129c66498e094a --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_max_length @@ -0,0 +1 @@ +8192 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_num_beams b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_num_beams new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/generation_num_beams @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_accumulation_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_accumulation_steps new file mode 100644 index 0000000000000000000000000000000000000000..1758dddccea2b3b02d21228a0d06a45a35c0d861 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_accumulation_steps @@ -0,0 +1 @@ +32 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_checkpointing b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_checkpointing new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_checkpointing @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_checkpointing_kwargs b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_checkpointing_kwargs new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/gradient_checkpointing_kwargs @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/greater_is_better b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/greater_is_better new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/greater_is_better @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/group_by_length b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/group_by_length new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/group_by_length @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/half_precision_backend b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/half_precision_backend new file mode 100644 index 0000000000000000000000000000000000000000..4d18c3e59ecf5c28b46b06ce26f2406b2d449870 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/half_precision_backend @@ -0,0 +1 @@ +auto \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/head_dim b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/head_dim new file mode 100644 index 0000000000000000000000000000000000000000..b854a292176003137b48b2f2eb6267c6c3085c9b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/head_dim @@ -0,0 +1 @@ +128 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hidden_act b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hidden_act new file mode 100644 index 0000000000000000000000000000000000000000..84972cd9564e61cac416981cb71bb1e176046f68 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hidden_act @@ -0,0 +1 @@ +silu \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hidden_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hidden_size new file mode 100644 index 0000000000000000000000000000000000000000..489250e329290c73b6a137d9af9a29e421e6e1d7 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hidden_size @@ -0,0 +1 @@ +3072 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_always_push b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_always_push new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_always_push @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_model_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_model_id new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_model_id @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_private_repo b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_private_repo new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_private_repo @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_revision b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_revision new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_revision @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_strategy b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_strategy new file mode 100644 index 0000000000000000000000000000000000000000..8532b12ca8add8fe61b84623fab9d559a366ce3c --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_strategy @@ -0,0 +1 @@ +every_save \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_token b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_token new file mode 100644 index 0000000000000000000000000000000000000000..0a574a354979ef783f5f4fe08c3595f79596ff41 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/hub_token @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/id2label b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/id2label new file mode 100644 index 0000000000000000000000000000000000000000..74c276dcae370126a18f5657c0e1ed72e72325e9 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/id2label @@ -0,0 +1 @@ +{0: 'LABEL_0', 1: 'LABEL_1'} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ignore_data_skip b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ignore_data_skip new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ignore_data_skip @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_for_metrics b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_for_metrics new file mode 100644 index 0000000000000000000000000000000000000000..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_for_metrics @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_inputs_for_metrics b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_inputs_for_metrics new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_inputs_for_metrics @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_num_input_tokens_seen b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_num_input_tokens_seen new file mode 100644 index 0000000000000000000000000000000000000000..54299a48fb3ae76c848b3acc12248574d05d81b8 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_num_input_tokens_seen @@ -0,0 +1 @@ +no \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_tokens_per_second b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_tokens_per_second new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/include_tokens_per_second @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/initializer_range b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/initializer_range new file mode 100644 index 0000000000000000000000000000000000000000..79dd775c1e90ab736c362ede2f2332678eccf47e --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/initializer_range @@ -0,0 +1 @@ +0.02 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/intermediate_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/intermediate_size new file mode 100644 index 0000000000000000000000000000000000000000..e0c3f84a6747696c58b1a32f81129c66498e094a --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/intermediate_size @@ -0,0 +1 @@ +8192 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/is_decoder b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/is_decoder new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/is_decoder @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/is_encoder_decoder b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/is_encoder_decoder new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/is_encoder_decoder @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/jit_mode_eval b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/jit_mode_eval new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/jit_mode_eval @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label2id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label2id new file mode 100644 index 0000000000000000000000000000000000000000..0589857be5c3ad7b568bf7c79a4172a5aa887693 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label2id @@ -0,0 +1 @@ +{'LABEL_0': 0, 'LABEL_1': 1} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label_names b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label_names new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label_names @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label_smoothing_factor b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label_smoothing_factor new file mode 100644 index 0000000000000000000000000000000000000000..171538eb0b00f4eddffa17929796de55b838f34b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/label_smoothing_factor @@ -0,0 +1 @@ +0.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/learning_rate b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/learning_rate new file mode 100644 index 0000000000000000000000000000000000000000..851afc92400e328f36a83152a5c0a3ae9da45fca --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/learning_rate @@ -0,0 +1 @@ +1.5e-05 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/length_column_name b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/length_column_name new file mode 100644 index 0000000000000000000000000000000000000000..c2e7ec839dabf14d5d59f187c6b8fdb3460872aa --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/length_column_name @@ -0,0 +1 @@ +length \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/length_penalty b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/length_penalty new file mode 100644 index 0000000000000000000000000000000000000000..9f8e9b69a33f4e8067d5b21661a35d8856758aba --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/length_penalty @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/liger_kernel_config b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/liger_kernel_config new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/liger_kernel_config @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/load_best_model_at_end b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/load_best_model_at_end new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/load_best_model_at_end @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/local_rank b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/local_rank new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/local_rank @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_level b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_level new file mode 100644 index 0000000000000000000000000000000000000000..ecf328558d66d304c19bdd373f647085a3f0880d --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_level @@ -0,0 +1 @@ +passive \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_level_replica b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_level_replica new file mode 100644 index 0000000000000000000000000000000000000000..14b472df8d4481c6fea79c066ae4650980f02b7c --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_level_replica @@ -0,0 +1 @@ +warning \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_on_each_node b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_on_each_node new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/log_on_each_node @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_dir b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_dir new file mode 100644 index 0000000000000000000000000000000000000000..1d04fea0ac87c57b36d52dc646937dfe05738ef6 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_dir @@ -0,0 +1 @@ +/local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/runs/Mar05_15-00-58_raven \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_first_step b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_first_step new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_first_step @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_nan_inf_filter b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_nan_inf_filter new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_nan_inf_filter @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_steps new file mode 100644 index 0000000000000000000000000000000000000000..9a037142aa3c1b4c490e1a38251620f113465330 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_steps @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_strategy b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_strategy new file mode 100644 index 0000000000000000000000000000000000000000..17f15e19cf5e8064aff8d528657b70e9611eb59e --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/logging_strategy @@ -0,0 +1 @@ +steps \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/lr_scheduler_kwargs b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/lr_scheduler_kwargs new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/lr_scheduler_kwargs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/lr_scheduler_type b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/lr_scheduler_type new file mode 100644 index 0000000000000000000000000000000000000000..84aa3999b5b7cae7f78b1f77e04d182643005a92 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/lr_scheduler_type @@ -0,0 +1 @@ +cosine \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_grad_norm b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_grad_norm new file mode 100644 index 0000000000000000000000000000000000000000..9f8e9b69a33f4e8067d5b21661a35d8856758aba --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_grad_norm @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_length b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_length new file mode 100644 index 0000000000000000000000000000000000000000..2edeafb09db0093bae6ff060e2dcd2166f5c9387 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_length @@ -0,0 +1 @@ +20 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_position_embeddings b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_position_embeddings new file mode 100644 index 0000000000000000000000000000000000000000..41fc5b8c922eda0b84dd1d03152fd870cd1a8295 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_position_embeddings @@ -0,0 +1 @@ +131072 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_steps new file mode 100644 index 0000000000000000000000000000000000000000..d7d17fcbef95ca19081c4cc5e97cbc592cc7081f --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/max_steps @@ -0,0 +1 @@ +-1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/metric_for_best_model b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/metric_for_best_model new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/metric_for_best_model @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/min_length b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/min_length new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/min_length @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/mlp_bias b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/mlp_bias new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/mlp_bias @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/model_type b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/model_type new file mode 100644 index 0000000000000000000000000000000000000000..056bf100b8c479097594926c1ab454fe1ee0f39c --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/model_type @@ -0,0 +1 @@ +llama \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/mp_parameters b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/mp_parameters new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/neftune_noise_alpha b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/neftune_noise_alpha new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/neftune_noise_alpha @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/no_cuda b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/no_cuda new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/no_cuda @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/no_repeat_ngram_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/no_repeat_ngram_size new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/no_repeat_ngram_size @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_attention_heads b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_attention_heads new file mode 100644 index 0000000000000000000000000000000000000000..cabf43b5ddf813cbe89697372a21373f14921884 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_attention_heads @@ -0,0 +1 @@ +24 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_beam_groups b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_beam_groups new file mode 100644 index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_beam_groups @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_beams b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_beams new file mode 100644 index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_beams @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_hidden_layers b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_hidden_layers new file mode 100644 index 0000000000000000000000000000000000000000..368f89ceef179cc546403ac0d5ef1d0e4b340447 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_hidden_layers @@ -0,0 +1 @@ +28 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_key_value_heads b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_key_value_heads new file mode 100644 index 0000000000000000000000000000000000000000..301160a93062df23030a69f4b5e4d9bf71866ee9 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_key_value_heads @@ -0,0 +1 @@ +8 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_return_sequences b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_return_sequences new file mode 100644 index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_return_sequences @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_train_epochs b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_train_epochs new file mode 100644 index 0000000000000000000000000000000000000000..f398a20612afa114338cf6fec4d5378e51473059 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/num_train_epochs @@ -0,0 +1 @@ +3.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim new file mode 100644 index 0000000000000000000000000000000000000000..2fd30f30cf2a0413799ab7959d66333f63162f20 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim @@ -0,0 +1 @@ +adamw_torch \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim_args b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim_args new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim_args @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim_target_modules b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim_target_modules new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/optim_target_modules @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_attentions b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_attentions new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_attentions @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_dir b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_dir new file mode 100644 index 0000000000000000000000000000000000000000..c30e0dc20233bf05b963489ad59af173f7461fed --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_dir @@ -0,0 +1 @@ +/local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_hidden_states b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_hidden_states new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_hidden_states @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_scores b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_scores new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/output_scores @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/overwrite_output_dir b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/overwrite_output_dir new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/overwrite_output_dir @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pad_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pad_token_id new file mode 100644 index 0000000000000000000000000000000000000000..f3b3deeda7ab5834f5e3e5a2ed8083a147e11f2a --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pad_token_id @@ -0,0 +1 @@ +128001 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/parallelism_config b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/parallelism_config new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/parallelism_config @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/past_index b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/past_index new file mode 100644 index 0000000000000000000000000000000000000000..d7d17fcbef95ca19081c4cc5e97cbc592cc7081f --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/past_index @@ -0,0 +1 @@ +-1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_device_eval_batch_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_device_eval_batch_size new file mode 100644 index 0000000000000000000000000000000000000000..301160a93062df23030a69f4b5e4d9bf71866ee9 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_device_eval_batch_size @@ -0,0 +1 @@ +8 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_device_train_batch_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_device_train_batch_size new file mode 100644 index 0000000000000000000000000000000000000000..d8263ee9860594d2806b0dfd1bfd17528b0ba2a4 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_device_train_batch_size @@ -0,0 +1 @@ +2 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_gpu_eval_batch_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_gpu_eval_batch_size new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_gpu_eval_batch_size @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_gpu_train_batch_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_gpu_train_batch_size new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/per_gpu_train_batch_size @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/placement_strategy b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/placement_strategy new file mode 100644 index 0000000000000000000000000000000000000000..0d156a4c857f8ef50c1dc4c08dc2d14fbf2e8bf1 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/placement_strategy @@ -0,0 +1 @@ +PACK \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/predict_with_generate b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/predict_with_generate new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/predict_with_generate @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/prediction_loss_only b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/prediction_loss_only new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/prediction_loss_only @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/prefix b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/prefix new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/prefix @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pretraining_tp b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pretraining_tp new file mode 100644 index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pretraining_tp @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/problem_type b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/problem_type new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/problem_type @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/project b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/project new file mode 100644 index 0000000000000000000000000000000000000000..58df447a1503b0f78c9105cdd52cf573b95f390f --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/project @@ -0,0 +1 @@ +huggingface \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pruned_heads b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pruned_heads new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/pruned_heads @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_model_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_model_id new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_model_id @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_organization b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_organization new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_organization @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_token b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_token new file mode 100644 index 0000000000000000000000000000000000000000..36e61093756f7c43b24cd50fc63164c08bcf50f1 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/push_to_hub_token @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_init_kwargs b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_init_kwargs new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_init_kwargs @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_num_workers b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_num_workers new file mode 100644 index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_num_workers @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_run_name b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_run_name new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_run_name @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_scope b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_scope new file mode 100644 index 0000000000000000000000000000000000000000..1c1206e8bf4337e96dad9a6d139628852077558d --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_scope @@ -0,0 +1 @@ +last \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_storage_filesystem b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_storage_filesystem new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_storage_filesystem @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_storage_path b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_storage_path new file mode 100644 index 0000000000000000000000000000000000000000..c6962e171e535e015188e2c019a938766fa0d574 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ray_storage_path @@ -0,0 +1 @@ +./saves \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/remove_invalid_values b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/remove_invalid_values new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/remove_invalid_values @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/remove_unused_columns b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/remove_unused_columns new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/remove_unused_columns @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/repetition_penalty b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/repetition_penalty new file mode 100644 index 0000000000000000000000000000000000000000..9f8e9b69a33f4e8067d5b21661a35d8856758aba --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/repetition_penalty @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/report_to b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/report_to new file mode 100644 index 0000000000000000000000000000000000000000..95988dbfb90ccf107f671f17824204c92d3d1674 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/report_to @@ -0,0 +1 @@ +['mlflow'] \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/resources_per_worker b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/resources_per_worker new file mode 100644 index 0000000000000000000000000000000000000000..37cf4fc6d29e1d007e764540c6445ce887916e44 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/resources_per_worker @@ -0,0 +1 @@ +{'GPU': 1} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/restore_callback_states_from_checkpoint b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/restore_callback_states_from_checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/restore_callback_states_from_checkpoint @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/resume_from_checkpoint b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/resume_from_checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/resume_from_checkpoint @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/return_dict b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/return_dict new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/return_dict @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/return_dict_in_generate b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/return_dict_in_generate new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/return_dict_in_generate @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rms_norm_eps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rms_norm_eps new file mode 100644 index 0000000000000000000000000000000000000000..5868ff147459cee04c24f2de58e75969024870b8 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rms_norm_eps @@ -0,0 +1 @@ +1e-05 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rope_scaling b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rope_scaling new file mode 100644 index 0000000000000000000000000000000000000000..9215f7897f1cde9615d8a2cbc581da576c91844b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rope_scaling @@ -0,0 +1 @@ +{'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'} \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rope_theta b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rope_theta new file mode 100644 index 0000000000000000000000000000000000000000..0dc280f5f9cfaea3b39e34bef5251cb1bbaf3bf8 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/rope_theta @@ -0,0 +1 @@ +500000.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/run_name b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/run_name new file mode 100644 index 0000000000000000000000000000000000000000..0f23e0877477455534bae4babc7610a7dc7cea17 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/run_name @@ -0,0 +1 @@ +llama3b_think_sft_nopack_lr1.5e5_ep3 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_on_each_node b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_on_each_node new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_on_each_node @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_only_model b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_only_model new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_only_model @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_safetensors b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_safetensors @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_steps new file mode 100644 index 0000000000000000000000000000000000000000..ae4ee13c08e7628701b925b8962108bd7643bf6e --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_steps @@ -0,0 +1 @@ +200 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_strategy b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_strategy new file mode 100644 index 0000000000000000000000000000000000000000..17f15e19cf5e8064aff8d528657b70e9611eb59e --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_strategy @@ -0,0 +1 @@ +steps \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_total_limit b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_total_limit new file mode 100644 index 0000000000000000000000000000000000000000..7813681f5b41c028345ca62a2be376bae70b7f61 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/save_total_limit @@ -0,0 +1 @@ +5 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/seed b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/seed new file mode 100644 index 0000000000000000000000000000000000000000..f70d7bba4ae1f07682e0358bd7a2068094fc023b --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/seed @@ -0,0 +1 @@ +42 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/sep_token_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/sep_token_id new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/sep_token_id @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/skip_memory_metrics b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/skip_memory_metrics new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/skip_memory_metrics @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/sortish_sampler b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/sortish_sampler new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/sortish_sampler @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/suppress_tokens b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/suppress_tokens new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/suppress_tokens @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/task_specific_params b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/task_specific_params new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/task_specific_params @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/temperature b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/temperature new file mode 100644 index 0000000000000000000000000000000000000000..9f8e9b69a33f4e8067d5b21661a35d8856758aba --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/temperature @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tf32 b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tf32 new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tf32 @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tf_legacy_loss b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tf_legacy_loss new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tf_legacy_loss @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tie_encoder_decoder b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tie_encoder_decoder new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tie_encoder_decoder @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tie_word_embeddings b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tie_word_embeddings new file mode 100644 index 0000000000000000000000000000000000000000..4791ed5559bd77f54e1520025768e2b368705876 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tie_word_embeddings @@ -0,0 +1 @@ +True \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tokenizer_class b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tokenizer_class new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tokenizer_class @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/top_k b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/top_k new file mode 100644 index 0000000000000000000000000000000000000000..c5b431b6cba29540b4b284840ff229bce0460886 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/top_k @@ -0,0 +1 @@ +50 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/top_p b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/top_p new file mode 100644 index 0000000000000000000000000000000000000000..9f8e9b69a33f4e8067d5b21661a35d8856758aba --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/top_p @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile_backend b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile_backend new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile_backend @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile_mode b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile_mode new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_compile_mode @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_empty_cache_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_empty_cache_steps new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torch_empty_cache_steps @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torchdynamo b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torchdynamo new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torchdynamo @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torchscript b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torchscript new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/torchscript @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tpu_metrics_debug b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tpu_metrics_debug new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tpu_metrics_debug @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tpu_num_cores b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tpu_num_cores new file mode 100644 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/tpu_num_cores @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/trackio_space_id b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/trackio_space_id new file mode 100644 index 0000000000000000000000000000000000000000..ce8d952594b1b4638b748fcc5486c6c20e791dcc --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/trackio_space_id @@ -0,0 +1 @@ +trackio \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/transformers_version b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/transformers_version new file mode 100644 index 0000000000000000000000000000000000000000..e5a11132710d05d4da277d6ae5402768cc434018 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/transformers_version @@ -0,0 +1 @@ +4.57.1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/typical_p b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/typical_p new file mode 100644 index 0000000000000000000000000000000000000000..9f8e9b69a33f4e8067d5b21661a35d8856758aba --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/typical_p @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_bfloat16 b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_bfloat16 new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_bfloat16 @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_cache b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_cache new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_cache @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_cpu b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_cpu new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_cpu @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_legacy_prediction_loop b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_legacy_prediction_loop new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_legacy_prediction_loop @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_liger_kernel b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_liger_kernel new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_liger_kernel @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_mps_device b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_mps_device new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/use_mps_device @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/vocab_size b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/vocab_size new file mode 100644 index 0000000000000000000000000000000000000000..34c44b19378193a3b4fa853df4426fc426c15535 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/vocab_size @@ -0,0 +1 @@ +128256 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/warmup_ratio b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/warmup_ratio new file mode 100644 index 0000000000000000000000000000000000000000..ceab6e11ece0bcec917c12e11d350946f085d549 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/warmup_ratio @@ -0,0 +1 @@ +0.1 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/warmup_steps b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/warmup_steps new file mode 100644 index 0000000000000000000000000000000000000000..c227083464fb9af8955c90d2924774ee50abb547 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/warmup_steps @@ -0,0 +1 @@ +0 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/weight_decay b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/weight_decay new file mode 100644 index 0000000000000000000000000000000000000000..d1c6331b3109accd73f01907062e6c174e28200a --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/weight_decay @@ -0,0 +1 @@ +0.01 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.runName b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.runName new file mode 100644 index 0000000000000000000000000000000000000000..0f23e0877477455534bae4babc7610a7dc7cea17 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.runName @@ -0,0 +1 @@ +llama3b_think_sft_nopack_lr1.5e5_ep3 \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.git.commit b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.git.commit new file mode 100644 index 0000000000000000000000000000000000000000..873686f91a08caa39c5cde9e33e5231dc5ae03d8 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +25f2ae49189f9a73cdd23bd5845e544915a0d04d \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.name b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.name new file mode 100644 index 0000000000000000000000000000000000000000..fd98106671732f814b803d95444e53a6abcae75d --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.name @@ -0,0 +1 @@ +/home/salman/reward-signal-analysis/LLaMA-Factory/src/llamafactory/launcher.py \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.type b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.type new file mode 100644 index 0000000000000000000000000000000000000000..0c2c1fe9dc63b7040bb81006635e50fd528f056f --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.user b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.user new file mode 100644 index 0000000000000000000000000000000000000000..ed08908948e0c35bdf8cbdcc82956d2ad0b81915 --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/tags/mlflow.user @@ -0,0 +1 @@ +salman \ No newline at end of file diff --git a/global_step_0/mlflow/356092632336622637/meta.yaml b/global_step_0/mlflow/356092632336622637/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f786490c4d5776f1c2bb3ab1faa0ac1d57ab00ff --- /dev/null +++ b/global_step_0/mlflow/356092632336622637/meta.yaml @@ -0,0 +1,6 @@ +artifact_location: file:///local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/mlflow/356092632336622637 +creation_time: 1772751716696 +experiment_id: '356092632336622637' +last_update_time: 1772751716696 +lifecycle_stage: active +name: llama3b_think_sft_nopack_lr1.5e5_ep3 diff --git a/global_step_0/model-00001-of-00002.safetensors b/global_step_0/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a218d0c87ffcbc29804e0963ce8c5052cd944d5 --- /dev/null +++ b/global_step_0/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8958e10d9375f6edb777844678f32b35d6e200642d892b10cf23fba9d56b880 +size 4965799096 diff --git a/global_step_0/model-00002-of-00002.safetensors b/global_step_0/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38fbe7a316db54d2943b4a5a55e17c3f8e3d4de5 --- /dev/null +++ b/global_step_0/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:552e715adc327a92f6466821e15c47bb806883b331891b871056fbbd7a783271 +size 2247734992 diff --git a/global_step_0/model.safetensors.index.json b/global_step_0/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..887d493f4ade1ceaea9675859e66b09891033710 --- /dev/null +++ b/global_step_0/model.safetensors.index.json @@ -0,0 +1,263 @@ +{ + "metadata": { + "total_parameters": 3212749824, + "total_size": 7213504512 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/global_step_0/special_tokens_map.json b/global_step_0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6f1d09511c78a15675d7e3bfece8089df89a1d --- /dev/null +++ b/global_step_0/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/global_step_0/tokenizer.json b/global_step_0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/global_step_0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/global_step_0/tokenizer_config.json b/global_step_0/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca4f78cbcbcdef77e48625636066f7d40adec5dd --- /dev/null +++ b/global_step_0/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/global_step_0/train_results.json b/global_step_0/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..361e4e132f240d363d25c252389f9999a100c486 --- /dev/null +++ b/global_step_0/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 1.1980638081930756e+19, + "train_loss": 0.49363853406255476, + "train_runtime": 40041.2675, + "train_samples_per_second": 3.261, + "train_steps_per_second": 0.013 +} \ No newline at end of file diff --git a/global_step_0/trainer_log.jsonl b/global_step_0/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7510fa335f5a5beee10778a5977bd69b542abd9 --- /dev/null +++ b/global_step_0/trainer_log.jsonl @@ -0,0 +1,52 @@ +{"current_steps": 10, "total_steps": 513, "loss": 0.8486, "lr": 2.596153846153846e-06, "epoch": 0.05881271825032163, "percentage": 1.95, "elapsed_time": "0:08:49", "remaining_time": "7:23:45"} +{"current_steps": 20, "total_steps": 513, "loss": 0.7412, "lr": 5.480769230769231e-06, "epoch": 0.11762543650064326, "percentage": 3.9, "elapsed_time": "0:17:27", "remaining_time": "7:10:32"} +{"current_steps": 30, "total_steps": 513, "loss": 0.6532, "lr": 8.365384615384616e-06, "epoch": 0.1764381547509649, "percentage": 5.85, "elapsed_time": "0:26:04", "remaining_time": "6:59:45"} +{"current_steps": 40, "total_steps": 513, "loss": 0.6102, "lr": 1.125e-05, "epoch": 0.23525087300128653, "percentage": 7.8, "elapsed_time": "0:34:48", "remaining_time": "6:51:32"} +{"current_steps": 50, "total_steps": 513, "loss": 0.5784, "lr": 1.4134615384615384e-05, "epoch": 0.29406359125160814, "percentage": 9.75, "elapsed_time": "0:43:26", "remaining_time": "6:42:12"} +{"current_steps": 60, "total_steps": 513, "loss": 0.5641, "lr": 1.4991468156423456e-05, "epoch": 0.3528763095019298, "percentage": 11.7, "elapsed_time": "0:51:59", "remaining_time": "6:32:33"} +{"current_steps": 70, "total_steps": 513, "loss": 0.5469, "lr": 1.494972625749433e-05, "epoch": 0.4116890277522514, "percentage": 13.65, "elapsed_time": "1:00:35", "remaining_time": "6:23:28"} +{"current_steps": 80, "total_steps": 513, "loss": 0.5424, "lr": 1.4873400764197756e-05, "epoch": 0.47050174600257305, "percentage": 15.59, "elapsed_time": "1:09:12", "remaining_time": "6:14:36"} +{"current_steps": 90, "total_steps": 513, "loss": 0.5293, "lr": 1.4762845999606666e-05, "epoch": 0.5293144642528946, "percentage": 17.54, "elapsed_time": "1:21:01", "remaining_time": "6:20:49"} +{"current_steps": 100, "total_steps": 513, "loss": 0.5266, "lr": 1.4618575188100301e-05, "epoch": 0.5881271825032163, "percentage": 19.49, "elapsed_time": "1:36:07", "remaining_time": "6:37:00"} +{"current_steps": 110, "total_steps": 513, "loss": 0.522, "lr": 1.4441258072841264e-05, "epoch": 0.6469399007535379, "percentage": 21.44, "elapsed_time": "1:51:33", "remaining_time": "6:48:42"} +{"current_steps": 120, "total_steps": 513, "loss": 0.5222, "lr": 1.4231717806651086e-05, "epoch": 0.7057526190038595, "percentage": 23.39, "elapsed_time": "2:03:27", "remaining_time": "6:44:18"} +{"current_steps": 130, "total_steps": 513, "loss": 0.5106, "lr": 1.3990927130717711e-05, "epoch": 0.7645653372541812, "percentage": 25.34, "elapsed_time": "2:14:59", "remaining_time": "6:37:41"} +{"current_steps": 140, "total_steps": 513, "loss": 0.5114, "lr": 1.3720003858874311e-05, "epoch": 0.8233780555045028, "percentage": 27.29, "elapsed_time": "2:27:07", "remaining_time": "6:31:58"} +{"current_steps": 150, "total_steps": 513, "loss": 0.5099, "lr": 1.3420205688412603e-05, "epoch": 0.8821907737548245, "percentage": 29.24, "elapsed_time": "2:38:51", "remaining_time": "6:24:27"} +{"current_steps": 160, "total_steps": 513, "loss": 0.5086, "lr": 1.3092924361520291e-05, "epoch": 0.9410034920051461, "percentage": 31.19, "elapsed_time": "2:51:13", "remaining_time": "6:17:45"} +{"current_steps": 170, "total_steps": 513, "loss": 0.5061, "lr": 1.2739679204446694e-05, "epoch": 0.9998162102554677, "percentage": 33.14, "elapsed_time": "3:03:05", "remaining_time": "6:09:25"} +{"current_steps": 180, "total_steps": 513, "loss": 0.4746, "lr": 1.236211007438955e-05, "epoch": 1.0529314464252895, "percentage": 35.09, "elapsed_time": "3:13:45", "remaining_time": "5:58:27"} +{"current_steps": 190, "total_steps": 513, "loss": 0.478, "lr": 1.1961969746845325e-05, "epoch": 1.1117441646756112, "percentage": 37.04, "elapsed_time": "3:25:56", "remaining_time": "5:50:06"} +{"current_steps": 200, "total_steps": 513, "loss": 0.4755, "lr": 1.1541115778763038e-05, "epoch": 1.1705568829259327, "percentage": 38.99, "elapsed_time": "3:37:41", "remaining_time": "5:40:41"} +{"current_steps": 210, "total_steps": 513, "loss": 0.4765, "lr": 1.1101501885274894e-05, "epoch": 1.2293696011762543, "percentage": 40.94, "elapsed_time": "3:50:20", "remaining_time": "5:32:20"} +{"current_steps": 220, "total_steps": 513, "loss": 0.4706, "lr": 1.0645168870035313e-05, "epoch": 1.288182319426576, "percentage": 42.88, "elapsed_time": "4:02:56", "remaining_time": "5:23:33"} +{"current_steps": 230, "total_steps": 513, "loss": 0.4681, "lr": 1.0174235151272025e-05, "epoch": 1.3469950376768978, "percentage": 44.83, "elapsed_time": "4:15:23", "remaining_time": "5:14:14"} +{"current_steps": 240, "total_steps": 513, "loss": 0.4715, "lr": 9.690886927529886e-06, "epoch": 1.4058077559272193, "percentage": 46.78, "elapsed_time": "4:28:11", "remaining_time": "5:05:03"} +{"current_steps": 250, "total_steps": 513, "loss": 0.4711, "lr": 9.197368028760536e-06, "epoch": 1.4646204741775408, "percentage": 48.73, "elapsed_time": "4:40:41", "remaining_time": "4:55:16"} +{"current_steps": 260, "total_steps": 513, "loss": 0.4685, "lr": 8.695969499871911e-06, "epoch": 1.5234331924278626, "percentage": 50.68, "elapsed_time": "4:53:35", "remaining_time": "4:45:41"} +{"current_steps": 270, "total_steps": 513, "loss": 0.4688, "lr": 8.18901896509343e-06, "epoch": 1.5822459106781843, "percentage": 52.63, "elapsed_time": "5:07:00", "remaining_time": "4:36:18"} +{"current_steps": 280, "total_steps": 513, "loss": 0.4722, "lr": 7.678869822530362e-06, "epoch": 1.6410586289285058, "percentage": 54.58, "elapsed_time": "5:18:58", "remaining_time": "4:25:25"} +{"current_steps": 290, "total_steps": 513, "loss": 0.4649, "lr": 7.167890319069035e-06, "epoch": 1.6998713471788274, "percentage": 56.53, "elapsed_time": "5:30:51", "remaining_time": "4:14:25"} +{"current_steps": 300, "total_steps": 513, "loss": 0.4692, "lr": 6.658452556350092e-06, "epoch": 1.758684065429149, "percentage": 58.48, "elapsed_time": "5:42:50", "remaining_time": "4:03:25"} +{"current_steps": 310, "total_steps": 513, "loss": 0.4653, "lr": 6.152921478846986e-06, "epoch": 1.8174967836794707, "percentage": 60.43, "elapsed_time": "5:54:40", "remaining_time": "3:52:15"} +{"current_steps": 320, "total_steps": 513, "loss": 0.4664, "lr": 5.65364389516988e-06, "epoch": 1.8763095019297924, "percentage": 62.38, "elapsed_time": "6:06:49", "remaining_time": "3:41:14"} +{"current_steps": 330, "total_steps": 513, "loss": 0.4621, "lr": 5.162937583561072e-06, "epoch": 1.935122220180114, "percentage": 64.33, "elapsed_time": "6:18:35", "remaining_time": "3:29:56"} +{"current_steps": 340, "total_steps": 513, "loss": 0.4675, "lr": 4.683080532156986e-06, "epoch": 1.9939349384304355, "percentage": 66.28, "elapsed_time": "6:30:39", "remaining_time": "3:18:46"} +{"current_steps": 350, "total_steps": 513, "loss": 0.44, "lr": 4.216300363966383e-06, "epoch": 2.0470501746002574, "percentage": 68.23, "elapsed_time": "6:41:31", "remaining_time": "3:06:59"} +{"current_steps": 360, "total_steps": 513, "loss": 0.4447, "lr": 3.7647639956567304e-06, "epoch": 2.105862892850579, "percentage": 70.18, "elapsed_time": "6:56:27", "remaining_time": "2:56:59"} +{"current_steps": 370, "total_steps": 513, "loss": 0.4391, "lr": 3.3305675781554655e-06, "epoch": 2.1646756111009005, "percentage": 72.12, "elapsed_time": "7:12:51", "remaining_time": "2:47:17"} +{"current_steps": 380, "total_steps": 513, "loss": 0.4419, "lr": 2.915726765764453e-06, "epoch": 2.2234883293512224, "percentage": 74.07, "elapsed_time": "7:29:12", "remaining_time": "2:37:13"} +{"current_steps": 390, "total_steps": 513, "loss": 0.4377, "lr": 2.522167358961046e-06, "epoch": 2.282301047601544, "percentage": 76.02, "elapsed_time": "7:45:42", "remaining_time": "2:26:52"} +{"current_steps": 400, "total_steps": 513, "loss": 0.4387, "lr": 2.151716364324264e-06, "epoch": 2.3411137658518655, "percentage": 77.97, "elapsed_time": "8:02:07", "remaining_time": "2:16:11"} +{"current_steps": 410, "total_steps": 513, "loss": 0.4426, "lr": 1.806093513088348e-06, "epoch": 2.399926484102187, "percentage": 79.92, "elapsed_time": "8:18:53", "remaining_time": "2:05:19"} +{"current_steps": 420, "total_steps": 513, "loss": 0.442, "lr": 1.486903277696733e-06, "epoch": 2.4587392023525085, "percentage": 81.87, "elapsed_time": "8:35:37", "remaining_time": "1:54:10"} +{"current_steps": 430, "total_steps": 513, "loss": 0.4449, "lr": 1.1956274234177322e-06, "epoch": 2.5175519206028305, "percentage": 83.82, "elapsed_time": "8:51:59", "remaining_time": "1:42:41"} +{"current_steps": 440, "total_steps": 513, "loss": 0.4391, "lr": 9.336181295993204e-07, "epoch": 2.576364638853152, "percentage": 85.77, "elapsed_time": "9:08:26", "remaining_time": "1:30:59"} +{"current_steps": 450, "total_steps": 513, "loss": 0.4374, "lr": 7.02091712495907e-07, "epoch": 2.6351773571034736, "percentage": 87.72, "elapsed_time": "9:24:54", "remaining_time": "1:19:05"} +{"current_steps": 460, "total_steps": 513, "loss": 0.4403, "lr": 5.021229788074589e-07, "epoch": 2.6939900753537955, "percentage": 89.67, "elapsed_time": "9:41:06", "remaining_time": "1:06:57"} +{"current_steps": 470, "total_steps": 513, "loss": 0.44, "lr": 3.3464023614327683e-07, "epoch": 2.7528027936041166, "percentage": 91.62, "elapsed_time": "9:57:23", "remaining_time": "0:54:39"} +{"current_steps": 480, "total_steps": 513, "loss": 0.4358, "lr": 2.0042098357321209e-07, "epoch": 2.8116155118544386, "percentage": 93.57, "elapsed_time": "10:13:50", "remaining_time": "0:42:12"} +{"current_steps": 490, "total_steps": 513, "loss": 0.4384, "lr": 1.0008830227189431e-07, "epoch": 2.87042823010476, "percentage": 95.52, "elapsed_time": "10:30:28", "remaining_time": "0:29:35"} +{"current_steps": 500, "total_steps": 513, "loss": 0.4438, "lr": 3.410796301156205e-08, "epoch": 2.9292409483550816, "percentage": 97.47, "elapsed_time": "10:47:13", "remaining_time": "0:16:49"} +{"current_steps": 510, "total_steps": 513, "loss": 0.4378, "lr": 2.7862639312792317e-09, "epoch": 2.9880536666054036, "percentage": 99.42, "elapsed_time": "11:03:34", "remaining_time": "0:03:54"} +{"current_steps": 513, "total_steps": 513, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "11:07:21", "remaining_time": "0:00:00"} diff --git a/global_step_0/trainer_state.json b/global_step_0/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b5a94f48182bc4b5c93c421caa831865f6e9b46 --- /dev/null +++ b/global_step_0/trainer_state.json @@ -0,0 +1,400 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 513, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05881271825032163, + "grad_norm": 1.3511555194854736, + "learning_rate": 2.596153846153846e-06, + "loss": 0.8486, + "step": 10 + }, + { + "epoch": 0.11762543650064326, + "grad_norm": 0.7383383512496948, + "learning_rate": 5.480769230769231e-06, + "loss": 0.7412, + "step": 20 + }, + { + "epoch": 0.1764381547509649, + "grad_norm": 0.47219017148017883, + "learning_rate": 8.365384615384616e-06, + "loss": 0.6532, + "step": 30 + }, + { + "epoch": 0.23525087300128653, + "grad_norm": 0.30038249492645264, + "learning_rate": 1.125e-05, + "loss": 0.6102, + "step": 40 + }, + { + "epoch": 0.29406359125160814, + "grad_norm": 0.2751595377922058, + "learning_rate": 1.4134615384615384e-05, + "loss": 0.5784, + "step": 50 + }, + { + "epoch": 0.3528763095019298, + "grad_norm": 0.26936954259872437, + "learning_rate": 1.4991468156423456e-05, + "loss": 0.5641, + "step": 60 + }, + { + "epoch": 0.4116890277522514, + "grad_norm": 0.25376981496810913, + "learning_rate": 1.494972625749433e-05, + "loss": 0.5469, + "step": 70 + }, + { + "epoch": 0.47050174600257305, + "grad_norm": 0.2703434228897095, + "learning_rate": 1.4873400764197756e-05, + "loss": 0.5424, + "step": 80 + }, + { + "epoch": 0.5293144642528946, + "grad_norm": 0.3386951684951782, + "learning_rate": 1.4762845999606666e-05, + "loss": 0.5293, + "step": 90 + }, + { + "epoch": 0.5881271825032163, + "grad_norm": 0.30952027440071106, + "learning_rate": 1.4618575188100301e-05, + "loss": 0.5266, + "step": 100 + }, + { + "epoch": 0.6469399007535379, + "grad_norm": 0.2706937789916992, + "learning_rate": 1.4441258072841264e-05, + "loss": 0.522, + "step": 110 + }, + { + "epoch": 0.7057526190038595, + "grad_norm": 0.286222368478775, + "learning_rate": 1.4231717806651086e-05, + "loss": 0.5222, + "step": 120 + }, + { + "epoch": 0.7645653372541812, + "grad_norm": 0.2553636431694031, + "learning_rate": 1.3990927130717711e-05, + "loss": 0.5106, + "step": 130 + }, + { + "epoch": 0.8233780555045028, + "grad_norm": 0.2975357472896576, + "learning_rate": 1.3720003858874311e-05, + "loss": 0.5114, + "step": 140 + }, + { + "epoch": 0.8821907737548245, + "grad_norm": 0.24958086013793945, + "learning_rate": 1.3420205688412603e-05, + "loss": 0.5099, + "step": 150 + }, + { + "epoch": 0.9410034920051461, + "grad_norm": 0.302441269159317, + "learning_rate": 1.3092924361520291e-05, + "loss": 0.5086, + "step": 160 + }, + { + "epoch": 0.9998162102554677, + "grad_norm": 0.24974007904529572, + "learning_rate": 1.2739679204446694e-05, + "loss": 0.5061, + "step": 170 + }, + { + "epoch": 1.0529314464252895, + "grad_norm": 0.35062289237976074, + "learning_rate": 1.236211007438955e-05, + "loss": 0.4746, + "step": 180 + }, + { + "epoch": 1.1117441646756112, + "grad_norm": 0.28535276651382446, + "learning_rate": 1.1961969746845325e-05, + "loss": 0.478, + "step": 190 + }, + { + "epoch": 1.1705568829259327, + "grad_norm": 0.2474713921546936, + "learning_rate": 1.1541115778763038e-05, + "loss": 0.4755, + "step": 200 + }, + { + "epoch": 1.2293696011762543, + "grad_norm": 0.23004528880119324, + "learning_rate": 1.1101501885274894e-05, + "loss": 0.4765, + "step": 210 + }, + { + "epoch": 1.288182319426576, + "grad_norm": 0.23046620190143585, + "learning_rate": 1.0645168870035313e-05, + "loss": 0.4706, + "step": 220 + }, + { + "epoch": 1.3469950376768978, + "grad_norm": 0.243893101811409, + "learning_rate": 1.0174235151272025e-05, + "loss": 0.4681, + "step": 230 + }, + { + "epoch": 1.4058077559272193, + "grad_norm": 0.2657492160797119, + "learning_rate": 9.690886927529886e-06, + "loss": 0.4715, + "step": 240 + }, + { + "epoch": 1.4646204741775408, + "grad_norm": 0.24003422260284424, + "learning_rate": 9.197368028760536e-06, + "loss": 0.4711, + "step": 250 + }, + { + "epoch": 1.5234331924278626, + "grad_norm": 0.238833948969841, + "learning_rate": 8.695969499871911e-06, + "loss": 0.4685, + "step": 260 + }, + { + "epoch": 1.5822459106781843, + "grad_norm": 0.237404927611351, + "learning_rate": 8.18901896509343e-06, + "loss": 0.4688, + "step": 270 + }, + { + "epoch": 1.6410586289285058, + "grad_norm": 0.22758300602436066, + "learning_rate": 7.678869822530362e-06, + "loss": 0.4722, + "step": 280 + }, + { + "epoch": 1.6998713471788274, + "grad_norm": 0.22680319845676422, + "learning_rate": 7.167890319069035e-06, + "loss": 0.4649, + "step": 290 + }, + { + "epoch": 1.758684065429149, + "grad_norm": 0.2401188611984253, + "learning_rate": 6.658452556350092e-06, + "loss": 0.4692, + "step": 300 + }, + { + "epoch": 1.8174967836794707, + "grad_norm": 0.2211555689573288, + "learning_rate": 6.152921478846986e-06, + "loss": 0.4653, + "step": 310 + }, + { + "epoch": 1.8763095019297924, + "grad_norm": 0.24088308215141296, + "learning_rate": 5.65364389516988e-06, + "loss": 0.4664, + "step": 320 + }, + { + "epoch": 1.935122220180114, + "grad_norm": 0.21008798480033875, + "learning_rate": 5.162937583561072e-06, + "loss": 0.4621, + "step": 330 + }, + { + "epoch": 1.9939349384304355, + "grad_norm": 0.2156449556350708, + "learning_rate": 4.683080532156986e-06, + "loss": 0.4675, + "step": 340 + }, + { + "epoch": 2.0470501746002574, + "grad_norm": 0.2731837034225464, + "learning_rate": 4.216300363966383e-06, + "loss": 0.44, + "step": 350 + }, + { + "epoch": 2.105862892850579, + "grad_norm": 0.2207324057817459, + "learning_rate": 3.7647639956567304e-06, + "loss": 0.4447, + "step": 360 + }, + { + "epoch": 2.1646756111009005, + "grad_norm": 0.21577142179012299, + "learning_rate": 3.3305675781554655e-06, + "loss": 0.4391, + "step": 370 + }, + { + "epoch": 2.2234883293512224, + "grad_norm": 0.22381627559661865, + "learning_rate": 2.915726765764453e-06, + "loss": 0.4419, + "step": 380 + }, + { + "epoch": 2.282301047601544, + "grad_norm": 0.2167045623064041, + "learning_rate": 2.522167358961046e-06, + "loss": 0.4377, + "step": 390 + }, + { + "epoch": 2.3411137658518655, + "grad_norm": 0.2239835262298584, + "learning_rate": 2.151716364324264e-06, + "loss": 0.4387, + "step": 400 + }, + { + "epoch": 2.399926484102187, + "grad_norm": 0.2177765816450119, + "learning_rate": 1.806093513088348e-06, + "loss": 0.4426, + "step": 410 + }, + { + "epoch": 2.4587392023525085, + "grad_norm": 0.21108600497245789, + "learning_rate": 1.486903277696733e-06, + "loss": 0.442, + "step": 420 + }, + { + "epoch": 2.5175519206028305, + "grad_norm": 0.20833276212215424, + "learning_rate": 1.1956274234177322e-06, + "loss": 0.4449, + "step": 430 + }, + { + "epoch": 2.576364638853152, + "grad_norm": 0.20782434940338135, + "learning_rate": 9.336181295993204e-07, + "loss": 0.4391, + "step": 440 + }, + { + "epoch": 2.6351773571034736, + "grad_norm": 0.20101866126060486, + "learning_rate": 7.02091712495907e-07, + "loss": 0.4374, + "step": 450 + }, + { + "epoch": 2.6939900753537955, + "grad_norm": 0.1978382021188736, + "learning_rate": 5.021229788074589e-07, + "loss": 0.4403, + "step": 460 + }, + { + "epoch": 2.7528027936041166, + "grad_norm": 0.20072239637374878, + "learning_rate": 3.3464023614327683e-07, + "loss": 0.44, + "step": 470 + }, + { + "epoch": 2.8116155118544386, + "grad_norm": 0.2036609798669815, + "learning_rate": 2.0042098357321209e-07, + "loss": 0.4358, + "step": 480 + }, + { + "epoch": 2.87042823010476, + "grad_norm": 0.20166757702827454, + "learning_rate": 1.0008830227189431e-07, + "loss": 0.4384, + "step": 490 + }, + { + "epoch": 2.9292409483550816, + "grad_norm": 0.20334972441196442, + "learning_rate": 3.410796301156205e-08, + "loss": 0.4438, + "step": 500 + }, + { + "epoch": 2.9880536666054036, + "grad_norm": 0.20352092385292053, + "learning_rate": 2.7862639312792317e-09, + "loss": 0.4378, + "step": 510 + }, + { + "epoch": 3.0, + "step": 513, + "total_flos": 1.1980638081930756e+19, + "train_loss": 0.49363853406255476, + "train_runtime": 40041.2675, + "train_samples_per_second": 3.261, + "train_steps_per_second": 0.013 + } + ], + "logging_steps": 10, + "max_steps": 513, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1980638081930756e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/global_step_0/training_args.bin b/global_step_0/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a0751554beb923f03f8962dcba69f3ba8297e954 --- /dev/null +++ b/global_step_0/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149830d4281846c68c4643d93a8b6007f2239eca267a1d2803fe599ad5194580 +size 7608 diff --git a/global_step_0/training_loss.png b/global_step_0/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..aa2945bbc426b07aedad3fde8d37b9cd365fb5fd Binary files /dev/null and b/global_step_0/training_loss.png differ