Upload global_step_0
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- global_step_0/README.md +60 -0
- global_step_0/all_results.json +8 -0
- global_step_0/chat_template.jinja +21 -0
- global_step_0/config.json +36 -0
- global_step_0/generation_config.json +13 -0
- global_step_0/logs/sft_train_20260305_150038.log +0 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml +14 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch +52 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm +51 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate +51 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss +51 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1 +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2 +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16 +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug +1 -0
- global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
global_step_0/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
global_step_0/README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
license: other
|
| 4 |
+
tags:
|
| 5 |
+
- llama-factory
|
| 6 |
+
- full
|
| 7 |
+
- generated_from_trainer
|
| 8 |
+
model-index:
|
| 9 |
+
- name: think_sft_nopack_lr1.5e5_ep3
|
| 10 |
+
results: []
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
| 14 |
+
should probably proofread and complete it, then remove this comment. -->
|
| 15 |
+
|
| 16 |
+
# think_sft_nopack_lr1.5e5_ep3
|
| 17 |
+
|
| 18 |
+
This model is a custom Llama 3B model, pretrained on 52B tokens and then fine-tuned on the open_thoughts_43k_think_format dataset.
|
| 19 |
+
|
| 20 |
+
## Model description
|
| 21 |
+
|
| 22 |
+
More information needed
|
| 23 |
+
|
| 24 |
+
## Intended uses & limitations
|
| 25 |
+
|
| 26 |
+
More information needed
|
| 27 |
+
|
| 28 |
+
## Training and evaluation data
|
| 29 |
+
|
| 30 |
+
More information needed
|
| 31 |
+
|
| 32 |
+
## Training procedure
|
| 33 |
+
|
| 34 |
+
### Training hyperparameters
|
| 35 |
+
|
| 36 |
+
The following hyperparameters were used during training:
|
| 37 |
+
- learning_rate: 1.5e-05
|
| 38 |
+
- train_batch_size: 2
|
| 39 |
+
- eval_batch_size: 8
|
| 40 |
+
- seed: 42
|
| 41 |
+
- distributed_type: multi-GPU
|
| 42 |
+
- num_devices: 4
|
| 43 |
+
- gradient_accumulation_steps: 32
|
| 44 |
+
- total_train_batch_size: 256
|
| 45 |
+
- total_eval_batch_size: 32
|
| 46 |
+
- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
|
| 47 |
+
- lr_scheduler_type: cosine
|
| 48 |
+
- lr_scheduler_warmup_ratio: 0.1
|
| 49 |
+
- num_epochs: 3.0
|
| 50 |
+
|
| 51 |
+
### Training results
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
### Framework versions
|
| 56 |
+
|
| 57 |
+
- Transformers 4.57.1
|
| 58 |
+
- Pytorch 2.6.0+cu124
|
| 59 |
+
- Datasets 4.0.0
|
| 60 |
+
- Tokenizers 0.22.1
|
global_step_0/all_results.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 3.0,
|
| 3 |
+
"total_flos": 1.1980638081930756e+19,
|
| 4 |
+
"train_loss": 0.49363853406255476,
|
| 5 |
+
"train_runtime": 40041.2675,
|
| 6 |
+
"train_samples_per_second": 3.261,
|
| 7 |
+
"train_steps_per_second": 0.013
|
| 8 |
+
}
|
global_step_0/chat_template.jinja
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{{- bos_token }}
|
| 2 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 3 |
+
{%- set system_message = messages[0]['content'] %}
|
| 4 |
+
{%- set loop_messages = messages[1:] %}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{%- set system_message = "" %}
|
| 7 |
+
{%- set loop_messages = messages %}
|
| 8 |
+
{%- endif %}
|
| 9 |
+
{%- if system_message %}
|
| 10 |
+
{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }}
|
| 11 |
+
{%- endif %}
|
| 12 |
+
{%- for message in loop_messages %}
|
| 13 |
+
{%- if message['role'] == 'user' %}
|
| 14 |
+
{{- '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
|
| 15 |
+
{%- elif message['role'] == 'assistant' %}
|
| 16 |
+
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
|
| 17 |
+
{%- endif %}
|
| 18 |
+
{%- endfor %}
|
| 19 |
+
{%- if add_generation_prompt %}
|
| 20 |
+
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
|
| 21 |
+
{%- endif %}
|
global_step_0/config.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 128000,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 128009,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 3072,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 8192,
|
| 15 |
+
"max_position_embeddings": 131072,
|
| 16 |
+
"mlp_bias": false,
|
| 17 |
+
"model_type": "llama",
|
| 18 |
+
"num_attention_heads": 24,
|
| 19 |
+
"num_hidden_layers": 28,
|
| 20 |
+
"num_key_value_heads": 8,
|
| 21 |
+
"pad_token_id": 128001,
|
| 22 |
+
"pretraining_tp": 1,
|
| 23 |
+
"rms_norm_eps": 1e-05,
|
| 24 |
+
"rope_scaling": {
|
| 25 |
+
"factor": 32.0,
|
| 26 |
+
"high_freq_factor": 4.0,
|
| 27 |
+
"low_freq_factor": 1.0,
|
| 28 |
+
"original_max_position_embeddings": 8192,
|
| 29 |
+
"rope_type": "llama3"
|
| 30 |
+
},
|
| 31 |
+
"rope_theta": 500000.0,
|
| 32 |
+
"tie_word_embeddings": true,
|
| 33 |
+
"transformers_version": "4.57.1",
|
| 34 |
+
"use_cache": false,
|
| 35 |
+
"vocab_size": 128256
|
| 36 |
+
}
|
global_step_0/generation_config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 128000,
|
| 4 |
+
"do_sample": true,
|
| 5 |
+
"eos_token_id": [
|
| 6 |
+
128009,
|
| 7 |
+
128001
|
| 8 |
+
],
|
| 9 |
+
"pad_token_id": 128001,
|
| 10 |
+
"temperature": 0.6,
|
| 11 |
+
"top_p": 0.9,
|
| 12 |
+
"transformers_version": "4.57.1"
|
| 13 |
+
}
|
global_step_0/logs/sft_train_20260305_150038.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
artifact_uri: file:///local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/artifacts
|
| 2 |
+
end_time: 1772791757971
|
| 3 |
+
entry_point_name: ''
|
| 4 |
+
experiment_id: '356092632336622637'
|
| 5 |
+
lifecycle_stage: active
|
| 6 |
+
run_id: c370ae36b3594e5b8e4483476b3515b7
|
| 7 |
+
run_name: llama3b_think_sft_nopack_lr1.5e5_ep3
|
| 8 |
+
source_name: ''
|
| 9 |
+
source_type: 4
|
| 10 |
+
source_version: ''
|
| 11 |
+
start_time: 1772751716706
|
| 12 |
+
status: 3
|
| 13 |
+
tags: []
|
| 14 |
+
user_id: salman
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1772752246047 0.05881271825032163 10
|
| 2 |
+
1772752764661 0.11762543650064326 20
|
| 3 |
+
1772753281010 0.1764381547509649 30
|
| 4 |
+
1772753804852 0.23525087300128653 40
|
| 5 |
+
1772754322796 0.29406359125160814 50
|
| 6 |
+
1772754836358 0.3528763095019298 60
|
| 7 |
+
1772755352360 0.4116890277522514 70
|
| 8 |
+
1772755869425 0.47050174600257305 80
|
| 9 |
+
1772756578289 0.5293144642528946 90
|
| 10 |
+
1772757484272 0.5881271825032163 100
|
| 11 |
+
1772758410313 0.6469399007535379 110
|
| 12 |
+
1772759124014 0.7057526190038595 120
|
| 13 |
+
1772759815988 0.7645653372541812 130
|
| 14 |
+
1772760543838 0.8233780555045028 140
|
| 15 |
+
1772761248573 0.8821907737548245 150
|
| 16 |
+
1772761990136 0.9410034920051461 160
|
| 17 |
+
1772762702603 0.9998162102554677 170
|
| 18 |
+
1772763342537 1.0529314464252895 180
|
| 19 |
+
1772764073344 1.1117441646756112 190
|
| 20 |
+
1772764778282 1.1705568829259327 200
|
| 21 |
+
1772765536908 1.2293696011762543 210
|
| 22 |
+
1772766293667 1.288182319426576 220
|
| 23 |
+
1772767040309 1.3469950376768978 230
|
| 24 |
+
1772767807864 1.4058077559272193 240
|
| 25 |
+
1772768557956 1.4646204741775408 250
|
| 26 |
+
1772769332250 1.5234331924278626 260
|
| 27 |
+
1772770136860 1.5822459106781843 270
|
| 28 |
+
1772770855053 1.6410586289285058 280
|
| 29 |
+
1772771568547 1.6998713471788274 290
|
| 30 |
+
1772772287333 1.758684065429149 300
|
| 31 |
+
1772772997323 1.8174967836794707 310
|
| 32 |
+
1772773725742 1.8763095019297924 320
|
| 33 |
+
1772774432045 1.935122220180114 330
|
| 34 |
+
1772775156419 1.9939349384304355 340
|
| 35 |
+
1772775808686 2.0470501746002574 350
|
| 36 |
+
1772776704525 2.105862892850579 360
|
| 37 |
+
1772777688582 2.1646756111009005 370
|
| 38 |
+
1772778669427 2.2234883293512224 380
|
| 39 |
+
1772779659140 2.282301047601544 390
|
| 40 |
+
1772780643974 2.3411137658518655 400
|
| 41 |
+
1772781649888 2.399926484102187 410
|
| 42 |
+
1772782654359 2.4587392023525085 420
|
| 43 |
+
1772783636091 2.5175519206028305 430
|
| 44 |
+
1772784623680 2.576364638853152 440
|
| 45 |
+
1772785610745 2.6351773571034736 450
|
| 46 |
+
1772786582766 2.6939900753537955 460
|
| 47 |
+
1772787560653 2.7528027936041166 470
|
| 48 |
+
1772788546976 2.8116155118544386 480
|
| 49 |
+
1772789545218 2.87042823010476 490
|
| 50 |
+
1772790549864 2.9292409483550816 500
|
| 51 |
+
1772791531194 2.9880536666054036 510
|
| 52 |
+
1772791757957 3.0 513
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1772752246047 1.3511555194854736 10
|
| 2 |
+
1772752764661 0.7383383512496948 20
|
| 3 |
+
1772753281010 0.47219017148017883 30
|
| 4 |
+
1772753804852 0.30038249492645264 40
|
| 5 |
+
1772754322796 0.2751595377922058 50
|
| 6 |
+
1772754836358 0.26936954259872437 60
|
| 7 |
+
1772755352360 0.25376981496810913 70
|
| 8 |
+
1772755869425 0.2703434228897095 80
|
| 9 |
+
1772756578289 0.3386951684951782 90
|
| 10 |
+
1772757484272 0.30952027440071106 100
|
| 11 |
+
1772758410313 0.2706937789916992 110
|
| 12 |
+
1772759124014 0.286222368478775 120
|
| 13 |
+
1772759815988 0.2553636431694031 130
|
| 14 |
+
1772760543838 0.2975357472896576 140
|
| 15 |
+
1772761248573 0.24958086013793945 150
|
| 16 |
+
1772761990136 0.302441269159317 160
|
| 17 |
+
1772762702603 0.24974007904529572 170
|
| 18 |
+
1772763342537 0.35062289237976074 180
|
| 19 |
+
1772764073344 0.28535276651382446 190
|
| 20 |
+
1772764778282 0.2474713921546936 200
|
| 21 |
+
1772765536908 0.23004528880119324 210
|
| 22 |
+
1772766293667 0.23046620190143585 220
|
| 23 |
+
1772767040309 0.243893101811409 230
|
| 24 |
+
1772767807864 0.2657492160797119 240
|
| 25 |
+
1772768557956 0.24003422260284424 250
|
| 26 |
+
1772769332250 0.238833948969841 260
|
| 27 |
+
1772770136860 0.237404927611351 270
|
| 28 |
+
1772770855053 0.22758300602436066 280
|
| 29 |
+
1772771568547 0.22680319845676422 290
|
| 30 |
+
1772772287333 0.2401188611984253 300
|
| 31 |
+
1772772997323 0.2211555689573288 310
|
| 32 |
+
1772773725742 0.24088308215141296 320
|
| 33 |
+
1772774432045 0.21008798480033875 330
|
| 34 |
+
1772775156419 0.2156449556350708 340
|
| 35 |
+
1772775808686 0.2731837034225464 350
|
| 36 |
+
1772776704525 0.2207324057817459 360
|
| 37 |
+
1772777688582 0.21577142179012299 370
|
| 38 |
+
1772778669427 0.22381627559661865 380
|
| 39 |
+
1772779659140 0.2167045623064041 390
|
| 40 |
+
1772780643974 0.2239835262298584 400
|
| 41 |
+
1772781649888 0.2177765816450119 410
|
| 42 |
+
1772782654359 0.21108600497245789 420
|
| 43 |
+
1772783636091 0.20833276212215424 430
|
| 44 |
+
1772784623680 0.20782434940338135 440
|
| 45 |
+
1772785610745 0.20101866126060486 450
|
| 46 |
+
1772786582766 0.1978382021188736 460
|
| 47 |
+
1772787560653 0.20072239637374878 470
|
| 48 |
+
1772788546976 0.2036609798669815 480
|
| 49 |
+
1772789545218 0.20166757702827454 490
|
| 50 |
+
1772790549864 0.20334972441196442 500
|
| 51 |
+
1772791531194 0.20352092385292053 510
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1772752246047 2.596153846153846e-06 10
|
| 2 |
+
1772752764661 5.480769230769231e-06 20
|
| 3 |
+
1772753281010 8.365384615384616e-06 30
|
| 4 |
+
1772753804852 1.125e-05 40
|
| 5 |
+
1772754322796 1.4134615384615384e-05 50
|
| 6 |
+
1772754836358 1.4991468156423456e-05 60
|
| 7 |
+
1772755352360 1.494972625749433e-05 70
|
| 8 |
+
1772755869425 1.4873400764197756e-05 80
|
| 9 |
+
1772756578289 1.4762845999606666e-05 90
|
| 10 |
+
1772757484272 1.4618575188100301e-05 100
|
| 11 |
+
1772758410313 1.4441258072841264e-05 110
|
| 12 |
+
1772759124014 1.4231717806651086e-05 120
|
| 13 |
+
1772759815988 1.3990927130717711e-05 130
|
| 14 |
+
1772760543838 1.3720003858874311e-05 140
|
| 15 |
+
1772761248573 1.3420205688412603e-05 150
|
| 16 |
+
1772761990136 1.3092924361520291e-05 160
|
| 17 |
+
1772762702603 1.2739679204446694e-05 170
|
| 18 |
+
1772763342537 1.236211007438955e-05 180
|
| 19 |
+
1772764073344 1.1961969746845325e-05 190
|
| 20 |
+
1772764778282 1.1541115778763038e-05 200
|
| 21 |
+
1772765536908 1.1101501885274894e-05 210
|
| 22 |
+
1772766293667 1.0645168870035313e-05 220
|
| 23 |
+
1772767040309 1.0174235151272025e-05 230
|
| 24 |
+
1772767807864 9.690886927529886e-06 240
|
| 25 |
+
1772768557956 9.197368028760536e-06 250
|
| 26 |
+
1772769332250 8.695969499871911e-06 260
|
| 27 |
+
1772770136860 8.18901896509343e-06 270
|
| 28 |
+
1772770855053 7.678869822530362e-06 280
|
| 29 |
+
1772771568547 7.167890319069035e-06 290
|
| 30 |
+
1772772287333 6.658452556350092e-06 300
|
| 31 |
+
1772772997323 6.152921478846986e-06 310
|
| 32 |
+
1772773725742 5.65364389516988e-06 320
|
| 33 |
+
1772774432045 5.162937583561072e-06 330
|
| 34 |
+
1772775156419 4.683080532156986e-06 340
|
| 35 |
+
1772775808686 4.216300363966383e-06 350
|
| 36 |
+
1772776704525 3.7647639956567304e-06 360
|
| 37 |
+
1772777688582 3.3305675781554655e-06 370
|
| 38 |
+
1772778669427 2.915726765764453e-06 380
|
| 39 |
+
1772779659140 2.522167358961046e-06 390
|
| 40 |
+
1772780643974 2.151716364324264e-06 400
|
| 41 |
+
1772781649888 1.806093513088348e-06 410
|
| 42 |
+
1772782654359 1.486903277696733e-06 420
|
| 43 |
+
1772783636091 1.1956274234177322e-06 430
|
| 44 |
+
1772784623680 9.336181295993204e-07 440
|
| 45 |
+
1772785610745 7.02091712495907e-07 450
|
| 46 |
+
1772786582766 5.021229788074589e-07 460
|
| 47 |
+
1772787560653 3.3464023614327683e-07 470
|
| 48 |
+
1772788546976 2.0042098357321209e-07 480
|
| 49 |
+
1772789545218 1.0008830227189431e-07 490
|
| 50 |
+
1772790549864 3.410796301156205e-08 500
|
| 51 |
+
1772791531194 2.7862639312792317e-09 510
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1772752246047 0.8486 10
|
| 2 |
+
1772752764661 0.7412 20
|
| 3 |
+
1772753281010 0.6532 30
|
| 4 |
+
1772753804852 0.6102 40
|
| 5 |
+
1772754322796 0.5784 50
|
| 6 |
+
1772754836358 0.5641 60
|
| 7 |
+
1772755352360 0.5469 70
|
| 8 |
+
1772755869425 0.5424 80
|
| 9 |
+
1772756578289 0.5293 90
|
| 10 |
+
1772757484272 0.5266 100
|
| 11 |
+
1772758410313 0.522 110
|
| 12 |
+
1772759124014 0.5222 120
|
| 13 |
+
1772759815988 0.5106 130
|
| 14 |
+
1772760543838 0.5114 140
|
| 15 |
+
1772761248573 0.5099 150
|
| 16 |
+
1772761990136 0.5086 160
|
| 17 |
+
1772762702603 0.5061 170
|
| 18 |
+
1772763342537 0.4746 180
|
| 19 |
+
1772764073344 0.478 190
|
| 20 |
+
1772764778282 0.4755 200
|
| 21 |
+
1772765536908 0.4765 210
|
| 22 |
+
1772766293667 0.4706 220
|
| 23 |
+
1772767040309 0.4681 230
|
| 24 |
+
1772767807864 0.4715 240
|
| 25 |
+
1772768557956 0.4711 250
|
| 26 |
+
1772769332250 0.4685 260
|
| 27 |
+
1772770136860 0.4688 270
|
| 28 |
+
1772770855053 0.4722 280
|
| 29 |
+
1772771568547 0.4649 290
|
| 30 |
+
1772772287333 0.4692 300
|
| 31 |
+
1772772997323 0.4653 310
|
| 32 |
+
1772773725742 0.4664 320
|
| 33 |
+
1772774432045 0.4621 330
|
| 34 |
+
1772775156419 0.4675 340
|
| 35 |
+
1772775808686 0.44 350
|
| 36 |
+
1772776704525 0.4447 360
|
| 37 |
+
1772777688582 0.4391 370
|
| 38 |
+
1772778669427 0.4419 380
|
| 39 |
+
1772779659140 0.4377 390
|
| 40 |
+
1772780643974 0.4387 400
|
| 41 |
+
1772781649888 0.4426 410
|
| 42 |
+
1772782654359 0.442 420
|
| 43 |
+
1772783636091 0.4449 430
|
| 44 |
+
1772784623680 0.4391 440
|
| 45 |
+
1772785610745 0.4374 450
|
| 46 |
+
1772786582766 0.4403 460
|
| 47 |
+
1772787560653 0.44 470
|
| 48 |
+
1772788546976 0.4358 480
|
| 49 |
+
1772789545218 0.4384 490
|
| 50 |
+
1772790549864 0.4438 500
|
| 51 |
+
1772791531194 0.4378 510
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
1772791757957 1.1980638081930756e+19 513
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
1772791757957 0.49363853406255476 513
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
1772791757957 40041.2675 513
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
1772791757957 3.261 513
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
1772791757957 0.013 513
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0.9
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0.999
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
1e-08
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
['LlamaForCausalLM']
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0.0
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
True
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
True
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
128000
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
False
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
4
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
True
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
True
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
180000000
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
None
|