salmannyu committed on
Commit
82a74cc
·
verified ·
1 Parent(s): ffbf2bc

Upload global_step_0

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. global_step_0/README.md +60 -0
  3. global_step_0/all_results.json +8 -0
  4. global_step_0/chat_template.jinja +21 -0
  5. global_step_0/config.json +36 -0
  6. global_step_0/generation_config.json +13 -0
  7. global_step_0/logs/sft_train_20260305_150038.log +0 -0
  8. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml +14 -0
  9. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch +52 -0
  10. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm +51 -0
  11. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate +51 -0
  12. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss +51 -0
  13. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos +1 -0
  14. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss +1 -0
  15. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime +1 -0
  16. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second +1 -0
  17. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second +1 -0
  18. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path +1 -0
  19. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config +1 -0
  20. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor +1 -0
  21. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1 +1 -0
  22. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2 +1 -0
  23. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon +1 -0
  24. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention +1 -0
  25. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures +1 -0
  26. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias +1 -0
  27. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout +1 -0
  28. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size +1 -0
  29. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices +1 -0
  30. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids +1 -0
  31. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics +1 -0
  32. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens +1 -0
  33. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16 +1 -0
  34. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval +1 -0
  35. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id +1 -0
  36. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward +1 -0
  37. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size +1 -0
  38. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed +1 -0
  39. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last +1 -0
  40. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers +1 -0
  41. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers +1 -0
  42. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory +1 -0
  43. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor +1 -0
  44. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend +1 -0
  45. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers +1 -0
  46. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb +1 -0
  47. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters +1 -0
  48. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout +1 -0
  49. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug +1 -0
  50. global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ global_step_0/tokenizer.json filter=lfs diff=lfs merge=lfs -text
global_step_0/README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ tags:
5
+ - llama-factory
6
+ - full
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: think_sft_nopack_lr1.5e5_ep3
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # think_sft_nopack_lr1.5e5_ep3
17
+
18
+ This model is a custom Llama 3B model (pretrained on 52B tokens) that has been fine-tuned on the open_thoughts_43k_think_format dataset.
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 1.5e-05
38
+ - train_batch_size: 2
39
+ - eval_batch_size: 8
40
+ - seed: 42
41
+ - distributed_type: multi-GPU
42
+ - num_devices: 4
43
+ - gradient_accumulation_steps: 32
44
+ - total_train_batch_size: 256
45
+ - total_eval_batch_size: 32
46
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_ratio: 0.1
49
+ - num_epochs: 3.0
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - Transformers 4.57.1
58
+ - Pytorch 2.6.0+cu124
59
+ - Datasets 4.0.0
60
+ - Tokenizers 0.22.1
global_step_0/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1.1980638081930756e+19,
4
+ "train_loss": 0.49363853406255476,
5
+ "train_runtime": 40041.2675,
6
+ "train_samples_per_second": 3.261,
7
+ "train_steps_per_second": 0.013
8
+ }
global_step_0/chat_template.jinja ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if messages[0]['role'] == 'system' %}
3
+ {%- set system_message = messages[0]['content'] %}
4
+ {%- set loop_messages = messages[1:] %}
5
+ {%- else %}
6
+ {%- set system_message = "" %}
7
+ {%- set loop_messages = messages %}
8
+ {%- endif %}
9
+ {%- if system_message %}
10
+ {{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }}
11
+ {%- endif %}
12
+ {%- for message in loop_messages %}
13
+ {%- if message['role'] == 'user' %}
14
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
15
+ {%- elif message['role'] == 'assistant' %}
16
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
17
+ {%- endif %}
18
+ {%- endfor %}
19
+ {%- if add_generation_prompt %}
20
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
21
+ {%- endif %}
global_step_0/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 128000,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 128009,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 3072,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 8192,
15
+ "max_position_embeddings": 131072,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 24,
19
+ "num_hidden_layers": 28,
20
+ "num_key_value_heads": 8,
21
+ "pad_token_id": 128001,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_scaling": {
25
+ "factor": 32.0,
26
+ "high_freq_factor": 4.0,
27
+ "low_freq_factor": 1.0,
28
+ "original_max_position_embeddings": 8192,
29
+ "rope_type": "llama3"
30
+ },
31
+ "rope_theta": 500000.0,
32
+ "tie_word_embeddings": true,
33
+ "transformers_version": "4.57.1",
34
+ "use_cache": false,
35
+ "vocab_size": 128256
36
+ }
global_step_0/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 128009,
7
+ 128001
8
+ ],
9
+ "pad_token_id": 128001,
10
+ "temperature": 0.6,
11
+ "top_p": 0.9,
12
+ "transformers_version": "4.57.1"
13
+ }
global_step_0/logs/sft_train_20260305_150038.log ADDED
The diff for this file is too large to render. See raw diff
 
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/meta.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifact_uri: file:///local2/salman/model/sft_model_llama_mid_train/think_sft_nopack_lr1.5e5_ep3/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/artifacts
2
+ end_time: 1772791757971
3
+ entry_point_name: ''
4
+ experiment_id: '356092632336622637'
5
+ lifecycle_stage: active
6
+ run_id: c370ae36b3594e5b8e4483476b3515b7
7
+ run_name: llama3b_think_sft_nopack_lr1.5e5_ep3
8
+ source_name: ''
9
+ source_type: 4
10
+ source_version: ''
11
+ start_time: 1772751716706
12
+ status: 3
13
+ tags: []
14
+ user_id: salman
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/epoch ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1772752246047 0.05881271825032163 10
2
+ 1772752764661 0.11762543650064326 20
3
+ 1772753281010 0.1764381547509649 30
4
+ 1772753804852 0.23525087300128653 40
5
+ 1772754322796 0.29406359125160814 50
6
+ 1772754836358 0.3528763095019298 60
7
+ 1772755352360 0.4116890277522514 70
8
+ 1772755869425 0.47050174600257305 80
9
+ 1772756578289 0.5293144642528946 90
10
+ 1772757484272 0.5881271825032163 100
11
+ 1772758410313 0.6469399007535379 110
12
+ 1772759124014 0.7057526190038595 120
13
+ 1772759815988 0.7645653372541812 130
14
+ 1772760543838 0.8233780555045028 140
15
+ 1772761248573 0.8821907737548245 150
16
+ 1772761990136 0.9410034920051461 160
17
+ 1772762702603 0.9998162102554677 170
18
+ 1772763342537 1.0529314464252895 180
19
+ 1772764073344 1.1117441646756112 190
20
+ 1772764778282 1.1705568829259327 200
21
+ 1772765536908 1.2293696011762543 210
22
+ 1772766293667 1.288182319426576 220
23
+ 1772767040309 1.3469950376768978 230
24
+ 1772767807864 1.4058077559272193 240
25
+ 1772768557956 1.4646204741775408 250
26
+ 1772769332250 1.5234331924278626 260
27
+ 1772770136860 1.5822459106781843 270
28
+ 1772770855053 1.6410586289285058 280
29
+ 1772771568547 1.6998713471788274 290
30
+ 1772772287333 1.758684065429149 300
31
+ 1772772997323 1.8174967836794707 310
32
+ 1772773725742 1.8763095019297924 320
33
+ 1772774432045 1.935122220180114 330
34
+ 1772775156419 1.9939349384304355 340
35
+ 1772775808686 2.0470501746002574 350
36
+ 1772776704525 2.105862892850579 360
37
+ 1772777688582 2.1646756111009005 370
38
+ 1772778669427 2.2234883293512224 380
39
+ 1772779659140 2.282301047601544 390
40
+ 1772780643974 2.3411137658518655 400
41
+ 1772781649888 2.399926484102187 410
42
+ 1772782654359 2.4587392023525085 420
43
+ 1772783636091 2.5175519206028305 430
44
+ 1772784623680 2.576364638853152 440
45
+ 1772785610745 2.6351773571034736 450
46
+ 1772786582766 2.6939900753537955 460
47
+ 1772787560653 2.7528027936041166 470
48
+ 1772788546976 2.8116155118544386 480
49
+ 1772789545218 2.87042823010476 490
50
+ 1772790549864 2.9292409483550816 500
51
+ 1772791531194 2.9880536666054036 510
52
+ 1772791757957 3.0 513
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/grad_norm ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1772752246047 1.3511555194854736 10
2
+ 1772752764661 0.7383383512496948 20
3
+ 1772753281010 0.47219017148017883 30
4
+ 1772753804852 0.30038249492645264 40
5
+ 1772754322796 0.2751595377922058 50
6
+ 1772754836358 0.26936954259872437 60
7
+ 1772755352360 0.25376981496810913 70
8
+ 1772755869425 0.2703434228897095 80
9
+ 1772756578289 0.3386951684951782 90
10
+ 1772757484272 0.30952027440071106 100
11
+ 1772758410313 0.2706937789916992 110
12
+ 1772759124014 0.286222368478775 120
13
+ 1772759815988 0.2553636431694031 130
14
+ 1772760543838 0.2975357472896576 140
15
+ 1772761248573 0.24958086013793945 150
16
+ 1772761990136 0.302441269159317 160
17
+ 1772762702603 0.24974007904529572 170
18
+ 1772763342537 0.35062289237976074 180
19
+ 1772764073344 0.28535276651382446 190
20
+ 1772764778282 0.2474713921546936 200
21
+ 1772765536908 0.23004528880119324 210
22
+ 1772766293667 0.23046620190143585 220
23
+ 1772767040309 0.243893101811409 230
24
+ 1772767807864 0.2657492160797119 240
25
+ 1772768557956 0.24003422260284424 250
26
+ 1772769332250 0.238833948969841 260
27
+ 1772770136860 0.237404927611351 270
28
+ 1772770855053 0.22758300602436066 280
29
+ 1772771568547 0.22680319845676422 290
30
+ 1772772287333 0.2401188611984253 300
31
+ 1772772997323 0.2211555689573288 310
32
+ 1772773725742 0.24088308215141296 320
33
+ 1772774432045 0.21008798480033875 330
34
+ 1772775156419 0.2156449556350708 340
35
+ 1772775808686 0.2731837034225464 350
36
+ 1772776704525 0.2207324057817459 360
37
+ 1772777688582 0.21577142179012299 370
38
+ 1772778669427 0.22381627559661865 380
39
+ 1772779659140 0.2167045623064041 390
40
+ 1772780643974 0.2239835262298584 400
41
+ 1772781649888 0.2177765816450119 410
42
+ 1772782654359 0.21108600497245789 420
43
+ 1772783636091 0.20833276212215424 430
44
+ 1772784623680 0.20782434940338135 440
45
+ 1772785610745 0.20101866126060486 450
46
+ 1772786582766 0.1978382021188736 460
47
+ 1772787560653 0.20072239637374878 470
48
+ 1772788546976 0.2036609798669815 480
49
+ 1772789545218 0.20166757702827454 490
50
+ 1772790549864 0.20334972441196442 500
51
+ 1772791531194 0.20352092385292053 510
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/learning_rate ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1772752246047 2.596153846153846e-06 10
2
+ 1772752764661 5.480769230769231e-06 20
3
+ 1772753281010 8.365384615384616e-06 30
4
+ 1772753804852 1.125e-05 40
5
+ 1772754322796 1.4134615384615384e-05 50
6
+ 1772754836358 1.4991468156423456e-05 60
7
+ 1772755352360 1.494972625749433e-05 70
8
+ 1772755869425 1.4873400764197756e-05 80
9
+ 1772756578289 1.4762845999606666e-05 90
10
+ 1772757484272 1.4618575188100301e-05 100
11
+ 1772758410313 1.4441258072841264e-05 110
12
+ 1772759124014 1.4231717806651086e-05 120
13
+ 1772759815988 1.3990927130717711e-05 130
14
+ 1772760543838 1.3720003858874311e-05 140
15
+ 1772761248573 1.3420205688412603e-05 150
16
+ 1772761990136 1.3092924361520291e-05 160
17
+ 1772762702603 1.2739679204446694e-05 170
18
+ 1772763342537 1.236211007438955e-05 180
19
+ 1772764073344 1.1961969746845325e-05 190
20
+ 1772764778282 1.1541115778763038e-05 200
21
+ 1772765536908 1.1101501885274894e-05 210
22
+ 1772766293667 1.0645168870035313e-05 220
23
+ 1772767040309 1.0174235151272025e-05 230
24
+ 1772767807864 9.690886927529886e-06 240
25
+ 1772768557956 9.197368028760536e-06 250
26
+ 1772769332250 8.695969499871911e-06 260
27
+ 1772770136860 8.18901896509343e-06 270
28
+ 1772770855053 7.678869822530362e-06 280
29
+ 1772771568547 7.167890319069035e-06 290
30
+ 1772772287333 6.658452556350092e-06 300
31
+ 1772772997323 6.152921478846986e-06 310
32
+ 1772773725742 5.65364389516988e-06 320
33
+ 1772774432045 5.162937583561072e-06 330
34
+ 1772775156419 4.683080532156986e-06 340
35
+ 1772775808686 4.216300363966383e-06 350
36
+ 1772776704525 3.7647639956567304e-06 360
37
+ 1772777688582 3.3305675781554655e-06 370
38
+ 1772778669427 2.915726765764453e-06 380
39
+ 1772779659140 2.522167358961046e-06 390
40
+ 1772780643974 2.151716364324264e-06 400
41
+ 1772781649888 1.806093513088348e-06 410
42
+ 1772782654359 1.486903277696733e-06 420
43
+ 1772783636091 1.1956274234177322e-06 430
44
+ 1772784623680 9.336181295993204e-07 440
45
+ 1772785610745 7.02091712495907e-07 450
46
+ 1772786582766 5.021229788074589e-07 460
47
+ 1772787560653 3.3464023614327683e-07 470
48
+ 1772788546976 2.0042098357321209e-07 480
49
+ 1772789545218 1.0008830227189431e-07 490
50
+ 1772790549864 3.410796301156205e-08 500
51
+ 1772791531194 2.7862639312792317e-09 510
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/loss ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1772752246047 0.8486 10
2
+ 1772752764661 0.7412 20
3
+ 1772753281010 0.6532 30
4
+ 1772753804852 0.6102 40
5
+ 1772754322796 0.5784 50
6
+ 1772754836358 0.5641 60
7
+ 1772755352360 0.5469 70
8
+ 1772755869425 0.5424 80
9
+ 1772756578289 0.5293 90
10
+ 1772757484272 0.5266 100
11
+ 1772758410313 0.522 110
12
+ 1772759124014 0.5222 120
13
+ 1772759815988 0.5106 130
14
+ 1772760543838 0.5114 140
15
+ 1772761248573 0.5099 150
16
+ 1772761990136 0.5086 160
17
+ 1772762702603 0.5061 170
18
+ 1772763342537 0.4746 180
19
+ 1772764073344 0.478 190
20
+ 1772764778282 0.4755 200
21
+ 1772765536908 0.4765 210
22
+ 1772766293667 0.4706 220
23
+ 1772767040309 0.4681 230
24
+ 1772767807864 0.4715 240
25
+ 1772768557956 0.4711 250
26
+ 1772769332250 0.4685 260
27
+ 1772770136860 0.4688 270
28
+ 1772770855053 0.4722 280
29
+ 1772771568547 0.4649 290
30
+ 1772772287333 0.4692 300
31
+ 1772772997323 0.4653 310
32
+ 1772773725742 0.4664 320
33
+ 1772774432045 0.4621 330
34
+ 1772775156419 0.4675 340
35
+ 1772775808686 0.44 350
36
+ 1772776704525 0.4447 360
37
+ 1772777688582 0.4391 370
38
+ 1772778669427 0.4419 380
39
+ 1772779659140 0.4377 390
40
+ 1772780643974 0.4387 400
41
+ 1772781649888 0.4426 410
42
+ 1772782654359 0.442 420
43
+ 1772783636091 0.4449 430
44
+ 1772784623680 0.4391 440
45
+ 1772785610745 0.4374 450
46
+ 1772786582766 0.4403 460
47
+ 1772787560653 0.44 470
48
+ 1772788546976 0.4358 480
49
+ 1772789545218 0.4384 490
50
+ 1772790549864 0.4438 500
51
+ 1772791531194 0.4378 510
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/total_flos ADDED
@@ -0,0 +1 @@
 
 
1
+ 1772791757957 1.1980638081930756e+19 513
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_loss ADDED
@@ -0,0 +1 @@
 
 
1
+ 1772791757957 0.49363853406255476 513
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_runtime ADDED
@@ -0,0 +1 @@
 
 
1
+ 1772791757957 40041.2675 513
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_samples_per_second ADDED
@@ -0,0 +1 @@
 
 
1
+ 1772791757957 3.261 513
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/metrics/train_steps_per_second ADDED
@@ -0,0 +1 @@
 
 
1
+ 1772791757957 0.013 513
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/_name_or_path ADDED
@@ -0,0 +1 @@
 
 
1
+ /local2/salman/model/pretrain_model/v2_4_gpu_llama_3b_nemo_52b
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/accelerator_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adafactor ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta1 ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.9
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_beta2 ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.999
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/adam_epsilon ADDED
@@ -0,0 +1 @@
 
 
1
+ 1e-08
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/add_cross_attention ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/architectures ADDED
@@ -0,0 +1 @@
 
 
1
+ ['LlamaForCausalLM']
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_bias ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/attention_dropout ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.0
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/auto_find_batch_size ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/average_tokens_across_devices ADDED
@@ -0,0 +1 @@
 
 
1
+ True
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bad_words_ids ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/batch_eval_metrics ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/begin_suppress_tokens ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16 ADDED
@@ -0,0 +1 @@
 
 
1
+ True
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bf16_full_eval ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/bos_token_id ADDED
@@ -0,0 +1 @@
 
 
1
+ 128000
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/chunk_size_feed_forward ADDED
@@ -0,0 +1 @@
 
 
1
+ 0
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/cross_attention_hidden_size ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/data_seed ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_drop_last ADDED
@@ -0,0 +1 @@
 
 
1
+ False
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_num_workers ADDED
@@ -0,0 +1 @@
 
 
1
+ 4
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_persistent_workers ADDED
@@ -0,0 +1 @@
 
 
1
+ True
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_pin_memory ADDED
@@ -0,0 +1 @@
 
 
1
+ True
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/dataloader_prefetch_factor ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_backend ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_broadcast_buffers ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_bucket_cap_mb ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_find_unused_parameters ADDED
@@ -0,0 +1 @@
 
 
1
+ None
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/ddp_timeout ADDED
@@ -0,0 +1 @@
 
 
1
+ 180000000
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/debug ADDED
@@ -0,0 +1 @@
 
 
1
+ []
global_step_0/mlflow/356092632336622637/c370ae36b3594e5b8e4483476b3515b7/params/decoder_start_token_id ADDED
@@ -0,0 +1 @@
 
 
1
+ None