SirajRLX commited on
Commit
9fcdb22
·
verified ·
1 Parent(s): e9532f5

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. best_adapter/README.md +2 -2
  2. best_adapter/adapter_config.json +6 -6
  3. best_adapter/training_args.bin +1 -1
  4. checkpoints/checkpoint-1000/README.md +2 -2
  5. checkpoints/checkpoint-1000/adapter_config.json +6 -6
  6. checkpoints/checkpoint-1000/trainer_state.json +0 -0
  7. checkpoints/checkpoint-1500/README.md +2 -2
  8. checkpoints/checkpoint-1500/adapter_config.json +6 -6
  9. checkpoints/checkpoint-1500/trainer_state.json +0 -0
  10. checkpoints/checkpoint-2000/README.md +2 -2
  11. checkpoints/checkpoint-2000/adapter_config.json +6 -6
  12. checkpoints/checkpoint-2000/trainer_state.json +0 -0
  13. checkpoints/checkpoint-2500/README.md +2 -2
  14. checkpoints/checkpoint-2500/adapter_config.json +6 -6
  15. checkpoints/checkpoint-2500/trainer_state.json +0 -0
  16. checkpoints/checkpoint-3000/README.md +2 -2
  17. checkpoints/checkpoint-3000/adapter_config.json +6 -6
  18. checkpoints/checkpoint-3000/trainer_state.json +0 -0
  19. checkpoints/checkpoint-3500/README.md +2 -2
  20. checkpoints/checkpoint-3500/adapter_config.json +6 -6
  21. checkpoints/checkpoint-3500/trainer_state.json +0 -0
  22. checkpoints/checkpoint-4000/README.md +2 -2
  23. checkpoints/checkpoint-4000/adapter_config.json +6 -6
  24. checkpoints/checkpoint-4000/trainer_state.json +0 -0
  25. checkpoints/checkpoint-4500/README.md +2 -2
  26. checkpoints/checkpoint-4500/adapter_config.json +6 -6
  27. checkpoints/checkpoint-4500/rng_state.pth +1 -1
  28. checkpoints/checkpoint-4500/scheduler.pt +1 -1
  29. checkpoints/checkpoint-4500/trainer_state.json +0 -0
  30. checkpoints/checkpoint-4500/training_args.bin +1 -1
  31. checkpoints/checkpoint-500/README.md +2 -2
  32. checkpoints/checkpoint-500/adapter_config.json +6 -6
  33. checkpoints/checkpoint-500/trainer_state.json +773 -773
  34. checkpoints/checkpoint-5000/README.md +2 -2
  35. checkpoints/checkpoint-5000/adapter_config.json +6 -6
  36. checkpoints/checkpoint-5000/trainer_state.json +0 -0
  37. config_resolved.yaml +8 -8
  38. eval_final.json +6 -6
  39. logs/eval.jsonl +52 -77
  40. logs/train.jsonl +0 -0
  41. wandb/debug-internal.log +12 -12
  42. wandb/debug.log +29 -29
  43. wandb/run-20251226_181544-upub1jan/files/config.yaml +601 -0
  44. wandb/run-20251226_181544-upub1jan/files/output.log +0 -0
  45. wandb/run-20251226_181544-upub1jan/files/requirements.txt +104 -0
  46. wandb/run-20251226_181544-upub1jan/files/wandb-metadata.json +47 -0
  47. wandb/run-20251226_181544-upub1jan/files/wandb-summary.json +1 -0
  48. wandb/run-20251226_181544-upub1jan/logs/debug-core.log +16 -0
  49. wandb/run-20251226_181544-upub1jan/logs/debug-internal.log +12 -0
  50. wandb/run-20251226_181544-upub1jan/logs/debug.log +29 -0
best_adapter/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
best_adapter/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
best_adapter/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afae4a8ce1391f149eb00b14eed8f891c715e892ea546bf754d22db2c2bc5969
3
  size 4792
checkpoints/checkpoint-1000/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-1000/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-1000/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-1500/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-1500/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-1500/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-2000/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-2000/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-2000/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-2500/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-2500/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-2500/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-3000/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-3000/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-3000/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-3500/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-3500/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-3500/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-4000/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-4000/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-4000/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-4500/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-4500/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-4500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0efe65d231115c25223bf7b93f16e661ce129b91718b68f1f079e626bed512b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cadd1d4964d81ffd554b895540fe42c724ce67c8ce385b329c23f9ba4322912d
3
  size 14244
checkpoints/checkpoint-4500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ddebc5e42121a3c52427c71de63ee27a7547ec14262f7ddfeb0be5491a11af0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:315c9d74357f9543b8b6474791ceab2fe082042e289fc2e3547f3a4b6c9b01b2
3
  size 1064
checkpoints/checkpoint-4500/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-4500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afae4a8ce1391f149eb00b14eed8f891c715e892ea546bf754d22db2c2bc5969
3
  size 4792
checkpoints/checkpoint-500/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-500/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_global_step": 500,
3
- "best_metric": 0.9080732464790344,
4
- "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-500",
5
  "epoch": 0.2109704641350211,
6
  "eval_steps": 100,
7
  "global_step": 500,
@@ -11,1792 +11,1792 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0008438818565400844,
14
- "grad_norm": 1.597854733467102,
15
- "learning_rate": 8.787346221441124e-08,
16
- "loss": 1.3927901983261108,
17
  "step": 2
18
  },
19
  {
20
  "epoch": 0.0016877637130801688,
21
- "grad_norm": 1.6547431945800781,
22
- "learning_rate": 2.6362038664323375e-07,
23
- "loss": 1.407160758972168,
24
  "step": 4
25
  },
26
  {
27
  "epoch": 0.002531645569620253,
28
- "grad_norm": 1.8221601247787476,
29
- "learning_rate": 4.393673110720563e-07,
30
- "loss": 1.376656174659729,
31
  "step": 6
32
  },
33
  {
34
  "epoch": 0.0033755274261603376,
35
- "grad_norm": 1.4831048250198364,
36
- "learning_rate": 6.151142355008788e-07,
37
- "loss": 1.247712254524231,
38
  "step": 8
39
  },
40
  {
41
  "epoch": 0.004219409282700422,
42
- "grad_norm": 1.668201208114624,
43
- "learning_rate": 7.908611599297013e-07,
44
- "loss": 1.2685163021087646,
45
  "step": 10
46
  },
47
  {
48
  "epoch": 0.005063291139240506,
49
- "grad_norm": 1.67417311668396,
50
- "learning_rate": 9.666080843585237e-07,
51
- "loss": 1.2942761182785034,
52
  "step": 12
53
  },
54
  {
55
  "epoch": 0.00590717299578059,
56
- "grad_norm": 1.7154079675674438,
57
- "learning_rate": 1.1423550087873463e-06,
58
- "loss": 1.3638604879379272,
59
  "step": 14
60
  },
61
  {
62
  "epoch": 0.006751054852320675,
63
- "grad_norm": 1.729427456855774,
64
- "learning_rate": 1.3181019332161688e-06,
65
- "loss": 1.3476728200912476,
66
  "step": 16
67
  },
68
  {
69
  "epoch": 0.007594936708860759,
70
- "grad_norm": 1.3813447952270508,
71
- "learning_rate": 1.4938488576449913e-06,
72
- "loss": 1.3476393222808838,
73
  "step": 18
74
  },
75
  {
76
  "epoch": 0.008438818565400843,
77
- "grad_norm": 1.557220458984375,
78
- "learning_rate": 1.6695957820738139e-06,
79
- "loss": 1.2449309825897217,
80
  "step": 20
81
  },
82
  {
83
  "epoch": 0.009282700421940928,
84
- "grad_norm": 1.1883500814437866,
85
- "learning_rate": 1.8453427065026362e-06,
86
- "loss": 1.3125361204147339,
87
  "step": 22
88
  },
89
  {
90
  "epoch": 0.010126582278481013,
91
- "grad_norm": 1.7290029525756836,
92
- "learning_rate": 2.0210896309314587e-06,
93
- "loss": 1.3724769353866577,
94
  "step": 24
95
  },
96
  {
97
  "epoch": 0.010970464135021098,
98
- "grad_norm": 1.5627557039260864,
99
- "learning_rate": 2.1968365553602812e-06,
100
- "loss": 1.3401387929916382,
101
  "step": 26
102
  },
103
  {
104
  "epoch": 0.01181434599156118,
105
- "grad_norm": 1.796866774559021,
106
- "learning_rate": 2.3725834797891038e-06,
107
- "loss": 1.365437388420105,
108
  "step": 28
109
  },
110
  {
111
  "epoch": 0.012658227848101266,
112
- "grad_norm": 1.7030404806137085,
113
- "learning_rate": 2.5483304042179263e-06,
114
- "loss": 1.2706533670425415,
115
  "step": 30
116
  },
117
  {
118
  "epoch": 0.01350210970464135,
119
- "grad_norm": 1.3186293840408325,
120
- "learning_rate": 2.724077328646749e-06,
121
- "loss": 1.3084994554519653,
122
  "step": 32
123
  },
124
  {
125
  "epoch": 0.014345991561181435,
126
- "grad_norm": 1.5762513875961304,
127
- "learning_rate": 2.8998242530755714e-06,
128
- "loss": 1.3259696960449219,
129
  "step": 34
130
  },
131
  {
132
  "epoch": 0.015189873417721518,
133
- "grad_norm": 1.422295331954956,
134
- "learning_rate": 3.075571177504394e-06,
135
- "loss": 1.3205676078796387,
136
  "step": 36
137
  },
138
  {
139
  "epoch": 0.016033755274261603,
140
- "grad_norm": 1.495523452758789,
141
- "learning_rate": 3.2513181019332165e-06,
142
- "loss": 1.3740568161010742,
143
  "step": 38
144
  },
145
  {
146
  "epoch": 0.016877637130801686,
147
- "grad_norm": 1.5112254619598389,
148
- "learning_rate": 3.427065026362039e-06,
149
- "loss": 1.321828842163086,
150
  "step": 40
151
  },
152
  {
153
  "epoch": 0.017721518987341773,
154
- "grad_norm": 1.4667807817459106,
155
- "learning_rate": 3.602811950790861e-06,
156
- "loss": 1.3673173189163208,
157
  "step": 42
158
  },
159
  {
160
  "epoch": 0.018565400843881856,
161
- "grad_norm": 1.6609723567962646,
162
- "learning_rate": 3.7785588752196836e-06,
163
- "loss": 1.3968093395233154,
164
  "step": 44
165
  },
166
  {
167
  "epoch": 0.019409282700421943,
168
- "grad_norm": 1.59381103515625,
169
- "learning_rate": 3.954305799648506e-06,
170
- "loss": 1.4295302629470825,
171
  "step": 46
172
  },
173
  {
174
  "epoch": 0.020253164556962026,
175
- "grad_norm": 1.1470608711242676,
176
- "learning_rate": 4.130052724077329e-06,
177
- "loss": 1.2536572217941284,
178
  "step": 48
179
  },
180
  {
181
  "epoch": 0.02109704641350211,
182
- "grad_norm": 1.2014588117599487,
183
- "learning_rate": 4.305799648506151e-06,
184
- "loss": 1.242217779159546,
185
  "step": 50
186
  },
187
  {
188
  "epoch": 0.021940928270042195,
189
- "grad_norm": 1.2327464818954468,
190
- "learning_rate": 4.481546572934974e-06,
191
- "loss": 1.2166963815689087,
192
  "step": 52
193
  },
194
  {
195
  "epoch": 0.02278481012658228,
196
- "grad_norm": 1.9708983898162842,
197
- "learning_rate": 4.657293497363796e-06,
198
- "loss": 1.25709867477417,
199
  "step": 54
200
  },
201
  {
202
  "epoch": 0.02362869198312236,
203
- "grad_norm": 1.180569052696228,
204
- "learning_rate": 4.833040421792619e-06,
205
- "loss": 1.2886158227920532,
206
  "step": 56
207
  },
208
  {
209
  "epoch": 0.024472573839662448,
210
- "grad_norm": 1.5029548406600952,
211
- "learning_rate": 5.008787346221441e-06,
212
- "loss": 1.29886794090271,
213
  "step": 58
214
  },
215
  {
216
  "epoch": 0.02531645569620253,
217
- "grad_norm": 1.5380216836929321,
218
- "learning_rate": 5.184534270650264e-06,
219
- "loss": 1.2387628555297852,
220
  "step": 60
221
  },
222
  {
223
  "epoch": 0.026160337552742614,
224
- "grad_norm": 1.572144865989685,
225
- "learning_rate": 5.3602811950790864e-06,
226
- "loss": 1.2177000045776367,
227
  "step": 62
228
  },
229
  {
230
  "epoch": 0.0270042194092827,
231
- "grad_norm": 1.4882780313491821,
232
- "learning_rate": 5.536028119507909e-06,
233
- "loss": 1.181516170501709,
234
  "step": 64
235
  },
236
  {
237
  "epoch": 0.027848101265822784,
238
- "grad_norm": 1.2982488870620728,
239
- "learning_rate": 5.7117750439367315e-06,
240
- "loss": 1.2101733684539795,
241
  "step": 66
242
  },
243
  {
244
  "epoch": 0.02869198312236287,
245
- "grad_norm": 1.5236955881118774,
246
- "learning_rate": 5.887521968365554e-06,
247
- "loss": 1.2277681827545166,
248
  "step": 68
249
  },
250
  {
251
  "epoch": 0.029535864978902954,
252
- "grad_norm": 1.4521006345748901,
253
- "learning_rate": 6.0632688927943766e-06,
254
- "loss": 1.1688424348831177,
255
  "step": 70
256
  },
257
  {
258
  "epoch": 0.030379746835443037,
259
- "grad_norm": 1.2352311611175537,
260
- "learning_rate": 6.239015817223199e-06,
261
- "loss": 1.273059368133545,
262
  "step": 72
263
  },
264
  {
265
  "epoch": 0.031223628691983123,
266
- "grad_norm": 1.3438209295272827,
267
- "learning_rate": 6.414762741652021e-06,
268
- "loss": 1.1609034538269043,
269
  "step": 74
270
  },
271
  {
272
  "epoch": 0.032067510548523206,
273
- "grad_norm": 1.9009398221969604,
274
- "learning_rate": 6.590509666080843e-06,
275
- "loss": 1.2508260011672974,
276
  "step": 76
277
  },
278
  {
279
  "epoch": 0.03291139240506329,
280
- "grad_norm": 1.6718412637710571,
281
- "learning_rate": 6.766256590509666e-06,
282
- "loss": 1.2524956464767456,
283
  "step": 78
284
  },
285
  {
286
  "epoch": 0.03375527426160337,
287
- "grad_norm": 1.249891757965088,
288
- "learning_rate": 6.942003514938488e-06,
289
- "loss": 1.1472493410110474,
290
  "step": 80
291
  },
292
  {
293
  "epoch": 0.03459915611814346,
294
- "grad_norm": 1.4398653507232666,
295
- "learning_rate": 7.117750439367312e-06,
296
- "loss": 1.0845389366149902,
297
  "step": 82
298
  },
299
  {
300
  "epoch": 0.035443037974683546,
301
- "grad_norm": 1.3701167106628418,
302
- "learning_rate": 7.293497363796134e-06,
303
- "loss": 1.1088868379592896,
304
  "step": 84
305
  },
306
  {
307
  "epoch": 0.036286919831223625,
308
- "grad_norm": 1.277998924255371,
309
- "learning_rate": 7.469244288224957e-06,
310
- "loss": 1.1513772010803223,
311
  "step": 86
312
  },
313
  {
314
  "epoch": 0.03713080168776371,
315
- "grad_norm": 1.4970002174377441,
316
- "learning_rate": 7.644991212653779e-06,
317
- "loss": 1.1385771036148071,
318
  "step": 88
319
  },
320
  {
321
  "epoch": 0.0379746835443038,
322
- "grad_norm": 1.3384218215942383,
323
- "learning_rate": 7.820738137082601e-06,
324
- "loss": 1.1632680892944336,
325
  "step": 90
326
  },
327
  {
328
  "epoch": 0.038818565400843885,
329
- "grad_norm": 1.4317446947097778,
330
- "learning_rate": 7.996485061511425e-06,
331
- "loss": 1.2256064414978027,
332
  "step": 92
333
  },
334
  {
335
  "epoch": 0.039662447257383965,
336
- "grad_norm": 1.8743640184402466,
337
- "learning_rate": 8.172231985940246e-06,
338
- "loss": 1.1935789585113525,
339
  "step": 94
340
  },
341
  {
342
  "epoch": 0.04050632911392405,
343
- "grad_norm": 1.4789546728134155,
344
- "learning_rate": 8.347978910369069e-06,
345
- "loss": 1.1429362297058105,
346
  "step": 96
347
  },
348
  {
349
  "epoch": 0.04135021097046414,
350
- "grad_norm": 1.658605694770813,
351
- "learning_rate": 8.523725834797891e-06,
352
- "loss": 1.1831508874893188,
353
  "step": 98
354
  },
355
  {
356
  "epoch": 0.04219409282700422,
357
- "grad_norm": 1.5077892541885376,
358
- "learning_rate": 8.699472759226714e-06,
359
- "loss": 1.0539867877960205,
360
  "step": 100
361
  },
362
  {
363
  "epoch": 0.04219409282700422,
364
- "eval_loss": 1.138856053352356,
365
- "eval_runtime": 859.7128,
366
- "eval_samples_per_second": 2.451,
367
- "eval_steps_per_second": 2.451,
368
  "step": 100
369
  },
370
  {
371
  "epoch": 0.043037974683544304,
372
- "grad_norm": 1.4335681200027466,
373
- "learning_rate": 8.875219683655536e-06,
374
- "loss": 1.0719901323318481,
375
  "step": 102
376
  },
377
  {
378
  "epoch": 0.04388185654008439,
379
- "grad_norm": 1.7387681007385254,
380
- "learning_rate": 9.050966608084359e-06,
381
- "loss": 1.0654313564300537,
382
  "step": 104
383
  },
384
  {
385
  "epoch": 0.04472573839662447,
386
- "grad_norm": 1.6071950197219849,
387
- "learning_rate": 9.226713532513181e-06,
388
- "loss": 1.0752698183059692,
389
  "step": 106
390
  },
391
  {
392
  "epoch": 0.04556962025316456,
393
- "grad_norm": 1.40005362033844,
394
- "learning_rate": 9.402460456942004e-06,
395
- "loss": 1.1029763221740723,
396
  "step": 108
397
  },
398
  {
399
  "epoch": 0.046413502109704644,
400
- "grad_norm": 2.2338669300079346,
401
- "learning_rate": 9.578207381370826e-06,
402
- "loss": 1.1157960891723633,
403
  "step": 110
404
  },
405
  {
406
  "epoch": 0.04725738396624472,
407
- "grad_norm": 1.4972727298736572,
408
- "learning_rate": 9.753954305799649e-06,
409
- "loss": 1.1095420122146606,
410
  "step": 112
411
  },
412
  {
413
  "epoch": 0.04810126582278481,
414
- "grad_norm": 1.317979097366333,
415
- "learning_rate": 9.929701230228471e-06,
416
- "loss": 1.109113097190857,
417
  "step": 114
418
  },
419
  {
420
  "epoch": 0.048945147679324896,
421
- "grad_norm": 1.496346116065979,
422
- "learning_rate": 1.0105448154657294e-05,
423
- "loss": 1.1055104732513428,
424
  "step": 116
425
  },
426
  {
427
  "epoch": 0.049789029535864976,
428
- "grad_norm": 1.385406732559204,
429
- "learning_rate": 1.0281195079086117e-05,
430
- "loss": 1.118395209312439,
431
  "step": 118
432
  },
433
  {
434
  "epoch": 0.05063291139240506,
435
- "grad_norm": 1.524222731590271,
436
- "learning_rate": 1.0456942003514939e-05,
437
- "loss": 1.1008446216583252,
438
  "step": 120
439
  },
440
  {
441
  "epoch": 0.05147679324894515,
442
- "grad_norm": 1.6308200359344482,
443
- "learning_rate": 1.0632688927943762e-05,
444
- "loss": 1.0891425609588623,
445
  "step": 122
446
  },
447
  {
448
  "epoch": 0.05232067510548523,
449
- "grad_norm": 1.3681106567382812,
450
- "learning_rate": 1.0808435852372584e-05,
451
- "loss": 0.9080473184585571,
452
  "step": 124
453
  },
454
  {
455
  "epoch": 0.053164556962025315,
456
- "grad_norm": 1.9429908990859985,
457
- "learning_rate": 1.0984182776801407e-05,
458
- "loss": 1.0337369441986084,
459
  "step": 126
460
  },
461
  {
462
  "epoch": 0.0540084388185654,
463
- "grad_norm": 1.5830830335617065,
464
- "learning_rate": 1.115992970123023e-05,
465
- "loss": 1.0703333616256714,
466
  "step": 128
467
  },
468
  {
469
  "epoch": 0.05485232067510549,
470
- "grad_norm": 1.4792555570602417,
471
- "learning_rate": 1.1335676625659052e-05,
472
- "loss": 1.004652738571167,
473
  "step": 130
474
  },
475
  {
476
  "epoch": 0.05569620253164557,
477
- "grad_norm": 1.7196226119995117,
478
- "learning_rate": 1.1511423550087874e-05,
479
- "loss": 0.9798293709754944,
480
  "step": 132
481
  },
482
  {
483
  "epoch": 0.056540084388185655,
484
- "grad_norm": 1.8733659982681274,
485
- "learning_rate": 1.1687170474516697e-05,
486
- "loss": 1.0213249921798706,
487
  "step": 134
488
  },
489
  {
490
  "epoch": 0.05738396624472574,
491
- "grad_norm": 1.3431142568588257,
492
- "learning_rate": 1.186291739894552e-05,
493
- "loss": 1.0358591079711914,
494
  "step": 136
495
  },
496
  {
497
  "epoch": 0.05822784810126582,
498
- "grad_norm": 1.527864933013916,
499
- "learning_rate": 1.2038664323374342e-05,
500
- "loss": 0.9372249841690063,
501
  "step": 138
502
  },
503
  {
504
  "epoch": 0.05907172995780591,
505
- "grad_norm": 1.5495563745498657,
506
- "learning_rate": 1.2214411247803164e-05,
507
- "loss": 1.0277758836746216,
508
  "step": 140
509
  },
510
  {
511
  "epoch": 0.059915611814345994,
512
- "grad_norm": 1.6792418956756592,
513
- "learning_rate": 1.2390158172231985e-05,
514
- "loss": 1.0349801778793335,
515
  "step": 142
516
  },
517
  {
518
  "epoch": 0.060759493670886074,
519
- "grad_norm": 1.6468945741653442,
520
- "learning_rate": 1.256590509666081e-05,
521
- "loss": 0.9578297734260559,
522
  "step": 144
523
  },
524
  {
525
  "epoch": 0.06160337552742616,
526
- "grad_norm": 1.7243824005126953,
527
- "learning_rate": 1.2741652021089632e-05,
528
- "loss": 1.0628854036331177,
529
  "step": 146
530
  },
531
  {
532
  "epoch": 0.06244725738396625,
533
- "grad_norm": 1.7286981344223022,
534
- "learning_rate": 1.2917398945518455e-05,
535
- "loss": 0.9336449503898621,
536
  "step": 148
537
  },
538
  {
539
  "epoch": 0.06329113924050633,
540
- "grad_norm": 1.6411832571029663,
541
- "learning_rate": 1.3093145869947277e-05,
542
- "loss": 0.953730583190918,
543
  "step": 150
544
  },
545
  {
546
  "epoch": 0.06413502109704641,
547
- "grad_norm": 1.8297001123428345,
548
- "learning_rate": 1.3268892794376098e-05,
549
- "loss": 1.051239013671875,
550
  "step": 152
551
  },
552
  {
553
  "epoch": 0.06497890295358649,
554
- "grad_norm": 1.9660519361495972,
555
- "learning_rate": 1.3444639718804922e-05,
556
- "loss": 0.9955035448074341,
557
  "step": 154
558
  },
559
  {
560
  "epoch": 0.06582278481012659,
561
- "grad_norm": 1.8423733711242676,
562
- "learning_rate": 1.3620386643233743e-05,
563
- "loss": 0.913300096988678,
564
  "step": 156
565
  },
566
  {
567
  "epoch": 0.06666666666666667,
568
- "grad_norm": 1.9146347045898438,
569
- "learning_rate": 1.3796133567662567e-05,
570
- "loss": 1.0429846048355103,
571
  "step": 158
572
  },
573
  {
574
  "epoch": 0.06751054852320675,
575
- "grad_norm": 1.6221821308135986,
576
- "learning_rate": 1.3971880492091388e-05,
577
- "loss": 1.0360238552093506,
578
  "step": 160
579
  },
580
  {
581
  "epoch": 0.06835443037974684,
582
- "grad_norm": 2.173283338546753,
583
- "learning_rate": 1.4147627416520212e-05,
584
- "loss": 1.0227266550064087,
585
  "step": 162
586
  },
587
  {
588
  "epoch": 0.06919831223628692,
589
- "grad_norm": 1.7091665267944336,
590
- "learning_rate": 1.4323374340949033e-05,
591
- "loss": 1.0075194835662842,
592
  "step": 164
593
  },
594
  {
595
  "epoch": 0.070042194092827,
596
- "grad_norm": 1.7219135761260986,
597
- "learning_rate": 1.4499121265377857e-05,
598
- "loss": 1.0044782161712646,
599
  "step": 166
600
  },
601
  {
602
  "epoch": 0.07088607594936709,
603
- "grad_norm": 1.6558159589767456,
604
- "learning_rate": 1.4674868189806678e-05,
605
- "loss": 0.9393973350524902,
606
  "step": 168
607
  },
608
  {
609
  "epoch": 0.07172995780590717,
610
- "grad_norm": 1.9362739324569702,
611
- "learning_rate": 1.4850615114235502e-05,
612
- "loss": 0.9955337643623352,
613
  "step": 170
614
  },
615
  {
616
  "epoch": 0.07257383966244725,
617
- "grad_norm": 1.7792853116989136,
618
- "learning_rate": 1.5026362038664323e-05,
619
- "loss": 0.9659126400947571,
620
  "step": 172
621
  },
622
  {
623
  "epoch": 0.07341772151898734,
624
- "grad_norm": 1.7184511423110962,
625
- "learning_rate": 1.5202108963093147e-05,
626
- "loss": 0.9077855348587036,
627
  "step": 174
628
  },
629
  {
630
  "epoch": 0.07426160337552742,
631
- "grad_norm": 1.5701428651809692,
632
- "learning_rate": 1.537785588752197e-05,
633
- "loss": 0.9305018782615662,
634
  "step": 176
635
  },
636
  {
637
  "epoch": 0.0751054852320675,
638
- "grad_norm": 1.970229148864746,
639
- "learning_rate": 1.555360281195079e-05,
640
- "loss": 1.0211774110794067,
641
  "step": 178
642
  },
643
  {
644
  "epoch": 0.0759493670886076,
645
- "grad_norm": 1.8410269021987915,
646
- "learning_rate": 1.5729349736379615e-05,
647
- "loss": 0.9479315876960754,
648
  "step": 180
649
  },
650
  {
651
  "epoch": 0.07679324894514768,
652
- "grad_norm": 1.8991246223449707,
653
- "learning_rate": 1.5905096660808434e-05,
654
- "loss": 1.0629050731658936,
655
  "step": 182
656
  },
657
  {
658
  "epoch": 0.07763713080168777,
659
- "grad_norm": 1.8052008152008057,
660
- "learning_rate": 1.608084358523726e-05,
661
- "loss": 0.946983814239502,
662
  "step": 184
663
  },
664
  {
665
  "epoch": 0.07848101265822785,
666
- "grad_norm": 1.547108769416809,
667
- "learning_rate": 1.625659050966608e-05,
668
- "loss": 0.9413356184959412,
669
  "step": 186
670
  },
671
  {
672
  "epoch": 0.07932489451476793,
673
- "grad_norm": 1.9713538885116577,
674
- "learning_rate": 1.6432337434094905e-05,
675
- "loss": 0.9337888956069946,
676
  "step": 188
677
  },
678
  {
679
  "epoch": 0.08016877637130802,
680
- "grad_norm": 1.708789348602295,
681
- "learning_rate": 1.6608084358523728e-05,
682
- "loss": 0.9816337823867798,
683
  "step": 190
684
  },
685
  {
686
  "epoch": 0.0810126582278481,
687
- "grad_norm": 1.815292477607727,
688
- "learning_rate": 1.678383128295255e-05,
689
- "loss": 1.017122507095337,
690
  "step": 192
691
  },
692
  {
693
  "epoch": 0.08185654008438818,
694
- "grad_norm": 1.7950682640075684,
695
- "learning_rate": 1.6959578207381373e-05,
696
- "loss": 0.991599440574646,
697
  "step": 194
698
  },
699
  {
700
  "epoch": 0.08270042194092828,
701
- "grad_norm": 1.692512035369873,
702
- "learning_rate": 1.7135325131810195e-05,
703
- "loss": 0.9570834040641785,
704
  "step": 196
705
  },
706
  {
707
  "epoch": 0.08354430379746836,
708
- "grad_norm": 2.056089162826538,
709
- "learning_rate": 1.7311072056239018e-05,
710
- "loss": 1.035754919052124,
711
  "step": 198
712
  },
713
  {
714
  "epoch": 0.08438818565400844,
715
- "grad_norm": 1.7022203207015991,
716
- "learning_rate": 1.7486818980667837e-05,
717
- "loss": 1.0124205350875854,
718
  "step": 200
719
  },
720
  {
721
  "epoch": 0.08438818565400844,
722
- "eval_loss": 0.995743453502655,
723
- "eval_runtime": 846.8257,
724
- "eval_samples_per_second": 2.488,
725
- "eval_steps_per_second": 2.488,
726
  "step": 200
727
  },
728
  {
729
  "epoch": 0.08523206751054853,
730
- "grad_norm": 1.6088604927062988,
731
- "learning_rate": 1.7662565905096663e-05,
732
- "loss": 0.8946985006332397,
733
  "step": 202
734
  },
735
  {
736
  "epoch": 0.08607594936708861,
737
- "grad_norm": 2.02270770072937,
738
- "learning_rate": 1.7838312829525482e-05,
739
- "loss": 0.976133406162262,
740
  "step": 204
741
  },
742
  {
743
  "epoch": 0.08691983122362869,
744
- "grad_norm": 1.7832789421081543,
745
- "learning_rate": 1.8014059753954308e-05,
746
- "loss": 0.9079383611679077,
747
  "step": 206
748
  },
749
  {
750
  "epoch": 0.08776371308016878,
751
- "grad_norm": 1.9793545007705688,
752
- "learning_rate": 1.8189806678383127e-05,
753
- "loss": 0.8650367856025696,
754
  "step": 208
755
  },
756
  {
757
  "epoch": 0.08860759493670886,
758
- "grad_norm": 1.8124271631240845,
759
- "learning_rate": 1.8365553602811953e-05,
760
- "loss": 0.9327266812324524,
761
  "step": 210
762
  },
763
  {
764
  "epoch": 0.08945147679324894,
765
- "grad_norm": 1.8581212759017944,
766
- "learning_rate": 1.8541300527240772e-05,
767
- "loss": 0.9811079502105713,
768
  "step": 212
769
  },
770
  {
771
  "epoch": 0.09029535864978903,
772
- "grad_norm": 2.001699447631836,
773
- "learning_rate": 1.8717047451669598e-05,
774
- "loss": 0.9546971321105957,
775
  "step": 214
776
  },
777
  {
778
  "epoch": 0.09113924050632911,
779
- "grad_norm": 1.6994978189468384,
780
- "learning_rate": 1.8892794376098417e-05,
781
- "loss": 0.9611319899559021,
782
  "step": 216
783
  },
784
  {
785
  "epoch": 0.0919831223628692,
786
- "grad_norm": 2.1379497051239014,
787
- "learning_rate": 1.9068541300527243e-05,
788
- "loss": 0.9781531095504761,
789
  "step": 218
790
  },
791
  {
792
  "epoch": 0.09282700421940929,
793
- "grad_norm": 1.8961224555969238,
794
- "learning_rate": 1.9244288224956066e-05,
795
- "loss": 0.9374833106994629,
796
  "step": 220
797
  },
798
  {
799
  "epoch": 0.09367088607594937,
800
- "grad_norm": 1.851464033126831,
801
- "learning_rate": 1.9420035149384885e-05,
802
- "loss": 0.9681299328804016,
803
  "step": 222
804
  },
805
  {
806
  "epoch": 0.09451476793248945,
807
- "grad_norm": 2.0642266273498535,
808
- "learning_rate": 1.959578207381371e-05,
809
- "loss": 1.0086225271224976,
810
  "step": 224
811
  },
812
  {
813
  "epoch": 0.09535864978902954,
814
- "grad_norm": 1.8658756017684937,
815
- "learning_rate": 1.977152899824253e-05,
816
- "loss": 0.9190312623977661,
817
  "step": 226
818
  },
819
  {
820
  "epoch": 0.09620253164556962,
821
- "grad_norm": 2.4398674964904785,
822
- "learning_rate": 1.9947275922671356e-05,
823
- "loss": 0.9740874171257019,
824
  "step": 228
825
  },
826
  {
827
  "epoch": 0.0970464135021097,
828
- "grad_norm": 1.849183440208435,
829
- "learning_rate": 2.0123022847100175e-05,
830
- "loss": 0.884376049041748,
831
  "step": 230
832
  },
833
  {
834
  "epoch": 0.09789029535864979,
835
- "grad_norm": 2.027320384979248,
836
- "learning_rate": 2.0298769771529e-05,
837
- "loss": 0.9116487503051758,
838
  "step": 232
839
  },
840
  {
841
  "epoch": 0.09873417721518987,
842
- "grad_norm": 1.6800135374069214,
843
- "learning_rate": 2.047451669595782e-05,
844
- "loss": 0.9035115242004395,
845
  "step": 234
846
  },
847
  {
848
  "epoch": 0.09957805907172995,
849
- "grad_norm": 2.2362256050109863,
850
- "learning_rate": 2.0650263620386646e-05,
851
- "loss": 0.9043796062469482,
852
  "step": 236
853
  },
854
  {
855
  "epoch": 0.10042194092827005,
856
- "grad_norm": 1.938215970993042,
857
- "learning_rate": 2.0826010544815465e-05,
858
- "loss": 1.0888828039169312,
859
  "step": 238
860
  },
861
  {
862
  "epoch": 0.10126582278481013,
863
- "grad_norm": 1.890328049659729,
864
- "learning_rate": 2.100175746924429e-05,
865
- "loss": 0.9960280656814575,
866
  "step": 240
867
  },
868
  {
869
  "epoch": 0.1021097046413502,
870
- "grad_norm": 2.021235227584839,
871
- "learning_rate": 2.117750439367311e-05,
872
- "loss": 0.9848901629447937,
873
  "step": 242
874
  },
875
  {
876
  "epoch": 0.1029535864978903,
877
- "grad_norm": 2.023920774459839,
878
- "learning_rate": 2.1353251318101936e-05,
879
- "loss": 0.891694188117981,
880
  "step": 244
881
  },
882
  {
883
  "epoch": 0.10379746835443038,
884
- "grad_norm": 1.8061069250106812,
885
- "learning_rate": 2.1528998242530755e-05,
886
- "loss": 0.9059976935386658,
887
  "step": 246
888
  },
889
  {
890
  "epoch": 0.10464135021097046,
891
- "grad_norm": 2.176302194595337,
892
- "learning_rate": 2.1704745166959578e-05,
893
- "loss": 1.0056109428405762,
894
  "step": 248
895
  },
896
  {
897
  "epoch": 0.10548523206751055,
898
- "grad_norm": 1.9820969104766846,
899
- "learning_rate": 2.18804920913884e-05,
900
- "loss": 0.9645357728004456,
901
  "step": 250
902
  },
903
  {
904
  "epoch": 0.10632911392405063,
905
- "grad_norm": 1.8764572143554688,
906
- "learning_rate": 2.2056239015817223e-05,
907
- "loss": 1.0178182125091553,
908
  "step": 252
909
  },
910
  {
911
  "epoch": 0.10717299578059072,
912
- "grad_norm": 2.56221342086792,
913
- "learning_rate": 2.223198594024605e-05,
914
- "loss": 0.9546761512756348,
915
  "step": 254
916
  },
917
  {
918
  "epoch": 0.1080168776371308,
919
- "grad_norm": 2.6779074668884277,
920
- "learning_rate": 2.2407732864674868e-05,
921
- "loss": 0.9300968647003174,
922
  "step": 256
923
  },
924
  {
925
  "epoch": 0.10886075949367088,
926
- "grad_norm": 2.140897512435913,
927
- "learning_rate": 2.2583479789103694e-05,
928
- "loss": 0.926638662815094,
929
  "step": 258
930
  },
931
  {
932
  "epoch": 0.10970464135021098,
933
- "grad_norm": 2.0880508422851562,
934
- "learning_rate": 2.2759226713532513e-05,
935
- "loss": 1.0681840181350708,
936
  "step": 260
937
  },
938
  {
939
  "epoch": 0.11054852320675106,
940
- "grad_norm": 2.7273616790771484,
941
- "learning_rate": 2.293497363796134e-05,
942
- "loss": 1.0840941667556763,
943
  "step": 262
944
  },
945
  {
946
  "epoch": 0.11139240506329114,
947
- "grad_norm": 1.6723874807357788,
948
- "learning_rate": 2.3110720562390158e-05,
949
- "loss": 0.8637182116508484,
950
  "step": 264
951
  },
952
  {
953
  "epoch": 0.11223628691983123,
954
- "grad_norm": 1.806243896484375,
955
- "learning_rate": 2.3286467486818984e-05,
956
- "loss": 0.9554686546325684,
957
  "step": 266
958
  },
959
  {
960
  "epoch": 0.11308016877637131,
961
- "grad_norm": 1.9086743593215942,
962
- "learning_rate": 2.3462214411247803e-05,
963
- "loss": 0.9556593894958496,
964
  "step": 268
965
  },
966
  {
967
  "epoch": 0.11392405063291139,
968
- "grad_norm": 2.1822304725646973,
969
- "learning_rate": 2.3637961335676626e-05,
970
- "loss": 0.9177709817886353,
971
  "step": 270
972
  },
973
  {
974
  "epoch": 0.11476793248945148,
975
- "grad_norm": 2.1009039878845215,
976
- "learning_rate": 2.3813708260105448e-05,
977
- "loss": 0.9288759827613831,
978
  "step": 272
979
  },
980
  {
981
  "epoch": 0.11561181434599156,
982
- "grad_norm": 1.9814810752868652,
983
- "learning_rate": 2.398945518453427e-05,
984
- "loss": 0.9881691932678223,
985
  "step": 274
986
  },
987
  {
988
  "epoch": 0.11645569620253164,
989
- "grad_norm": 1.9946284294128418,
990
- "learning_rate": 2.4165202108963093e-05,
991
- "loss": 0.9390727281570435,
992
  "step": 276
993
  },
994
  {
995
  "epoch": 0.11729957805907174,
996
- "grad_norm": 2.4489169120788574,
997
- "learning_rate": 2.4340949033391916e-05,
998
- "loss": 0.9625692963600159,
999
  "step": 278
1000
  },
1001
  {
1002
  "epoch": 0.11814345991561181,
1003
- "grad_norm": 2.0919103622436523,
1004
- "learning_rate": 2.451669595782074e-05,
1005
- "loss": 0.9304702877998352,
1006
  "step": 280
1007
  },
1008
  {
1009
  "epoch": 0.1189873417721519,
1010
- "grad_norm": 1.912914752960205,
1011
- "learning_rate": 2.469244288224956e-05,
1012
- "loss": 0.9313994646072388,
1013
  "step": 282
1014
  },
1015
  {
1016
  "epoch": 0.11983122362869199,
1017
- "grad_norm": 2.1553256511688232,
1018
- "learning_rate": 2.4868189806678387e-05,
1019
- "loss": 1.004011869430542,
1020
  "step": 284
1021
  },
1022
  {
1023
  "epoch": 0.12067510548523207,
1024
- "grad_norm": 2.0129058361053467,
1025
- "learning_rate": 2.504393673110721e-05,
1026
- "loss": 0.9092531204223633,
1027
  "step": 286
1028
  },
1029
  {
1030
  "epoch": 0.12151898734177215,
1031
- "grad_norm": 2.1632325649261475,
1032
- "learning_rate": 2.5219683655536032e-05,
1033
- "loss": 0.993347704410553,
1034
  "step": 288
1035
  },
1036
  {
1037
  "epoch": 0.12236286919831224,
1038
- "grad_norm": 2.3072738647460938,
1039
- "learning_rate": 2.539543057996485e-05,
1040
- "loss": 0.978348433971405,
1041
  "step": 290
1042
  },
1043
  {
1044
  "epoch": 0.12320675105485232,
1045
- "grad_norm": 2.056560516357422,
1046
- "learning_rate": 2.5571177504393674e-05,
1047
- "loss": 1.0018101930618286,
1048
  "step": 292
1049
  },
1050
  {
1051
  "epoch": 0.1240506329113924,
1052
- "grad_norm": 1.8906747102737427,
1053
- "learning_rate": 2.5746924428822493e-05,
1054
- "loss": 0.9607775211334229,
1055
  "step": 294
1056
  },
1057
  {
1058
  "epoch": 0.1248945147679325,
1059
- "grad_norm": 2.1375651359558105,
1060
- "learning_rate": 2.5922671353251322e-05,
1061
- "loss": 0.9259153008460999,
1062
  "step": 296
1063
  },
1064
  {
1065
  "epoch": 0.1257383966244726,
1066
- "grad_norm": 1.9994823932647705,
1067
- "learning_rate": 2.609841827768014e-05,
1068
- "loss": 0.8524524569511414,
1069
  "step": 298
1070
  },
1071
  {
1072
  "epoch": 0.12658227848101267,
1073
- "grad_norm": 2.2421181201934814,
1074
- "learning_rate": 2.6274165202108964e-05,
1075
- "loss": 1.0047069787979126,
1076
  "step": 300
1077
  },
1078
  {
1079
  "epoch": 0.12658227848101267,
1080
- "eval_loss": 0.9517185688018799,
1081
- "eval_runtime": 860.0287,
1082
- "eval_samples_per_second": 2.45,
1083
- "eval_steps_per_second": 2.45,
1084
  "step": 300
1085
  },
1086
  {
1087
  "epoch": 0.12742616033755275,
1088
- "grad_norm": 2.1206254959106445,
1089
- "learning_rate": 2.6449912126537786e-05,
1090
- "loss": 0.8475471138954163,
1091
  "step": 302
1092
  },
1093
  {
1094
  "epoch": 0.12827004219409283,
1095
- "grad_norm": 1.885161280632019,
1096
- "learning_rate": 2.6625659050966612e-05,
1097
- "loss": 0.8643121123313904,
1098
  "step": 304
1099
  },
1100
  {
1101
  "epoch": 0.1291139240506329,
1102
- "grad_norm": 3.1441781520843506,
1103
- "learning_rate": 2.680140597539543e-05,
1104
- "loss": 0.8804612159729004,
1105
  "step": 306
1106
  },
1107
  {
1108
  "epoch": 0.12995780590717299,
1109
- "grad_norm": 1.953133225440979,
1110
- "learning_rate": 2.6977152899824254e-05,
1111
- "loss": 0.8348029255867004,
1112
  "step": 308
1113
  },
1114
  {
1115
  "epoch": 0.1308016877637131,
1116
- "grad_norm": 2.3762667179107666,
1117
- "learning_rate": 2.7152899824253076e-05,
1118
- "loss": 0.8889057040214539,
1119
  "step": 310
1120
  },
1121
  {
1122
  "epoch": 0.13164556962025317,
1123
- "grad_norm": 2.4651103019714355,
1124
- "learning_rate": 2.7328646748681902e-05,
1125
- "loss": 1.025565505027771,
1126
  "step": 312
1127
  },
1128
  {
1129
  "epoch": 0.13248945147679325,
1130
- "grad_norm": 1.8522284030914307,
1131
- "learning_rate": 2.7504393673110725e-05,
1132
- "loss": 0.868915855884552,
1133
  "step": 314
1134
  },
1135
  {
1136
  "epoch": 0.13333333333333333,
1137
- "grad_norm": 1.8048083782196045,
1138
- "learning_rate": 2.7680140597539544e-05,
1139
- "loss": 0.8821638226509094,
1140
  "step": 316
1141
  },
1142
  {
1143
  "epoch": 0.1341772151898734,
1144
- "grad_norm": 1.9933605194091797,
1145
- "learning_rate": 2.7855887521968367e-05,
1146
- "loss": 0.8735360503196716,
1147
  "step": 318
1148
  },
1149
  {
1150
  "epoch": 0.1350210970464135,
1151
- "grad_norm": 2.044337034225464,
1152
- "learning_rate": 2.8031634446397186e-05,
1153
- "loss": 0.8288834691047668,
1154
  "step": 320
1155
  },
1156
  {
1157
  "epoch": 0.1358649789029536,
1158
- "grad_norm": 2.416067361831665,
1159
- "learning_rate": 2.8207381370826015e-05,
1160
- "loss": 0.9104969501495361,
1161
  "step": 322
1162
  },
1163
  {
1164
  "epoch": 0.13670886075949368,
1165
- "grad_norm": 2.0731265544891357,
1166
- "learning_rate": 2.8383128295254834e-05,
1167
- "loss": 0.8689924478530884,
1168
  "step": 324
1169
  },
1170
  {
1171
  "epoch": 0.13755274261603376,
1172
- "grad_norm": 2.049126386642456,
1173
- "learning_rate": 2.8558875219683657e-05,
1174
- "loss": 0.9312222003936768,
1175
  "step": 326
1176
  },
1177
  {
1178
  "epoch": 0.13839662447257384,
1179
- "grad_norm": 2.131026268005371,
1180
- "learning_rate": 2.8734622144112476e-05,
1181
- "loss": 0.8933501839637756,
1182
  "step": 328
1183
  },
1184
  {
1185
  "epoch": 0.13924050632911392,
1186
- "grad_norm": 1.766754150390625,
1187
- "learning_rate": 2.8910369068541305e-05,
1188
- "loss": 0.8998261094093323,
1189
  "step": 330
1190
  },
1191
  {
1192
  "epoch": 0.140084388185654,
1193
- "grad_norm": 2.197706460952759,
1194
- "learning_rate": 2.9086115992970124e-05,
1195
- "loss": 0.8826426267623901,
1196
  "step": 332
1197
  },
1198
  {
1199
  "epoch": 0.1409282700421941,
1200
- "grad_norm": 1.953715443611145,
1201
- "learning_rate": 2.9261862917398947e-05,
1202
- "loss": 0.8590307831764221,
1203
  "step": 334
1204
  },
1205
  {
1206
  "epoch": 0.14177215189873418,
1207
- "grad_norm": 2.200929880142212,
1208
- "learning_rate": 2.943760984182777e-05,
1209
- "loss": 0.9317060708999634,
1210
  "step": 336
1211
  },
1212
  {
1213
  "epoch": 0.14261603375527426,
1214
- "grad_norm": 2.1195082664489746,
1215
- "learning_rate": 2.961335676625659e-05,
1216
- "loss": 0.9965578317642212,
1217
  "step": 338
1218
  },
1219
  {
1220
  "epoch": 0.14345991561181434,
1221
- "grad_norm": 2.3449771404266357,
1222
- "learning_rate": 2.9789103690685414e-05,
1223
- "loss": 0.8353848457336426,
1224
  "step": 340
1225
  },
1226
  {
1227
  "epoch": 0.14430379746835442,
1228
- "grad_norm": 2.000497579574585,
1229
- "learning_rate": 2.9964850615114237e-05,
1230
- "loss": 0.9154735803604126,
1231
  "step": 342
1232
  },
1233
  {
1234
  "epoch": 0.1451476793248945,
1235
- "grad_norm": 2.141890525817871,
1236
- "learning_rate": 3.014059753954306e-05,
1237
- "loss": 0.9530655741691589,
1238
  "step": 344
1239
  },
1240
  {
1241
  "epoch": 0.1459915611814346,
1242
- "grad_norm": 1.7717392444610596,
1243
- "learning_rate": 3.031634446397188e-05,
1244
- "loss": 0.896998405456543,
1245
  "step": 346
1246
  },
1247
  {
1248
  "epoch": 0.1468354430379747,
1249
- "grad_norm": 1.8796685934066772,
1250
- "learning_rate": 3.0492091388400708e-05,
1251
- "loss": 0.9084208011627197,
1252
  "step": 348
1253
  },
1254
  {
1255
  "epoch": 0.14767932489451477,
1256
- "grad_norm": 2.0298709869384766,
1257
- "learning_rate": 3.066783831282953e-05,
1258
- "loss": 0.9183387756347656,
1259
  "step": 350
1260
  },
1261
  {
1262
  "epoch": 0.14852320675105485,
1263
- "grad_norm": 1.9245645999908447,
1264
- "learning_rate": 3.084358523725835e-05,
1265
- "loss": 0.8624772429466248,
1266
  "step": 352
1267
  },
1268
  {
1269
  "epoch": 0.14936708860759493,
1270
- "grad_norm": 2.325681209564209,
1271
- "learning_rate": 3.101933216168717e-05,
1272
- "loss": 0.9142400026321411,
1273
  "step": 354
1274
  },
1275
  {
1276
  "epoch": 0.150210970464135,
1277
- "grad_norm": 2.1200530529022217,
1278
- "learning_rate": 3.1195079086115995e-05,
1279
- "loss": 0.9064018130302429,
1280
  "step": 356
1281
  },
1282
  {
1283
  "epoch": 0.15105485232067511,
1284
- "grad_norm": 1.979314923286438,
1285
- "learning_rate": 3.137082601054482e-05,
1286
- "loss": 0.9199238419532776,
1287
  "step": 358
1288
  },
1289
  {
1290
  "epoch": 0.1518987341772152,
1291
- "grad_norm": 2.1122689247131348,
1292
- "learning_rate": 3.154657293497364e-05,
1293
- "loss": 0.8030132055282593,
1294
  "step": 360
1295
  },
1296
  {
1297
  "epoch": 0.15274261603375527,
1298
- "grad_norm": 2.105767250061035,
1299
- "learning_rate": 3.172231985940246e-05,
1300
- "loss": 0.9185854196548462,
1301
  "step": 362
1302
  },
1303
  {
1304
  "epoch": 0.15358649789029535,
1305
- "grad_norm": 2.179471015930176,
1306
- "learning_rate": 3.1898066783831285e-05,
1307
- "loss": 0.9365083575248718,
1308
  "step": 364
1309
  },
1310
  {
1311
  "epoch": 0.15443037974683543,
1312
- "grad_norm": 2.1444311141967773,
1313
- "learning_rate": 3.207381370826011e-05,
1314
- "loss": 0.8965140581130981,
1315
  "step": 366
1316
  },
1317
  {
1318
  "epoch": 0.15527426160337554,
1319
- "grad_norm": 2.4171674251556396,
1320
- "learning_rate": 3.224956063268893e-05,
1321
- "loss": 0.8787504434585571,
1322
  "step": 368
1323
  },
1324
  {
1325
  "epoch": 0.15611814345991562,
1326
- "grad_norm": 2.418628215789795,
1327
- "learning_rate": 3.242530755711775e-05,
1328
- "loss": 0.8925284147262573,
1329
  "step": 370
1330
  },
1331
  {
1332
  "epoch": 0.1569620253164557,
1333
- "grad_norm": 2.2228314876556396,
1334
- "learning_rate": 3.2601054481546575e-05,
1335
- "loss": 0.876179039478302,
1336
  "step": 372
1337
  },
1338
  {
1339
  "epoch": 0.15780590717299578,
1340
- "grad_norm": 2.324237108230591,
1341
- "learning_rate": 3.27768014059754e-05,
1342
- "loss": 0.8365707993507385,
1343
  "step": 374
1344
  },
1345
  {
1346
  "epoch": 0.15864978902953586,
1347
- "grad_norm": 2.6344552040100098,
1348
- "learning_rate": 3.295254833040422e-05,
1349
- "loss": 0.7864399552345276,
1350
  "step": 376
1351
  },
1352
  {
1353
  "epoch": 0.15949367088607594,
1354
- "grad_norm": 2.047536611557007,
1355
- "learning_rate": 3.312829525483304e-05,
1356
- "loss": 0.9271875023841858,
1357
  "step": 378
1358
  },
1359
  {
1360
  "epoch": 0.16033755274261605,
1361
- "grad_norm": 2.120025157928467,
1362
- "learning_rate": 3.3304042179261865e-05,
1363
- "loss": 0.8799133896827698,
1364
  "step": 380
1365
  },
1366
  {
1367
  "epoch": 0.16118143459915613,
1368
- "grad_norm": 2.363692045211792,
1369
- "learning_rate": 3.347978910369069e-05,
1370
- "loss": 0.8973530530929565,
1371
  "step": 382
1372
  },
1373
  {
1374
  "epoch": 0.1620253164556962,
1375
- "grad_norm": 2.1796772480010986,
1376
- "learning_rate": 3.365553602811951e-05,
1377
- "loss": 1.0277652740478516,
1378
  "step": 384
1379
  },
1380
  {
1381
  "epoch": 0.16286919831223629,
1382
- "grad_norm": 1.9192595481872559,
1383
- "learning_rate": 3.383128295254833e-05,
1384
- "loss": 0.8909643888473511,
1385
  "step": 386
1386
  },
1387
  {
1388
  "epoch": 0.16371308016877636,
1389
- "grad_norm": 1.7874376773834229,
1390
- "learning_rate": 3.4007029876977155e-05,
1391
- "loss": 0.837049663066864,
1392
  "step": 388
1393
  },
1394
  {
1395
  "epoch": 0.16455696202531644,
1396
- "grad_norm": 2.3402366638183594,
1397
- "learning_rate": 3.4182776801405974e-05,
1398
- "loss": 0.8625202775001526,
1399
  "step": 390
1400
  },
1401
  {
1402
  "epoch": 0.16540084388185655,
1403
- "grad_norm": 2.1137185096740723,
1404
- "learning_rate": 3.43585237258348e-05,
1405
- "loss": 0.9288321137428284,
1406
  "step": 392
1407
  },
1408
  {
1409
  "epoch": 0.16624472573839663,
1410
- "grad_norm": 2.3776895999908447,
1411
- "learning_rate": 3.453427065026362e-05,
1412
- "loss": 0.9328726530075073,
1413
  "step": 394
1414
  },
1415
  {
1416
  "epoch": 0.1670886075949367,
1417
- "grad_norm": 2.34941029548645,
1418
- "learning_rate": 3.4710017574692445e-05,
1419
- "loss": 0.9273309707641602,
1420
  "step": 396
1421
  },
1422
  {
1423
  "epoch": 0.1679324894514768,
1424
- "grad_norm": 2.1272573471069336,
1425
- "learning_rate": 3.4885764499121264e-05,
1426
- "loss": 0.8703887462615967,
1427
  "step": 398
1428
  },
1429
  {
1430
  "epoch": 0.16877637130801687,
1431
- "grad_norm": 2.047290802001953,
1432
- "learning_rate": 3.506151142355009e-05,
1433
- "loss": 0.8808165788650513,
1434
  "step": 400
1435
  },
1436
  {
1437
  "epoch": 0.16877637130801687,
1438
- "eval_loss": 0.9282881617546082,
1439
- "eval_runtime": 869.6867,
1440
- "eval_samples_per_second": 2.423,
1441
- "eval_steps_per_second": 2.423,
1442
  "step": 400
1443
  },
1444
  {
1445
  "epoch": 0.16962025316455695,
1446
- "grad_norm": 1.9874159097671509,
1447
- "learning_rate": 3.5237258347978916e-05,
1448
- "loss": 0.9643645286560059,
1449
  "step": 402
1450
  },
1451
  {
1452
  "epoch": 0.17046413502109706,
1453
- "grad_norm": 1.9299919605255127,
1454
- "learning_rate": 3.5413005272407735e-05,
1455
- "loss": 0.9173495769500732,
1456
  "step": 404
1457
  },
1458
  {
1459
  "epoch": 0.17130801687763714,
1460
- "grad_norm": 2.3379697799682617,
1461
- "learning_rate": 3.5588752196836555e-05,
1462
- "loss": 0.8998411893844604,
1463
  "step": 406
1464
  },
1465
  {
1466
  "epoch": 0.17215189873417722,
1467
- "grad_norm": 2.241370916366577,
1468
- "learning_rate": 3.5764499121265374e-05,
1469
- "loss": 0.9310802221298218,
1470
  "step": 408
1471
  },
1472
  {
1473
  "epoch": 0.1729957805907173,
1474
- "grad_norm": 2.4490108489990234,
1475
- "learning_rate": 3.5940246045694206e-05,
1476
- "loss": 0.9605053067207336,
1477
  "step": 410
1478
  },
1479
  {
1480
  "epoch": 0.17383966244725738,
1481
- "grad_norm": 1.8247230052947998,
1482
- "learning_rate": 3.6115992970123026e-05,
1483
- "loss": 0.8485683798789978,
1484
  "step": 412
1485
  },
1486
  {
1487
  "epoch": 0.17468354430379746,
1488
- "grad_norm": 2.4608843326568604,
1489
- "learning_rate": 3.6291739894551845e-05,
1490
- "loss": 0.9325968623161316,
1491
  "step": 414
1492
  },
1493
  {
1494
  "epoch": 0.17552742616033756,
1495
- "grad_norm": 1.8923161029815674,
1496
- "learning_rate": 3.646748681898067e-05,
1497
- "loss": 0.9125096201896667,
1498
  "step": 416
1499
  },
1500
  {
1501
  "epoch": 0.17637130801687764,
1502
- "grad_norm": 1.8502769470214844,
1503
- "learning_rate": 3.6643233743409497e-05,
1504
- "loss": 0.8852217197418213,
1505
  "step": 418
1506
  },
1507
  {
1508
  "epoch": 0.17721518987341772,
1509
- "grad_norm": 1.9155100584030151,
1510
- "learning_rate": 3.6818980667838316e-05,
1511
- "loss": 0.9192792773246765,
1512
  "step": 420
1513
  },
1514
  {
1515
  "epoch": 0.1780590717299578,
1516
- "grad_norm": 2.181476593017578,
1517
- "learning_rate": 3.6994727592267135e-05,
1518
- "loss": 0.8787404298782349,
1519
  "step": 422
1520
  },
1521
  {
1522
  "epoch": 0.17890295358649788,
1523
- "grad_norm": 2.2469847202301025,
1524
- "learning_rate": 3.717047451669596e-05,
1525
- "loss": 0.9109582901000977,
1526
  "step": 424
1527
  },
1528
  {
1529
  "epoch": 0.17974683544303796,
1530
- "grad_norm": 2.08145809173584,
1531
- "learning_rate": 3.734622144112479e-05,
1532
- "loss": 0.8560389280319214,
1533
  "step": 426
1534
  },
1535
  {
1536
  "epoch": 0.18059071729957807,
1537
- "grad_norm": 4.121932506561279,
1538
- "learning_rate": 3.7521968365553606e-05,
1539
- "loss": 0.9456104040145874,
1540
  "step": 428
1541
  },
1542
  {
1543
  "epoch": 0.18143459915611815,
1544
- "grad_norm": 2.177459478378296,
1545
- "learning_rate": 3.7697715289982425e-05,
1546
- "loss": 0.8421300649642944,
1547
  "step": 430
1548
  },
1549
  {
1550
  "epoch": 0.18227848101265823,
1551
- "grad_norm": 2.324970245361328,
1552
- "learning_rate": 3.787346221441125e-05,
1553
- "loss": 0.9199858903884888,
1554
  "step": 432
1555
  },
1556
  {
1557
  "epoch": 0.1831223628691983,
1558
- "grad_norm": 2.133718490600586,
1559
- "learning_rate": 3.804920913884007e-05,
1560
- "loss": 0.8953126668930054,
1561
  "step": 434
1562
  },
1563
  {
1564
  "epoch": 0.1839662447257384,
1565
- "grad_norm": 1.8527995347976685,
1566
- "learning_rate": 3.8224956063268896e-05,
1567
- "loss": 0.8732239007949829,
1568
  "step": 436
1569
  },
1570
  {
1571
  "epoch": 0.1848101265822785,
1572
- "grad_norm": 1.95817232131958,
1573
- "learning_rate": 3.8400702987697715e-05,
1574
- "loss": 0.8818746209144592,
1575
  "step": 438
1576
  },
1577
  {
1578
  "epoch": 0.18565400843881857,
1579
- "grad_norm": 2.2107293605804443,
1580
- "learning_rate": 3.857644991212654e-05,
1581
- "loss": 0.9153507947921753,
1582
  "step": 440
1583
  },
1584
  {
1585
  "epoch": 0.18649789029535865,
1586
- "grad_norm": 2.004754066467285,
1587
- "learning_rate": 3.875219683655536e-05,
1588
- "loss": 0.8960154056549072,
1589
  "step": 442
1590
  },
1591
  {
1592
  "epoch": 0.18734177215189873,
1593
- "grad_norm": 2.1851706504821777,
1594
- "learning_rate": 3.8927943760984186e-05,
1595
- "loss": 0.909011721611023,
1596
  "step": 444
1597
  },
1598
  {
1599
  "epoch": 0.1881856540084388,
1600
- "grad_norm": 2.4492485523223877,
1601
- "learning_rate": 3.9103690685413005e-05,
1602
- "loss": 0.8880158066749573,
1603
  "step": 446
1604
  },
1605
  {
1606
  "epoch": 0.1890295358649789,
1607
- "grad_norm": 2.745453119277954,
1608
- "learning_rate": 3.927943760984183e-05,
1609
- "loss": 0.8500842452049255,
1610
  "step": 448
1611
  },
1612
  {
1613
  "epoch": 0.189873417721519,
1614
- "grad_norm": 2.1924264430999756,
1615
- "learning_rate": 3.945518453427065e-05,
1616
- "loss": 0.9004045724868774,
1617
  "step": 450
1618
  },
1619
  {
1620
  "epoch": 0.19071729957805908,
1621
- "grad_norm": 2.4051687717437744,
1622
- "learning_rate": 3.9630931458699476e-05,
1623
- "loss": 0.9020664095878601,
1624
  "step": 452
1625
  },
1626
  {
1627
  "epoch": 0.19156118143459916,
1628
- "grad_norm": 1.8077667951583862,
1629
- "learning_rate": 3.9806678383128295e-05,
1630
- "loss": 0.8639500737190247,
1631
  "step": 454
1632
  },
1633
  {
1634
  "epoch": 0.19240506329113924,
1635
- "grad_norm": 2.089043378829956,
1636
- "learning_rate": 3.998242530755712e-05,
1637
- "loss": 0.8642048239707947,
1638
  "step": 456
1639
  },
1640
  {
1641
  "epoch": 0.19324894514767932,
1642
- "grad_norm": 2.029578447341919,
1643
- "learning_rate": 4.015817223198594e-05,
1644
- "loss": 0.9371927380561829,
1645
  "step": 458
1646
  },
1647
  {
1648
  "epoch": 0.1940928270042194,
1649
- "grad_norm": 2.26582407951355,
1650
- "learning_rate": 4.033391915641476e-05,
1651
- "loss": 0.9120588302612305,
1652
  "step": 460
1653
  },
1654
  {
1655
  "epoch": 0.1949367088607595,
1656
- "grad_norm": 1.8671411275863647,
1657
- "learning_rate": 4.050966608084359e-05,
1658
- "loss": 0.8758644461631775,
1659
  "step": 462
1660
  },
1661
  {
1662
  "epoch": 0.19578059071729959,
1663
- "grad_norm": 1.9403492212295532,
1664
- "learning_rate": 4.068541300527241e-05,
1665
- "loss": 0.914577305316925,
1666
  "step": 464
1667
  },
1668
  {
1669
  "epoch": 0.19662447257383966,
1670
- "grad_norm": 1.9939641952514648,
1671
- "learning_rate": 4.086115992970123e-05,
1672
- "loss": 0.8592531681060791,
1673
  "step": 466
1674
  },
1675
  {
1676
  "epoch": 0.19746835443037974,
1677
- "grad_norm": 2.1511380672454834,
1678
- "learning_rate": 4.103690685413005e-05,
1679
- "loss": 0.9251965880393982,
1680
  "step": 468
1681
  },
1682
  {
1683
  "epoch": 0.19831223628691982,
1684
- "grad_norm": 2.2260982990264893,
1685
- "learning_rate": 4.121265377855888e-05,
1686
- "loss": 0.8465172052383423,
1687
  "step": 470
1688
  },
1689
  {
1690
  "epoch": 0.1991561181434599,
1691
- "grad_norm": 2.0510010719299316,
1692
- "learning_rate": 4.13884007029877e-05,
1693
- "loss": 0.8943672180175781,
1694
  "step": 472
1695
  },
1696
  {
1697
  "epoch": 0.2,
1698
- "grad_norm": 2.2040133476257324,
1699
- "learning_rate": 4.156414762741652e-05,
1700
- "loss": 0.9594319462776184,
1701
  "step": 474
1702
  },
1703
  {
1704
  "epoch": 0.2008438818565401,
1705
- "grad_norm": 2.355181932449341,
1706
- "learning_rate": 4.173989455184534e-05,
1707
- "loss": 0.9031813144683838,
1708
  "step": 476
1709
  },
1710
  {
1711
  "epoch": 0.20168776371308017,
1712
- "grad_norm": 2.8434665203094482,
1713
- "learning_rate": 4.1915641476274166e-05,
1714
- "loss": 0.9225798845291138,
1715
  "step": 478
1716
  },
1717
  {
1718
  "epoch": 0.20253164556962025,
1719
- "grad_norm": 2.1715340614318848,
1720
- "learning_rate": 4.209138840070299e-05,
1721
- "loss": 0.894163966178894,
1722
  "step": 480
1723
  },
1724
  {
1725
  "epoch": 0.20337552742616033,
1726
- "grad_norm": 2.078916072845459,
1727
- "learning_rate": 4.226713532513181e-05,
1728
- "loss": 0.8424109816551208,
1729
  "step": 482
1730
  },
1731
  {
1732
  "epoch": 0.2042194092827004,
1733
- "grad_norm": 1.9760961532592773,
1734
- "learning_rate": 4.244288224956064e-05,
1735
- "loss": 0.9102715849876404,
1736
  "step": 484
1737
  },
1738
  {
1739
  "epoch": 0.20506329113924052,
1740
- "grad_norm": 1.9684507846832275,
1741
- "learning_rate": 4.2618629173989456e-05,
1742
- "loss": 0.8693854808807373,
1743
  "step": 486
1744
  },
1745
  {
1746
  "epoch": 0.2059071729957806,
1747
- "grad_norm": 2.1633450984954834,
1748
- "learning_rate": 4.279437609841828e-05,
1749
- "loss": 0.8617543578147888,
1750
  "step": 488
1751
  },
1752
  {
1753
  "epoch": 0.20675105485232068,
1754
- "grad_norm": 2.2695257663726807,
1755
- "learning_rate": 4.29701230228471e-05,
1756
- "loss": 0.9167086482048035,
1757
  "step": 490
1758
  },
1759
  {
1760
  "epoch": 0.20759493670886076,
1761
- "grad_norm": 2.4180049896240234,
1762
- "learning_rate": 4.314586994727593e-05,
1763
- "loss": 0.8333520889282227,
1764
  "step": 492
1765
  },
1766
  {
1767
  "epoch": 0.20843881856540084,
1768
- "grad_norm": 2.2942769527435303,
1769
- "learning_rate": 4.3321616871704746e-05,
1770
- "loss": 0.918351411819458,
1771
  "step": 494
1772
  },
1773
  {
1774
  "epoch": 0.20928270042194091,
1775
- "grad_norm": 1.826458215713501,
1776
- "learning_rate": 4.349736379613357e-05,
1777
- "loss": 0.8565171957015991,
1778
  "step": 496
1779
  },
1780
  {
1781
  "epoch": 0.21012658227848102,
1782
- "grad_norm": 1.9694055318832397,
1783
- "learning_rate": 4.367311072056239e-05,
1784
- "loss": 0.8684167861938477,
1785
  "step": 498
1786
  },
1787
  {
1788
  "epoch": 0.2109704641350211,
1789
- "grad_norm": 1.892659306526184,
1790
- "learning_rate": 4.384885764499122e-05,
1791
- "loss": 0.7752788662910461,
1792
  "step": 500
1793
  },
1794
  {
1795
  "epoch": 0.2109704641350211,
1796
- "eval_loss": 0.9080732464790344,
1797
- "eval_runtime": 857.0753,
1798
- "eval_samples_per_second": 2.458,
1799
- "eval_steps_per_second": 2.458,
1800
  "step": 500
1801
  }
1802
  ],
@@ -1826,7 +1826,7 @@
1826
  "attributes": {}
1827
  }
1828
  },
1829
- "total_flos": 5.1928835720736154e+17,
1830
  "train_batch_size": 1,
1831
  "trial_name": null,
1832
  "trial_params": null
 
1
  {
2
  "best_global_step": 500,
3
+ "best_metric": 1.042096495628357,
4
+ "best_model_checkpoint": "task2file/sft_qwen_14B_v2/checkpoints/checkpoint-500",
5
  "epoch": 0.2109704641350211,
6
  "eval_steps": 100,
7
  "global_step": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0008438818565400844,
14
+ "grad_norm": 0.5386583805084229,
15
+ "learning_rate": 1.7574692442882248e-07,
16
+ "loss": 1.6941628456115723,
17
  "step": 2
18
  },
19
  {
20
  "epoch": 0.0016877637130801688,
21
+ "grad_norm": 0.5477277636528015,
22
+ "learning_rate": 5.272407732864675e-07,
23
+ "loss": 1.7132279872894287,
24
  "step": 4
25
  },
26
  {
27
  "epoch": 0.002531645569620253,
28
+ "grad_norm": 0.5390765070915222,
29
+ "learning_rate": 8.787346221441126e-07,
30
+ "loss": 1.641180396080017,
31
  "step": 6
32
  },
33
  {
34
  "epoch": 0.0033755274261603376,
35
+ "grad_norm": 0.5023683905601501,
36
+ "learning_rate": 1.2302284710017575e-06,
37
+ "loss": 1.5616240501403809,
38
  "step": 8
39
  },
40
  {
41
  "epoch": 0.004219409282700422,
42
+ "grad_norm": 0.4899154603481293,
43
+ "learning_rate": 1.5817223198594026e-06,
44
+ "loss": 1.572033405303955,
45
  "step": 10
46
  },
47
  {
48
  "epoch": 0.005063291139240506,
49
+ "grad_norm": 0.5239788293838501,
50
+ "learning_rate": 1.9332161687170474e-06,
51
+ "loss": 1.6242921352386475,
52
  "step": 12
53
  },
54
  {
55
  "epoch": 0.00590717299578059,
56
+ "grad_norm": 0.5172926783561707,
57
+ "learning_rate": 2.2847100175746925e-06,
58
+ "loss": 1.6800041198730469,
59
  "step": 14
60
  },
61
  {
62
  "epoch": 0.006751054852320675,
63
+ "grad_norm": 0.5539224743843079,
64
+ "learning_rate": 2.6362038664323376e-06,
65
+ "loss": 1.6450834274291992,
66
  "step": 16
67
  },
68
  {
69
  "epoch": 0.007594936708860759,
70
+ "grad_norm": 0.5255337953567505,
71
+ "learning_rate": 2.9876977152899827e-06,
72
+ "loss": 1.6673263311386108,
73
  "step": 18
74
  },
75
  {
76
  "epoch": 0.008438818565400843,
77
+ "grad_norm": 0.5074548721313477,
78
+ "learning_rate": 3.3391915641476277e-06,
79
+ "loss": 1.531802773475647,
80
  "step": 20
81
  },
82
  {
83
  "epoch": 0.009282700421940928,
84
+ "grad_norm": 0.4160279333591461,
85
+ "learning_rate": 3.6906854130052724e-06,
86
+ "loss": 1.599354863166809,
87
  "step": 22
88
  },
89
  {
90
  "epoch": 0.010126582278481013,
91
+ "grad_norm": 0.5716474652290344,
92
+ "learning_rate": 4.0421792618629174e-06,
93
+ "loss": 1.6700962781906128,
94
  "step": 24
95
  },
96
  {
97
  "epoch": 0.010970464135021098,
98
+ "grad_norm": 0.5148899555206299,
99
+ "learning_rate": 4.3936731107205625e-06,
100
+ "loss": 1.66217839717865,
101
  "step": 26
102
  },
103
  {
104
  "epoch": 0.01181434599156118,
105
+ "grad_norm": 0.575722336769104,
106
+ "learning_rate": 4.7451669595782076e-06,
107
+ "loss": 1.6692266464233398,
108
  "step": 28
109
  },
110
  {
111
  "epoch": 0.012658227848101266,
112
+ "grad_norm": 0.5345953106880188,
113
+ "learning_rate": 5.096660808435853e-06,
114
+ "loss": 1.5518689155578613,
115
  "step": 30
116
  },
117
  {
118
  "epoch": 0.01350210970464135,
119
+ "grad_norm": 0.4462043344974518,
120
+ "learning_rate": 5.448154657293498e-06,
121
+ "loss": 1.5930007696151733,
122
  "step": 32
123
  },
124
  {
125
  "epoch": 0.014345991561181435,
126
+ "grad_norm": 0.5119605660438538,
127
+ "learning_rate": 5.799648506151143e-06,
128
+ "loss": 1.6069684028625488,
129
  "step": 34
130
  },
131
  {
132
  "epoch": 0.015189873417721518,
133
+ "grad_norm": 0.5328608751296997,
134
+ "learning_rate": 6.151142355008788e-06,
135
+ "loss": 1.5838109254837036,
136
  "step": 36
137
  },
138
  {
139
  "epoch": 0.016033755274261603,
140
+ "grad_norm": 0.5065920352935791,
141
+ "learning_rate": 6.502636203866433e-06,
142
+ "loss": 1.608130931854248,
143
  "step": 38
144
  },
145
  {
146
  "epoch": 0.016877637130801686,
147
+ "grad_norm": 0.4479359984397888,
148
+ "learning_rate": 6.854130052724078e-06,
149
+ "loss": 1.5942182540893555,
150
  "step": 40
151
  },
152
  {
153
  "epoch": 0.017721518987341773,
154
+ "grad_norm": 0.42844903469085693,
155
+ "learning_rate": 7.205623901581722e-06,
156
+ "loss": 1.6441553831100464,
157
  "step": 42
158
  },
159
  {
160
  "epoch": 0.018565400843881856,
161
+ "grad_norm": 0.476630836725235,
162
+ "learning_rate": 7.557117750439367e-06,
163
+ "loss": 1.6068111658096313,
164
  "step": 44
165
  },
166
  {
167
  "epoch": 0.019409282700421943,
168
+ "grad_norm": 0.4532654881477356,
169
+ "learning_rate": 7.908611599297012e-06,
170
+ "loss": 1.6618021726608276,
171
  "step": 46
172
  },
173
  {
174
  "epoch": 0.020253164556962026,
175
+ "grad_norm": 0.3701118230819702,
176
+ "learning_rate": 8.260105448154657e-06,
177
+ "loss": 1.4730033874511719,
178
  "step": 48
179
  },
180
  {
181
  "epoch": 0.02109704641350211,
182
+ "grad_norm": 0.38471561670303345,
183
+ "learning_rate": 8.611599297012302e-06,
184
+ "loss": 1.4828267097473145,
185
  "step": 50
186
  },
187
  {
188
  "epoch": 0.021940928270042195,
189
+ "grad_norm": 0.3602336347103119,
190
+ "learning_rate": 8.963093145869948e-06,
191
+ "loss": 1.3877452611923218,
192
  "step": 52
193
  },
194
  {
195
  "epoch": 0.02278481012658228,
196
+ "grad_norm": 0.40318572521209717,
197
+ "learning_rate": 9.314586994727593e-06,
198
+ "loss": 1.49052894115448,
199
  "step": 54
200
  },
201
  {
202
  "epoch": 0.02362869198312236,
203
+ "grad_norm": 0.3223826587200165,
204
+ "learning_rate": 9.666080843585238e-06,
205
+ "loss": 1.4912524223327637,
206
  "step": 56
207
  },
208
  {
209
  "epoch": 0.024472573839662448,
210
+ "grad_norm": 0.3873065114021301,
211
+ "learning_rate": 1.0017574692442883e-05,
212
+ "loss": 1.526674509048462,
213
  "step": 58
214
  },
215
  {
216
  "epoch": 0.02531645569620253,
217
+ "grad_norm": 0.410159707069397,
218
+ "learning_rate": 1.0369068541300528e-05,
219
+ "loss": 1.4480271339416504,
220
  "step": 60
221
  },
222
  {
223
  "epoch": 0.026160337552742614,
224
+ "grad_norm": 0.3632003962993622,
225
+ "learning_rate": 1.0720562390158173e-05,
226
+ "loss": 1.4222990274429321,
227
  "step": 62
228
  },
229
  {
230
  "epoch": 0.0270042194092827,
231
+ "grad_norm": 0.33118435740470886,
232
+ "learning_rate": 1.1072056239015818e-05,
233
+ "loss": 1.387171745300293,
234
  "step": 64
235
  },
236
  {
237
  "epoch": 0.027848101265822784,
238
+ "grad_norm": 0.3301764726638794,
239
+ "learning_rate": 1.1423550087873463e-05,
240
+ "loss": 1.3523777723312378,
241
  "step": 66
242
  },
243
  {
244
  "epoch": 0.02869198312236287,
245
+ "grad_norm": 0.34342435002326965,
246
+ "learning_rate": 1.1775043936731108e-05,
247
+ "loss": 1.4515162706375122,
248
  "step": 68
249
  },
250
  {
251
  "epoch": 0.029535864978902954,
252
+ "grad_norm": 0.3243122100830078,
253
+ "learning_rate": 1.2126537785588753e-05,
254
+ "loss": 1.3509243726730347,
255
  "step": 70
256
  },
257
  {
258
  "epoch": 0.030379746835443037,
259
+ "grad_norm": 0.3450150787830353,
260
+ "learning_rate": 1.2478031634446398e-05,
261
+ "loss": 1.4936245679855347,
262
  "step": 72
263
  },
264
  {
265
  "epoch": 0.031223628691983123,
266
+ "grad_norm": 0.38912028074264526,
267
+ "learning_rate": 1.2829525483304042e-05,
268
+ "loss": 1.3419109582901,
269
  "step": 74
270
  },
271
  {
272
  "epoch": 0.032067510548523206,
273
+ "grad_norm": 0.3019310235977173,
274
+ "learning_rate": 1.3181019332161687e-05,
275
+ "loss": 1.4284154176712036,
276
  "step": 76
277
  },
278
  {
279
  "epoch": 0.03291139240506329,
280
+ "grad_norm": 0.37803682684898376,
281
+ "learning_rate": 1.3532513181019332e-05,
282
+ "loss": 1.4256561994552612,
283
  "step": 78
284
  },
285
  {
286
  "epoch": 0.03375527426160337,
287
+ "grad_norm": 0.34191736578941345,
288
+ "learning_rate": 1.3884007029876977e-05,
289
+ "loss": 1.3256909847259521,
290
  "step": 80
291
  },
292
  {
293
  "epoch": 0.03459915611814346,
294
+ "grad_norm": 0.35242700576782227,
295
+ "learning_rate": 1.4235500878734624e-05,
296
+ "loss": 1.2710685729980469,
297
  "step": 82
298
  },
299
  {
300
  "epoch": 0.035443037974683546,
301
+ "grad_norm": 0.38094228506088257,
302
+ "learning_rate": 1.4586994727592269e-05,
303
+ "loss": 1.253411889076233,
304
  "step": 84
305
  },
306
  {
307
  "epoch": 0.036286919831223625,
308
+ "grad_norm": 0.36837366223335266,
309
+ "learning_rate": 1.4938488576449914e-05,
310
+ "loss": 1.3064342737197876,
311
  "step": 86
312
  },
313
  {
314
  "epoch": 0.03713080168776371,
315
+ "grad_norm": 0.3443569242954254,
316
+ "learning_rate": 1.5289982425307557e-05,
317
+ "loss": 1.293562412261963,
318
  "step": 88
319
  },
320
  {
321
  "epoch": 0.0379746835443038,
322
+ "grad_norm": 0.3799338936805725,
323
+ "learning_rate": 1.5641476274165202e-05,
324
+ "loss": 1.3382648229599,
325
  "step": 90
326
  },
327
  {
328
  "epoch": 0.038818565400843885,
329
+ "grad_norm": 0.40501922369003296,
330
+ "learning_rate": 1.599297012302285e-05,
331
+ "loss": 1.3925724029541016,
332
  "step": 92
333
  },
334
  {
335
  "epoch": 0.039662447257383965,
336
+ "grad_norm": 0.4419630467891693,
337
+ "learning_rate": 1.6344463971880492e-05,
338
+ "loss": 1.357171893119812,
339
  "step": 94
340
  },
341
  {
342
  "epoch": 0.04050632911392405,
343
+ "grad_norm": 0.3619817793369293,
344
+ "learning_rate": 1.6695957820738137e-05,
345
+ "loss": 1.3029985427856445,
346
  "step": 96
347
  },
348
  {
349
  "epoch": 0.04135021097046414,
350
+ "grad_norm": 0.4851357340812683,
351
+ "learning_rate": 1.7047451669595782e-05,
352
+ "loss": 1.3498191833496094,
353
  "step": 98
354
  },
355
  {
356
  "epoch": 0.04219409282700422,
357
+ "grad_norm": 0.418658584356308,
358
+ "learning_rate": 1.7398945518453427e-05,
359
+ "loss": 1.185287356376648,
360
  "step": 100
361
  },
362
  {
363
  "epoch": 0.04219409282700422,
364
+ "eval_loss": 1.2979938983917236,
365
+ "eval_runtime": 682.1979,
366
+ "eval_samples_per_second": 3.089,
367
+ "eval_steps_per_second": 3.089,
368
  "step": 100
369
  },
370
  {
371
  "epoch": 0.043037974683544304,
372
+ "grad_norm": 0.4464418888092041,
373
+ "learning_rate": 1.7750439367311073e-05,
374
+ "loss": 1.2217272520065308,
375
  "step": 102
376
  },
377
  {
378
  "epoch": 0.04388185654008439,
379
+ "grad_norm": 0.4706237316131592,
380
+ "learning_rate": 1.8101933216168718e-05,
381
+ "loss": 1.2052050828933716,
382
  "step": 104
383
  },
384
  {
385
  "epoch": 0.04472573839662447,
386
+ "grad_norm": 0.46394404768943787,
387
+ "learning_rate": 1.8453427065026363e-05,
388
+ "loss": 1.221343994140625,
389
  "step": 106
390
  },
391
  {
392
  "epoch": 0.04556962025316456,
393
+ "grad_norm": 0.4726889431476593,
394
+ "learning_rate": 1.8804920913884008e-05,
395
+ "loss": 1.2387475967407227,
396
  "step": 108
397
  },
398
  {
399
  "epoch": 0.046413502109704644,
400
+ "grad_norm": 0.42130985856056213,
401
+ "learning_rate": 1.9156414762741653e-05,
402
+ "loss": 1.2851309776306152,
403
  "step": 110
404
  },
405
  {
406
  "epoch": 0.04725738396624472,
407
+ "grad_norm": 0.4504576623439789,
408
+ "learning_rate": 1.9507908611599298e-05,
409
+ "loss": 1.2753145694732666,
410
  "step": 112
411
  },
412
  {
413
  "epoch": 0.04810126582278481,
414
+ "grad_norm": 0.396085262298584,
415
+ "learning_rate": 1.9859402460456943e-05,
416
+ "loss": 1.2427717447280884,
417
  "step": 114
418
  },
419
  {
420
  "epoch": 0.048945147679324896,
421
+ "grad_norm": 0.5106491446495056,
422
+ "learning_rate": 2.0210896309314588e-05,
423
+ "loss": 1.2943825721740723,
424
  "step": 116
425
  },
426
  {
427
  "epoch": 0.049789029535864976,
428
+ "grad_norm": 0.42351317405700684,
429
+ "learning_rate": 2.0562390158172233e-05,
430
+ "loss": 1.263301134109497,
431
  "step": 118
432
  },
433
  {
434
  "epoch": 0.05063291139240506,
435
+ "grad_norm": 0.4403539299964905,
436
+ "learning_rate": 2.0913884007029878e-05,
437
+ "loss": 1.2647849321365356,
438
  "step": 120
439
  },
440
  {
441
  "epoch": 0.05147679324894515,
442
+ "grad_norm": 0.5260752439498901,
443
+ "learning_rate": 2.1265377855887523e-05,
444
+ "loss": 1.2351393699645996,
445
  "step": 122
446
  },
447
  {
448
  "epoch": 0.05232067510548523,
449
+ "grad_norm": 0.44978851079940796,
450
+ "learning_rate": 2.1616871704745168e-05,
451
+ "loss": 1.0384471416473389,
452
  "step": 124
453
  },
454
  {
455
  "epoch": 0.053164556962025315,
456
+ "grad_norm": 0.47732362151145935,
457
+ "learning_rate": 2.1968365553602813e-05,
458
+ "loss": 1.1518068313598633,
459
  "step": 126
460
  },
461
  {
462
  "epoch": 0.0540084388185654,
463
+ "grad_norm": 0.5473551750183105,
464
+ "learning_rate": 2.231985940246046e-05,
465
+ "loss": 1.2264912128448486,
466
  "step": 128
467
  },
468
  {
469
  "epoch": 0.05485232067510549,
470
+ "grad_norm": 0.4473855197429657,
471
+ "learning_rate": 2.2671353251318103e-05,
472
+ "loss": 1.1615246534347534,
473
  "step": 130
474
  },
475
  {
476
  "epoch": 0.05569620253164557,
477
+ "grad_norm": 0.5980377197265625,
478
+ "learning_rate": 2.302284710017575e-05,
479
+ "loss": 1.1334880590438843,
480
  "step": 132
481
  },
482
  {
483
  "epoch": 0.056540084388185655,
484
+ "grad_norm": 0.5987792015075684,
485
+ "learning_rate": 2.3374340949033394e-05,
486
+ "loss": 1.1546804904937744,
487
  "step": 134
488
  },
489
  {
490
  "epoch": 0.05738396624472574,
491
+ "grad_norm": 0.45355498790740967,
492
+ "learning_rate": 2.372583479789104e-05,
493
+ "loss": 1.194953441619873,
494
  "step": 136
495
  },
496
  {
497
  "epoch": 0.05822784810126582,
498
+ "grad_norm": 0.5373698472976685,
499
+ "learning_rate": 2.4077328646748684e-05,
500
+ "loss": 1.1067466735839844,
501
  "step": 138
502
  },
503
  {
504
  "epoch": 0.05907172995780591,
505
+ "grad_norm": 0.48734328150749207,
506
+ "learning_rate": 2.442882249560633e-05,
507
+ "loss": 1.188468098640442,
508
  "step": 140
509
  },
510
  {
511
  "epoch": 0.059915611814345994,
512
+ "grad_norm": 0.4692173898220062,
513
+ "learning_rate": 2.478031634446397e-05,
514
+ "loss": 1.1624362468719482,
515
  "step": 142
516
  },
517
  {
518
  "epoch": 0.060759493670886074,
519
+ "grad_norm": 0.532554030418396,
520
+ "learning_rate": 2.513181019332162e-05,
521
+ "loss": 1.0978907346725464,
522
  "step": 144
523
  },
524
  {
525
  "epoch": 0.06160337552742616,
526
+ "grad_norm": 0.5853802561759949,
527
+ "learning_rate": 2.5483304042179264e-05,
528
+ "loss": 1.2030781507492065,
529
  "step": 146
530
  },
531
  {
532
  "epoch": 0.06244725738396625,
533
+ "grad_norm": 0.5061611533164978,
534
+ "learning_rate": 2.583479789103691e-05,
535
+ "loss": 1.082366943359375,
536
  "step": 148
537
  },
538
  {
539
  "epoch": 0.06329113924050633,
540
+ "grad_norm": 0.49426141381263733,
541
+ "learning_rate": 2.6186291739894554e-05,
542
+ "loss": 1.10564386844635,
543
  "step": 150
544
  },
545
  {
546
  "epoch": 0.06413502109704641,
547
+ "grad_norm": 0.5846618413925171,
548
+ "learning_rate": 2.6537785588752196e-05,
549
+ "loss": 1.1992807388305664,
550
  "step": 152
551
  },
552
  {
553
  "epoch": 0.06497890295358649,
554
+ "grad_norm": 0.5517552495002747,
555
+ "learning_rate": 2.6889279437609844e-05,
556
+ "loss": 1.1757566928863525,
557
  "step": 154
558
  },
559
  {
560
  "epoch": 0.06582278481012659,
561
+ "grad_norm": 0.5667305588722229,
562
+ "learning_rate": 2.7240773286467486e-05,
563
+ "loss": 1.0548783540725708,
564
  "step": 156
565
  },
566
  {
567
  "epoch": 0.06666666666666667,
568
+ "grad_norm": 0.6760414242744446,
569
+ "learning_rate": 2.7592267135325134e-05,
570
+ "loss": 1.184364914894104,
571
  "step": 158
572
  },
573
  {
574
  "epoch": 0.06751054852320675,
575
+ "grad_norm": 0.5261430740356445,
576
+ "learning_rate": 2.7943760984182776e-05,
577
+ "loss": 1.1945042610168457,
578
  "step": 160
579
  },
580
  {
581
  "epoch": 0.06835443037974684,
582
+ "grad_norm": 0.6155015230178833,
583
+ "learning_rate": 2.8295254833040425e-05,
584
+ "loss": 1.2021973133087158,
585
  "step": 162
586
  },
587
  {
588
  "epoch": 0.06919831223628692,
589
+ "grad_norm": 0.6131619215011597,
590
+ "learning_rate": 2.8646748681898066e-05,
591
+ "loss": 1.144123911857605,
592
  "step": 164
593
  },
594
  {
595
  "epoch": 0.070042194092827,
596
+ "grad_norm": 0.5749185681343079,
597
+ "learning_rate": 2.8998242530755715e-05,
598
+ "loss": 1.1329256296157837,
599
  "step": 166
600
  },
601
  {
602
  "epoch": 0.07088607594936709,
603
+ "grad_norm": 0.5243118405342102,
604
+ "learning_rate": 2.9349736379613356e-05,
605
+ "loss": 1.0892387628555298,
606
  "step": 168
607
  },
608
  {
609
  "epoch": 0.07172995780590717,
610
+ "grad_norm": 0.7190104722976685,
611
+ "learning_rate": 2.9701230228471005e-05,
612
+ "loss": 1.163260817527771,
613
  "step": 170
614
  },
615
  {
616
  "epoch": 0.07257383966244725,
617
+ "grad_norm": 0.5486982464790344,
618
+ "learning_rate": 3.0052724077328647e-05,
619
+ "loss": 1.0880777835845947,
620
  "step": 172
621
  },
622
  {
623
  "epoch": 0.07341772151898734,
624
+ "grad_norm": 0.5020889043807983,
625
+ "learning_rate": 3.0404217926186295e-05,
626
+ "loss": 1.0433368682861328,
627
  "step": 174
628
  },
629
  {
630
  "epoch": 0.07426160337552742,
631
+ "grad_norm": 0.47329774498939514,
632
+ "learning_rate": 3.075571177504394e-05,
633
+ "loss": 1.0528991222381592,
634
  "step": 176
635
  },
636
  {
637
  "epoch": 0.0751054852320675,
638
+ "grad_norm": 0.6635547876358032,
639
+ "learning_rate": 3.110720562390158e-05,
640
+ "loss": 1.1627811193466187,
641
  "step": 178
642
  },
643
  {
644
  "epoch": 0.0759493670886076,
645
+ "grad_norm": 0.5624618530273438,
646
+ "learning_rate": 3.145869947275923e-05,
647
+ "loss": 1.084869384765625,
648
  "step": 180
649
  },
650
  {
651
  "epoch": 0.07679324894514768,
652
+ "grad_norm": 0.6029536724090576,
653
+ "learning_rate": 3.181019332161687e-05,
654
+ "loss": 1.2227671146392822,
655
  "step": 182
656
  },
657
  {
658
  "epoch": 0.07763713080168777,
659
+ "grad_norm": 0.930959939956665,
660
+ "learning_rate": 3.216168717047452e-05,
661
+ "loss": 1.0955452919006348,
662
  "step": 184
663
  },
664
  {
665
  "epoch": 0.07848101265822785,
666
+ "grad_norm": 0.5326952338218689,
667
+ "learning_rate": 3.251318101933216e-05,
668
+ "loss": 1.0640798807144165,
669
  "step": 186
670
  },
671
  {
672
  "epoch": 0.07932489451476793,
673
+ "grad_norm": 0.5484727621078491,
674
+ "learning_rate": 3.286467486818981e-05,
675
+ "loss": 1.0700589418411255,
676
  "step": 188
677
  },
678
  {
679
  "epoch": 0.08016877637130802,
680
+ "grad_norm": 0.605273425579071,
681
+ "learning_rate": 3.3216168717047456e-05,
682
+ "loss": 1.1593081951141357,
683
  "step": 190
684
  },
685
  {
686
  "epoch": 0.0810126582278481,
687
+ "grad_norm": 0.5704394578933716,
688
+ "learning_rate": 3.35676625659051e-05,
689
+ "loss": 1.1617076396942139,
690
  "step": 192
691
  },
692
  {
693
  "epoch": 0.08185654008438818,
694
+ "grad_norm": 0.5929452180862427,
695
+ "learning_rate": 3.3919156414762746e-05,
696
+ "loss": 1.1346839666366577,
697
  "step": 194
698
  },
699
  {
700
  "epoch": 0.08270042194092828,
701
+ "grad_norm": 0.5624077916145325,
702
+ "learning_rate": 3.427065026362039e-05,
703
+ "loss": 1.0934710502624512,
704
  "step": 196
705
  },
706
  {
707
  "epoch": 0.08354430379746836,
708
+ "grad_norm": 0.6717425584793091,
709
+ "learning_rate": 3.4622144112478036e-05,
710
+ "loss": 1.1810534000396729,
711
  "step": 198
712
  },
713
  {
714
  "epoch": 0.08438818565400844,
715
+ "grad_norm": 0.5120199918746948,
716
+ "learning_rate": 3.4973637961335674e-05,
717
+ "loss": 1.1525514125823975,
718
  "step": 200
719
  },
720
  {
721
  "epoch": 0.08438818565400844,
722
+ "eval_loss": 1.142486810684204,
723
+ "eval_runtime": 668.2356,
724
+ "eval_samples_per_second": 3.153,
725
+ "eval_steps_per_second": 3.153,
726
  "step": 200
727
  },
728
  {
729
  "epoch": 0.08523206751054853,
730
+ "grad_norm": 0.5144487023353577,
731
+ "learning_rate": 3.5325131810193326e-05,
732
+ "loss": 1.0243735313415527,
733
  "step": 202
734
  },
735
  {
736
  "epoch": 0.08607594936708861,
737
+ "grad_norm": 0.6325069069862366,
738
+ "learning_rate": 3.5676625659050964e-05,
739
+ "loss": 1.118743896484375,
740
  "step": 204
741
  },
742
  {
743
  "epoch": 0.08691983122362869,
744
+ "grad_norm": 0.5501633882522583,
745
+ "learning_rate": 3.6028119507908616e-05,
746
+ "loss": 1.0380504131317139,
747
  "step": 206
748
  },
749
  {
750
  "epoch": 0.08776371308016878,
751
+ "grad_norm": 0.6133899688720703,
752
+ "learning_rate": 3.6379613356766254e-05,
753
+ "loss": 0.9837555885314941,
754
  "step": 208
755
  },
756
  {
757
  "epoch": 0.08860759493670886,
758
+ "grad_norm": 0.5799810886383057,
759
+ "learning_rate": 3.6731107205623906e-05,
760
+ "loss": 1.090720295906067,
761
  "step": 210
762
  },
763
  {
764
  "epoch": 0.08945147679324894,
765
+ "grad_norm": 0.6039511561393738,
766
+ "learning_rate": 3.7082601054481544e-05,
767
+ "loss": 1.120232343673706,
768
  "step": 212
769
  },
770
  {
771
  "epoch": 0.09029535864978903,
772
+ "grad_norm": 0.5983024835586548,
773
+ "learning_rate": 3.7434094903339196e-05,
774
+ "loss": 1.096949815750122,
775
  "step": 214
776
  },
777
  {
778
  "epoch": 0.09113924050632911,
779
+ "grad_norm": 0.5641079545021057,
780
+ "learning_rate": 3.7785588752196835e-05,
781
+ "loss": 1.1226298809051514,
782
  "step": 216
783
  },
784
  {
785
  "epoch": 0.0919831223628692,
786
+ "grad_norm": 0.655717134475708,
787
+ "learning_rate": 3.8137082601054486e-05,
788
+ "loss": 1.1260643005371094,
789
  "step": 218
790
  },
791
  {
792
  "epoch": 0.09282700421940929,
793
+ "grad_norm": 0.6111898422241211,
794
+ "learning_rate": 3.848857644991213e-05,
795
+ "loss": 1.0777709484100342,
796
  "step": 220
797
  },
798
  {
799
  "epoch": 0.09367088607594937,
800
+ "grad_norm": 0.6821302771568298,
801
+ "learning_rate": 3.884007029876977e-05,
802
+ "loss": 1.10588800907135,
803
  "step": 222
804
  },
805
  {
806
  "epoch": 0.09451476793248945,
807
+ "grad_norm": 0.693175733089447,
808
+ "learning_rate": 3.919156414762742e-05,
809
+ "loss": 1.1498671770095825,
810
  "step": 224
811
  },
812
  {
813
  "epoch": 0.09535864978902954,
814
+ "grad_norm": 0.5288166403770447,
815
+ "learning_rate": 3.954305799648506e-05,
816
+ "loss": 1.0587562322616577,
817
  "step": 226
818
  },
819
  {
820
  "epoch": 0.09620253164556962,
821
+ "grad_norm": 0.6882867813110352,
822
+ "learning_rate": 3.989455184534271e-05,
823
+ "loss": 1.1107512712478638,
824
  "step": 228
825
  },
826
  {
827
  "epoch": 0.0970464135021097,
828
+ "grad_norm": 0.5834154486656189,
829
+ "learning_rate": 4.024604569420035e-05,
830
+ "loss": 1.020510196685791,
831
  "step": 230
832
  },
833
  {
834
  "epoch": 0.09789029535864979,
835
+ "grad_norm": 0.7157064080238342,
836
+ "learning_rate": 4.0597539543058e-05,
837
+ "loss": 1.0642449855804443,
838
  "step": 232
839
  },
840
  {
841
  "epoch": 0.09873417721518987,
842
+ "grad_norm": 0.6530708074569702,
843
+ "learning_rate": 4.094903339191564e-05,
844
+ "loss": 1.0359872579574585,
845
  "step": 234
846
  },
847
  {
848
  "epoch": 0.09957805907172995,
849
+ "grad_norm": 0.6329686045646667,
850
+ "learning_rate": 4.130052724077329e-05,
851
+ "loss": 1.050504446029663,
852
  "step": 236
853
  },
854
  {
855
  "epoch": 0.10042194092827005,
856
+ "grad_norm": 0.6597026586532593,
857
+ "learning_rate": 4.165202108963093e-05,
858
+ "loss": 1.2621175050735474,
859
  "step": 238
860
  },
861
  {
862
  "epoch": 0.10126582278481013,
863
+ "grad_norm": 0.6195225119590759,
864
+ "learning_rate": 4.200351493848858e-05,
865
+ "loss": 1.1218310594558716,
866
  "step": 240
867
  },
868
  {
869
  "epoch": 0.1021097046413502,
870
+ "grad_norm": 0.6764137744903564,
871
+ "learning_rate": 4.235500878734622e-05,
872
+ "loss": 1.1250728368759155,
873
  "step": 242
874
  },
875
  {
876
  "epoch": 0.1029535864978903,
877
+ "grad_norm": 0.552363395690918,
878
+ "learning_rate": 4.270650263620387e-05,
879
+ "loss": 1.028212308883667,
880
  "step": 244
881
  },
882
  {
883
  "epoch": 0.10379746835443038,
884
+ "grad_norm": 0.5620495676994324,
885
+ "learning_rate": 4.305799648506151e-05,
886
+ "loss": 1.0425450801849365,
887
  "step": 246
888
  },
889
  {
890
  "epoch": 0.10464135021097046,
891
+ "grad_norm": 0.6860032081604004,
892
+ "learning_rate": 4.3409490333919156e-05,
893
+ "loss": 1.144278883934021,
894
  "step": 248
895
  },
896
  {
897
  "epoch": 0.10548523206751055,
898
+ "grad_norm": 0.6033259034156799,
899
+ "learning_rate": 4.37609841827768e-05,
900
+ "loss": 1.1223982572555542,
901
  "step": 250
902
  },
903
  {
904
  "epoch": 0.10632911392405063,
905
+ "grad_norm": 0.6292146444320679,
906
+ "learning_rate": 4.4112478031634446e-05,
907
+ "loss": 1.1609960794448853,
908
  "step": 252
909
  },
910
  {
911
  "epoch": 0.10717299578059072,
912
+ "grad_norm": 0.7982883453369141,
913
+ "learning_rate": 4.44639718804921e-05,
914
+ "loss": 1.063547968864441,
915
  "step": 254
916
  },
917
  {
918
  "epoch": 0.1080168776371308,
919
+ "grad_norm": 0.7719110250473022,
920
+ "learning_rate": 4.4815465729349736e-05,
921
+ "loss": 1.0719804763793945,
922
  "step": 256
923
  },
924
  {
925
  "epoch": 0.10886075949367088,
926
+ "grad_norm": 0.6101011633872986,
927
+ "learning_rate": 4.516695957820739e-05,
928
+ "loss": 1.0778400897979736,
929
  "step": 258
930
  },
931
  {
932
  "epoch": 0.10970464135021098,
933
+ "grad_norm": 0.7300994396209717,
934
+ "learning_rate": 4.5518453427065026e-05,
935
+ "loss": 1.2129558324813843,
936
  "step": 260
937
  },
938
  {
939
  "epoch": 0.11054852320675106,
940
+ "grad_norm": 0.8348747491836548,
941
+ "learning_rate": 4.586994727592268e-05,
942
+ "loss": 1.221714735031128,
943
  "step": 262
944
  },
945
  {
946
  "epoch": 0.11139240506329114,
947
+ "grad_norm": 0.5445612072944641,
948
+ "learning_rate": 4.6221441124780316e-05,
949
+ "loss": 1.0187978744506836,
950
  "step": 264
951
  },
952
  {
953
  "epoch": 0.11223628691983123,
954
+ "grad_norm": 0.6230319738388062,
955
+ "learning_rate": 4.657293497363797e-05,
956
+ "loss": 1.096561312675476,
957
  "step": 266
958
  },
959
  {
960
  "epoch": 0.11308016877637131,
961
+ "grad_norm": 0.6231237649917603,
962
+ "learning_rate": 4.6924428822495606e-05,
963
+ "loss": 1.089842438697815,
964
  "step": 268
965
  },
966
  {
967
  "epoch": 0.11392405063291139,
968
+ "grad_norm": 0.7178627252578735,
969
+ "learning_rate": 4.727592267135325e-05,
970
+ "loss": 1.0696645975112915,
971
  "step": 270
972
  },
973
  {
974
  "epoch": 0.11476793248945148,
975
+ "grad_norm": 0.6895854473114014,
976
+ "learning_rate": 4.7627416520210896e-05,
977
+ "loss": 1.0511361360549927,
978
  "step": 272
979
  },
980
  {
981
  "epoch": 0.11561181434599156,
982
+ "grad_norm": 0.6046878695487976,
983
+ "learning_rate": 4.797891036906854e-05,
984
+ "loss": 1.1373958587646484,
985
  "step": 274
986
  },
987
  {
988
  "epoch": 0.11645569620253164,
989
+ "grad_norm": 0.6524552702903748,
990
+ "learning_rate": 4.833040421792619e-05,
991
+ "loss": 1.0734186172485352,
992
  "step": 276
993
  },
994
  {
995
  "epoch": 0.11729957805907174,
996
+ "grad_norm": 0.6331019997596741,
997
+ "learning_rate": 4.868189806678383e-05,
998
+ "loss": 1.123913049697876,
999
  "step": 278
1000
  },
1001
  {
1002
  "epoch": 0.11814345991561181,
1003
+ "grad_norm": 0.5919018983840942,
1004
+ "learning_rate": 4.903339191564148e-05,
1005
+ "loss": 1.0635710954666138,
1006
  "step": 280
1007
  },
1008
  {
1009
  "epoch": 0.1189873417721519,
1010
+ "grad_norm": 0.6067633032798767,
1011
+ "learning_rate": 4.938488576449912e-05,
1012
+ "loss": 1.0429247617721558,
1013
  "step": 282
1014
  },
1015
  {
1016
  "epoch": 0.11983122362869199,
1017
+ "grad_norm": 0.6583750247955322,
1018
+ "learning_rate": 4.9736379613356774e-05,
1019
+ "loss": 1.1397464275360107,
1020
  "step": 284
1021
  },
1022
  {
1023
  "epoch": 0.12067510548523207,
1024
+ "grad_norm": 0.6200069785118103,
1025
+ "learning_rate": 5.008787346221442e-05,
1026
+ "loss": 1.0590803623199463,
1027
  "step": 286
1028
  },
1029
  {
1030
  "epoch": 0.12151898734177215,
1031
+ "grad_norm": 0.6798665523529053,
1032
+ "learning_rate": 5.0439367311072064e-05,
1033
+ "loss": 1.1318789720535278,
1034
  "step": 288
1035
  },
1036
  {
1037
  "epoch": 0.12236286919831224,
1038
+ "grad_norm": 0.7508794069290161,
1039
+ "learning_rate": 5.07908611599297e-05,
1040
+ "loss": 1.0934956073760986,
1041
  "step": 290
1042
  },
1043
  {
1044
  "epoch": 0.12320675105485232,
1045
+ "grad_norm": 0.6901452541351318,
1046
+ "learning_rate": 5.114235500878735e-05,
1047
+ "loss": 1.163407802581787,
1048
  "step": 292
1049
  },
1050
  {
1051
  "epoch": 0.1240506329113924,
1052
+ "grad_norm": 0.6423285603523254,
1053
+ "learning_rate": 5.1493848857644985e-05,
1054
+ "loss": 1.09059476852417,
1055
  "step": 294
1056
  },
1057
  {
1058
  "epoch": 0.1248945147679325,
1059
+ "grad_norm": 0.6839275360107422,
1060
+ "learning_rate": 5.1845342706502644e-05,
1061
+ "loss": 1.0690211057662964,
1062
  "step": 296
1063
  },
1064
  {
1065
  "epoch": 0.1257383966244726,
1066
+ "grad_norm": 0.6350128054618835,
1067
+ "learning_rate": 5.219683655536028e-05,
1068
+ "loss": 0.982322096824646,
1069
  "step": 298
1070
  },
1071
  {
1072
  "epoch": 0.12658227848101267,
1073
+ "grad_norm": 0.7136530876159668,
1074
+ "learning_rate": 5.254833040421793e-05,
1075
+ "loss": 1.1132930517196655,
1076
  "step": 300
1077
  },
1078
  {
1079
  "epoch": 0.12658227848101267,
1080
+ "eval_loss": 1.0952109098434448,
1081
+ "eval_runtime": 677.0652,
1082
+ "eval_samples_per_second": 3.112,
1083
+ "eval_steps_per_second": 3.112,
1084
  "step": 300
1085
  },
1086
  {
1087
  "epoch": 0.12742616033755275,
1088
+ "grad_norm": 0.7339721322059631,
1089
+ "learning_rate": 5.289982425307557e-05,
1090
+ "loss": 0.973595917224884,
1091
  "step": 302
1092
  },
1093
  {
1094
  "epoch": 0.12827004219409283,
1095
+ "grad_norm": 0.5941481590270996,
1096
+ "learning_rate": 5.3251318101933224e-05,
1097
+ "loss": 0.9819849729537964,
1098
  "step": 304
1099
  },
1100
  {
1101
  "epoch": 0.1291139240506329,
1102
+ "grad_norm": 0.7153938412666321,
1103
+ "learning_rate": 5.360281195079086e-05,
1104
+ "loss": 1.0315470695495605,
1105
  "step": 306
1106
  },
1107
  {
1108
  "epoch": 0.12995780590717299,
1109
+ "grad_norm": 0.5167180299758911,
1110
+ "learning_rate": 5.395430579964851e-05,
1111
+ "loss": 0.9492001533508301,
1112
  "step": 308
1113
  },
1114
  {
1115
  "epoch": 0.1308016877637131,
1116
+ "grad_norm": 0.6055944561958313,
1117
+ "learning_rate": 5.430579964850615e-05,
1118
+ "loss": 1.0156209468841553,
1119
  "step": 310
1120
  },
1121
  {
1122
  "epoch": 0.13164556962025317,
1123
+ "grad_norm": 0.7662386298179626,
1124
+ "learning_rate": 5.4657293497363805e-05,
1125
+ "loss": 1.1791651248931885,
1126
  "step": 312
1127
  },
1128
  {
1129
  "epoch": 0.13248945147679325,
1130
+ "grad_norm": 0.6065546274185181,
1131
+ "learning_rate": 5.500878734622145e-05,
1132
+ "loss": 1.0009297132492065,
1133
  "step": 314
1134
  },
1135
  {
1136
  "epoch": 0.13333333333333333,
1137
+ "grad_norm": 0.604225754737854,
1138
+ "learning_rate": 5.536028119507909e-05,
1139
+ "loss": 1.0208244323730469,
1140
  "step": 316
1141
  },
1142
  {
1143
  "epoch": 0.1341772151898734,
1144
+ "grad_norm": 0.6186763048171997,
1145
+ "learning_rate": 5.571177504393673e-05,
1146
+ "loss": 0.9968416690826416,
1147
  "step": 318
1148
  },
1149
  {
1150
  "epoch": 0.1350210970464135,
1151
+ "grad_norm": 0.7100363969802856,
1152
+ "learning_rate": 5.606326889279437e-05,
1153
+ "loss": 0.9540256857872009,
1154
  "step": 320
1155
  },
1156
  {
1157
  "epoch": 0.1358649789029536,
1158
+ "grad_norm": 0.6979711055755615,
1159
+ "learning_rate": 5.641476274165203e-05,
1160
+ "loss": 1.0631953477859497,
1161
  "step": 322
1162
  },
1163
  {
1164
  "epoch": 0.13670886075949368,
1165
+ "grad_norm": 0.6237109303474426,
1166
+ "learning_rate": 5.676625659050967e-05,
1167
+ "loss": 1.0170501470565796,
1168
  "step": 324
1169
  },
1170
  {
1171
  "epoch": 0.13755274261603376,
1172
+ "grad_norm": 0.6525548696517944,
1173
+ "learning_rate": 5.711775043936731e-05,
1174
+ "loss": 1.0715603828430176,
1175
  "step": 326
1176
  },
1177
  {
1178
  "epoch": 0.13839662447257384,
1179
+ "grad_norm": 0.6869221329689026,
1180
+ "learning_rate": 5.746924428822495e-05,
1181
+ "loss": 1.0111541748046875,
1182
  "step": 328
1183
  },
1184
  {
1185
  "epoch": 0.13924050632911392,
1186
+ "grad_norm": 0.553188145160675,
1187
+ "learning_rate": 5.782073813708261e-05,
1188
+ "loss": 1.0311682224273682,
1189
  "step": 330
1190
  },
1191
  {
1192
  "epoch": 0.140084388185654,
1193
+ "grad_norm": 0.6760852932929993,
1194
+ "learning_rate": 5.817223198594025e-05,
1195
+ "loss": 1.0213634967803955,
1196
  "step": 332
1197
  },
1198
  {
1199
  "epoch": 0.1409282700421941,
1200
+ "grad_norm": 0.5907419919967651,
1201
+ "learning_rate": 5.8523725834797894e-05,
1202
+ "loss": 0.9748594164848328,
1203
  "step": 334
1204
  },
1205
  {
1206
  "epoch": 0.14177215189873418,
1207
+ "grad_norm": 0.7044920921325684,
1208
+ "learning_rate": 5.887521968365554e-05,
1209
+ "loss": 1.05863356590271,
1210
  "step": 336
1211
  },
1212
  {
1213
  "epoch": 0.14261603375527426,
1214
+ "grad_norm": 0.679073691368103,
1215
+ "learning_rate": 5.922671353251318e-05,
1216
+ "loss": 1.1341127157211304,
1217
  "step": 338
1218
  },
1219
  {
1220
  "epoch": 0.14345991561181434,
1221
+ "grad_norm": 0.7676237225532532,
1222
+ "learning_rate": 5.957820738137083e-05,
1223
+ "loss": 0.9540836215019226,
1224
  "step": 340
1225
  },
1226
  {
1227
  "epoch": 0.14430379746835442,
1228
+ "grad_norm": 0.6313899755477905,
1229
+ "learning_rate": 5.9929701230228474e-05,
1230
+ "loss": 1.0585911273956299,
1231
  "step": 342
1232
  },
1233
  {
1234
  "epoch": 0.1451476793248945,
1235
+ "grad_norm": 0.7123099565505981,
1236
+ "learning_rate": 6.028119507908612e-05,
1237
+ "loss": 1.0760118961334229,
1238
  "step": 344
1239
  },
1240
  {
1241
  "epoch": 0.1459915611814346,
1242
+ "grad_norm": 0.585935652256012,
1243
+ "learning_rate": 6.063268892794376e-05,
1244
+ "loss": 1.036866307258606,
1245
  "step": 346
1246
  },
1247
  {
1248
  "epoch": 0.1468354430379747,
1249
+ "grad_norm": 0.5643263459205627,
1250
+ "learning_rate": 6.0984182776801416e-05,
1251
+ "loss": 1.0242938995361328,
1252
  "step": 348
1253
  },
1254
  {
1255
  "epoch": 0.14767932489451477,
1256
+ "grad_norm": 0.626761794090271,
1257
+ "learning_rate": 6.133567662565906e-05,
1258
+ "loss": 1.0497376918792725,
1259
  "step": 350
1260
  },
1261
  {
1262
  "epoch": 0.14852320675105485,
1263
+ "grad_norm": 0.5106956958770752,
1264
+ "learning_rate": 6.16871704745167e-05,
1265
+ "loss": 0.9811885356903076,
1266
  "step": 352
1267
  },
1268
  {
1269
  "epoch": 0.14936708860759493,
1270
+ "grad_norm": 0.6948089003562927,
1271
+ "learning_rate": 6.203866432337434e-05,
1272
+ "loss": 1.0715330839157104,
1273
  "step": 354
1274
  },
1275
  {
1276
  "epoch": 0.150210970464135,
1277
+ "grad_norm": 0.699713945388794,
1278
+ "learning_rate": 6.239015817223199e-05,
1279
+ "loss": 1.0405226945877075,
1280
  "step": 356
1281
  },
1282
  {
1283
  "epoch": 0.15105485232067511,
1284
+ "grad_norm": 0.6437667012214661,
1285
+ "learning_rate": 6.274165202108964e-05,
1286
+ "loss": 1.0490930080413818,
1287
  "step": 358
1288
  },
1289
  {
1290
  "epoch": 0.1518987341772152,
1291
+ "grad_norm": 0.6952699422836304,
1292
+ "learning_rate": 6.309314586994728e-05,
1293
+ "loss": 0.9267548322677612,
1294
  "step": 360
1295
  },
1296
  {
1297
  "epoch": 0.15274261603375527,
1298
+ "grad_norm": 0.6713186502456665,
1299
+ "learning_rate": 6.344463971880492e-05,
1300
+ "loss": 1.0427420139312744,
1301
  "step": 362
1302
  },
1303
  {
1304
  "epoch": 0.15358649789029535,
1305
+ "grad_norm": 0.6750379800796509,
1306
+ "learning_rate": 6.379613356766257e-05,
1307
+ "loss": 1.048950433731079,
1308
  "step": 364
1309
  },
1310
  {
1311
  "epoch": 0.15443037974683543,
1312
+ "grad_norm": 0.6053379774093628,
1313
+ "learning_rate": 6.414762741652022e-05,
1314
+ "loss": 1.0156004428863525,
1315
  "step": 366
1316
  },
1317
  {
1318
  "epoch": 0.15527426160337554,
1319
+ "grad_norm": 0.8063633441925049,
1320
+ "learning_rate": 6.449912126537786e-05,
1321
+ "loss": 1.0020819902420044,
1322
  "step": 368
1323
  },
1324
  {
1325
  "epoch": 0.15611814345991562,
1326
+ "grad_norm": 0.8027494549751282,
1327
+ "learning_rate": 6.48506151142355e-05,
1328
+ "loss": 1.055633783340454,
1329
  "step": 370
1330
  },
1331
  {
1332
  "epoch": 0.1569620253164557,
1333
+ "grad_norm": 0.6580121517181396,
1334
+ "learning_rate": 6.520210896309315e-05,
1335
+ "loss": 1.0149940252304077,
1336
  "step": 372
1337
  },
1338
  {
1339
  "epoch": 0.15780590717299578,
1340
+ "grad_norm": 0.6561233997344971,
1341
+ "learning_rate": 6.55536028119508e-05,
1342
+ "loss": 0.9769611954689026,
1343
  "step": 374
1344
  },
1345
  {
1346
  "epoch": 0.15864978902953586,
1347
+ "grad_norm": 0.6444346308708191,
1348
+ "learning_rate": 6.590509666080844e-05,
1349
+ "loss": 0.9099349975585938,
1350
  "step": 376
1351
  },
1352
  {
1353
  "epoch": 0.15949367088607594,
1354
+ "grad_norm": 0.5879359245300293,
1355
+ "learning_rate": 6.625659050966608e-05,
1356
+ "loss": 1.0797548294067383,
1357
  "step": 378
1358
  },
1359
  {
1360
  "epoch": 0.16033755274261605,
1361
+ "grad_norm": 0.6994144916534424,
1362
+ "learning_rate": 6.660808435852373e-05,
1363
+ "loss": 1.0336791276931763,
1364
  "step": 380
1365
  },
1366
  {
1367
  "epoch": 0.16118143459915613,
1368
+ "grad_norm": 0.6128669381141663,
1369
+ "learning_rate": 6.695957820738138e-05,
1370
+ "loss": 1.018118143081665,
1371
  "step": 382
1372
  },
1373
  {
1374
  "epoch": 0.1620253164556962,
1375
+ "grad_norm": 1.0237540006637573,
1376
+ "learning_rate": 6.731107205623902e-05,
1377
+ "loss": 1.1405497789382935,
1378
  "step": 384
1379
  },
1380
  {
1381
  "epoch": 0.16286919831223629,
1382
+ "grad_norm": 0.6091578006744385,
1383
+ "learning_rate": 6.766256590509666e-05,
1384
+ "loss": 1.0314189195632935,
1385
  "step": 386
1386
  },
1387
  {
1388
  "epoch": 0.16371308016877636,
1389
+ "grad_norm": 0.5916037559509277,
1390
+ "learning_rate": 6.801405975395431e-05,
1391
+ "loss": 0.9564052820205688,
1392
  "step": 388
1393
  },
1394
  {
1395
  "epoch": 0.16455696202531644,
1396
+ "grad_norm": 0.771653950214386,
1397
+ "learning_rate": 6.836555360281195e-05,
1398
+ "loss": 1.0023859739303589,
1399
  "step": 390
1400
  },
1401
  {
1402
  "epoch": 0.16540084388185655,
1403
+ "grad_norm": 0.654658317565918,
1404
+ "learning_rate": 6.87170474516696e-05,
1405
+ "loss": 1.07024085521698,
1406
  "step": 392
1407
  },
1408
  {
1409
  "epoch": 0.16624472573839663,
1410
+ "grad_norm": 0.6611968874931335,
1411
+ "learning_rate": 6.906854130052724e-05,
1412
+ "loss": 1.0552500486373901,
1413
  "step": 394
1414
  },
1415
  {
1416
  "epoch": 0.1670886075949367,
1417
+ "grad_norm": 0.6955893039703369,
1418
+ "learning_rate": 6.942003514938489e-05,
1419
+ "loss": 1.0562875270843506,
1420
  "step": 396
1421
  },
1422
  {
1423
  "epoch": 0.1679324894514768,
1424
+ "grad_norm": 0.6666058301925659,
1425
+ "learning_rate": 6.977152899824253e-05,
1426
+ "loss": 0.9850592017173767,
1427
  "step": 398
1428
  },
1429
  {
1430
  "epoch": 0.16877637130801687,
1431
+ "grad_norm": 0.6131711006164551,
1432
+ "learning_rate": 7.012302284710018e-05,
1433
+ "loss": 1.0077755451202393,
1434
  "step": 400
1435
  },
1436
  {
1437
  "epoch": 0.16877637130801687,
1438
+ "eval_loss": 1.0625108480453491,
1439
+ "eval_runtime": 691.0068,
1440
+ "eval_samples_per_second": 3.049,
1441
+ "eval_steps_per_second": 3.049,
1442
  "step": 400
1443
  },
1444
  {
1445
  "epoch": 0.16962025316455695,
1446
+ "grad_norm": 0.6286499500274658,
1447
+ "learning_rate": 7.047451669595783e-05,
1448
+ "loss": 1.1012427806854248,
1449
  "step": 402
1450
  },
1451
  {
1452
  "epoch": 0.17046413502109706,
1453
+ "grad_norm": 0.6639351844787598,
1454
+ "learning_rate": 7.082601054481547e-05,
1455
+ "loss": 1.0379719734191895,
1456
  "step": 404
1457
  },
1458
  {
1459
  "epoch": 0.17130801687763714,
1460
+ "grad_norm": 0.750401496887207,
1461
+ "learning_rate": 7.117750439367311e-05,
1462
+ "loss": 1.031856656074524,
1463
  "step": 406
1464
  },
1465
  {
1466
  "epoch": 0.17215189873417722,
1467
+ "grad_norm": 0.8084847331047058,
1468
+ "learning_rate": 7.152899824253075e-05,
1469
+ "loss": 1.0493193864822388,
1470
  "step": 408
1471
  },
1472
  {
1473
  "epoch": 0.1729957805907173,
1474
+ "grad_norm": 0.7448462247848511,
1475
+ "learning_rate": 7.188049209138841e-05,
1476
+ "loss": 1.1012418270111084,
1477
  "step": 410
1478
  },
1479
  {
1480
  "epoch": 0.17383966244725738,
1481
+ "grad_norm": 0.5841867923736572,
1482
+ "learning_rate": 7.223198594024605e-05,
1483
+ "loss": 0.9926692247390747,
1484
  "step": 412
1485
  },
1486
  {
1487
  "epoch": 0.17468354430379746,
1488
+ "grad_norm": 0.7125606536865234,
1489
+ "learning_rate": 7.258347978910369e-05,
1490
+ "loss": 1.0588877201080322,
1491
  "step": 414
1492
  },
1493
  {
1494
  "epoch": 0.17552742616033756,
1495
+ "grad_norm": 0.5750942230224609,
1496
+ "learning_rate": 7.293497363796134e-05,
1497
+ "loss": 1.038270354270935,
1498
  "step": 416
1499
  },
1500
  {
1501
  "epoch": 0.17637130801687764,
1502
+ "grad_norm": 0.565444827079773,
1503
+ "learning_rate": 7.328646748681899e-05,
1504
+ "loss": 0.9843021035194397,
1505
  "step": 418
1506
  },
1507
  {
1508
  "epoch": 0.17721518987341772,
1509
+ "grad_norm": 0.5825693011283875,
1510
+ "learning_rate": 7.363796133567663e-05,
1511
+ "loss": 1.0731632709503174,
1512
  "step": 420
1513
  },
1514
  {
1515
  "epoch": 0.1780590717299578,
1516
+ "grad_norm": 0.6267391443252563,
1517
+ "learning_rate": 7.398945518453427e-05,
1518
+ "loss": 1.0061273574829102,
1519
  "step": 422
1520
  },
1521
  {
1522
  "epoch": 0.17890295358649788,
1523
+ "grad_norm": 0.6621372103691101,
1524
+ "learning_rate": 7.434094903339192e-05,
1525
+ "loss": 1.0461612939834595,
1526
  "step": 424
1527
  },
1528
  {
1529
  "epoch": 0.17974683544303796,
1530
+ "grad_norm": 0.6635435223579407,
1531
+ "learning_rate": 7.469244288224957e-05,
1532
+ "loss": 0.9789207577705383,
1533
  "step": 426
1534
  },
1535
  {
1536
  "epoch": 0.18059071729957807,
1537
+ "grad_norm": 0.6342346668243408,
1538
+ "learning_rate": 7.504393673110721e-05,
1539
+ "loss": 1.0527069568634033,
1540
  "step": 428
1541
  },
1542
  {
1543
  "epoch": 0.18143459915611815,
1544
+ "grad_norm": 0.6762149930000305,
1545
+ "learning_rate": 7.539543057996485e-05,
1546
+ "loss": 0.9708702564239502,
1547
  "step": 430
1548
  },
1549
  {
1550
  "epoch": 0.18227848101265823,
1551
+ "grad_norm": 0.7073282599449158,
1552
+ "learning_rate": 7.57469244288225e-05,
1553
+ "loss": 1.0509834289550781,
1554
  "step": 432
1555
  },
1556
  {
1557
  "epoch": 0.1831223628691983,
1558
+ "grad_norm": 0.6917856931686401,
1559
+ "learning_rate": 7.609841827768014e-05,
1560
+ "loss": 1.0128819942474365,
1561
  "step": 434
1562
  },
1563
  {
1564
  "epoch": 0.1839662447257384,
1565
+ "grad_norm": 0.5574942231178284,
1566
+ "learning_rate": 7.644991212653779e-05,
1567
+ "loss": 0.989395797252655,
1568
  "step": 436
1569
  },
1570
  {
1571
  "epoch": 0.1848101265822785,
1572
+ "grad_norm": 0.640765905380249,
1573
+ "learning_rate": 7.680140597539543e-05,
1574
+ "loss": 0.9846042990684509,
1575
  "step": 438
1576
  },
1577
  {
1578
  "epoch": 0.18565400843881857,
1579
+ "grad_norm": 0.6699127554893494,
1580
+ "learning_rate": 7.715289982425308e-05,
1581
+ "loss": 1.0344442129135132,
1582
  "step": 440
1583
  },
1584
  {
1585
  "epoch": 0.18649789029535865,
1586
+ "grad_norm": 0.6164930462837219,
1587
+ "learning_rate": 7.750439367311072e-05,
1588
+ "loss": 1.0179373025894165,
1589
  "step": 442
1590
  },
1591
  {
1592
  "epoch": 0.18734177215189873,
1593
+ "grad_norm": 0.6880720853805542,
1594
+ "learning_rate": 7.785588752196837e-05,
1595
+ "loss": 1.0518895387649536,
1596
  "step": 444
1597
  },
1598
  {
1599
  "epoch": 0.1881856540084388,
1600
+ "grad_norm": 0.6501413583755493,
1601
+ "learning_rate": 7.820738137082601e-05,
1602
+ "loss": 1.0442606210708618,
1603
  "step": 446
1604
  },
1605
  {
1606
  "epoch": 0.1890295358649789,
1607
+ "grad_norm": 0.6076085567474365,
1608
+ "learning_rate": 7.855887521968366e-05,
1609
+ "loss": 0.9828442335128784,
1610
  "step": 448
1611
  },
1612
  {
1613
  "epoch": 0.189873417721519,
1614
+ "grad_norm": 0.6418202519416809,
1615
+ "learning_rate": 7.89103690685413e-05,
1616
+ "loss": 1.0573710203170776,
1617
  "step": 450
1618
  },
1619
  {
1620
  "epoch": 0.19071729957805908,
1621
+ "grad_norm": 0.7055076360702515,
1622
+ "learning_rate": 7.926186291739895e-05,
1623
+ "loss": 1.0216103792190552,
1624
  "step": 452
1625
  },
1626
  {
1627
  "epoch": 0.19156118143459916,
1628
+ "grad_norm": 0.5668330192565918,
1629
+ "learning_rate": 7.961335676625659e-05,
1630
+ "loss": 0.9837722778320312,
1631
  "step": 454
1632
  },
1633
  {
1634
  "epoch": 0.19240506329113924,
1635
+ "grad_norm": 0.6419380307197571,
1636
+ "learning_rate": 7.996485061511424e-05,
1637
+ "loss": 1.0003894567489624,
1638
  "step": 456
1639
  },
1640
  {
1641
  "epoch": 0.19324894514767932,
1642
+ "grad_norm": 0.5949198007583618,
1643
+ "learning_rate": 8.031634446397188e-05,
1644
+ "loss": 1.0609031915664673,
1645
  "step": 458
1646
  },
1647
  {
1648
  "epoch": 0.1940928270042194,
1649
+ "grad_norm": 0.7032039761543274,
1650
+ "learning_rate": 8.066783831282952e-05,
1651
+ "loss": 1.0543403625488281,
1652
  "step": 460
1653
  },
1654
  {
1655
  "epoch": 0.1949367088607595,
1656
+ "grad_norm": 0.5775868892669678,
1657
+ "learning_rate": 8.101933216168718e-05,
1658
+ "loss": 0.9819303154945374,
1659
  "step": 462
1660
  },
1661
  {
1662
  "epoch": 0.19578059071729959,
1663
+ "grad_norm": 0.9301062226295471,
1664
+ "learning_rate": 8.137082601054482e-05,
1665
+ "loss": 1.0542067289352417,
1666
  "step": 464
1667
  },
1668
  {
1669
  "epoch": 0.19662447257383966,
1670
+ "grad_norm": 0.6193217039108276,
1671
+ "learning_rate": 8.172231985940246e-05,
1672
+ "loss": 0.9966341257095337,
1673
  "step": 466
1674
  },
1675
  {
1676
  "epoch": 0.19746835443037974,
1677
+ "grad_norm": 0.6286146640777588,
1678
+ "learning_rate": 8.20738137082601e-05,
1679
+ "loss": 1.0474121570587158,
1680
  "step": 468
1681
  },
1682
  {
1683
  "epoch": 0.19831223628691982,
1684
+ "grad_norm": 0.7418972253799438,
1685
+ "learning_rate": 8.242530755711776e-05,
1686
+ "loss": 0.9549239277839661,
1687
  "step": 470
1688
  },
1689
  {
1690
  "epoch": 0.1991561181434599,
1691
+ "grad_norm": 0.6122808456420898,
1692
+ "learning_rate": 8.27768014059754e-05,
1693
+ "loss": 1.0191338062286377,
1694
  "step": 472
1695
  },
1696
  {
1697
  "epoch": 0.2,
1698
+ "grad_norm": 0.6375362277030945,
1699
+ "learning_rate": 8.312829525483304e-05,
1700
+ "loss": 1.0987539291381836,
1701
  "step": 474
1702
  },
1703
  {
1704
  "epoch": 0.2008438818565401,
1705
+ "grad_norm": 0.6459513306617737,
1706
+ "learning_rate": 8.347978910369068e-05,
1707
+ "loss": 1.0369136333465576,
1708
  "step": 476
1709
  },
1710
  {
1711
  "epoch": 0.20168776371308017,
1712
+ "grad_norm": 0.7029640674591064,
1713
+ "learning_rate": 8.383128295254833e-05,
1714
+ "loss": 1.0582096576690674,
1715
  "step": 478
1716
  },
1717
  {
1718
  "epoch": 0.20253164556962025,
1719
+ "grad_norm": 0.6345387697219849,
1720
+ "learning_rate": 8.418277680140598e-05,
1721
+ "loss": 1.022916316986084,
1722
  "step": 480
1723
  },
1724
  {
1725
  "epoch": 0.20337552742616033,
1726
+ "grad_norm": 0.5764590501785278,
1727
+ "learning_rate": 8.453427065026362e-05,
1728
+ "loss": 0.973024308681488,
1729
  "step": 482
1730
  },
1731
  {
1732
  "epoch": 0.2042194092827004,
1733
+ "grad_norm": 0.5884482860565186,
1734
+ "learning_rate": 8.488576449912127e-05,
1735
+ "loss": 1.0292812585830688,
1736
  "step": 484
1737
  },
1738
  {
1739
  "epoch": 0.20506329113924052,
1740
+ "grad_norm": 0.616357147693634,
1741
+ "learning_rate": 8.523725834797891e-05,
1742
+ "loss": 1.0083447694778442,
1743
  "step": 486
1744
  },
1745
  {
1746
  "epoch": 0.2059071729957806,
1747
+ "grad_norm": 0.7671196460723877,
1748
+ "learning_rate": 8.558875219683656e-05,
1749
+ "loss": 0.9936985373497009,
1750
  "step": 488
1751
  },
1752
  {
1753
  "epoch": 0.20675105485232068,
1754
+ "grad_norm": 0.6197299957275391,
1755
+ "learning_rate": 8.59402460456942e-05,
1756
+ "loss": 1.051513910293579,
1757
  "step": 490
1758
  },
1759
  {
1760
  "epoch": 0.20759493670886076,
1761
+ "grad_norm": 0.6912890672683716,
1762
+ "learning_rate": 8.629173989455185e-05,
1763
+ "loss": 0.9474978446960449,
1764
  "step": 492
1765
  },
1766
  {
1767
  "epoch": 0.20843881856540084,
1768
+ "grad_norm": 0.6941592693328857,
1769
+ "learning_rate": 8.664323374340949e-05,
1770
+ "loss": 1.0671660900115967,
1771
  "step": 494
1772
  },
1773
  {
1774
  "epoch": 0.20928270042194091,
1775
+ "grad_norm": 0.5889528393745422,
1776
+ "learning_rate": 8.699472759226714e-05,
1777
+ "loss": 1.0020159482955933,
1778
  "step": 496
1779
  },
1780
  {
1781
  "epoch": 0.21012658227848102,
1782
+ "grad_norm": 0.6478549838066101,
1783
+ "learning_rate": 8.734622144112478e-05,
1784
+ "loss": 1.0165860652923584,
1785
  "step": 498
1786
  },
1787
  {
1788
  "epoch": 0.2109704641350211,
1789
+ "grad_norm": 0.6018255949020386,
1790
+ "learning_rate": 8.769771528998243e-05,
1791
+ "loss": 0.8798263072967529,
1792
  "step": 500
1793
  },
1794
  {
1795
  "epoch": 0.2109704641350211,
1796
+ "eval_loss": 1.042096495628357,
1797
+ "eval_runtime": 692.4361,
1798
+ "eval_samples_per_second": 3.043,
1799
+ "eval_steps_per_second": 3.043,
1800
  "step": 500
1801
  }
1802
  ],
 
1826
  "attributes": {}
1827
  }
1828
  },
1829
+ "total_flos": 2.9886635097296486e+17,
1830
  "train_batch_size": 1,
1831
  "trial_name": null,
1832
  "trial_params": null
checkpoints/checkpoint-5000/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: Models/Devstral-Small-2-24B-HS-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT
7
  - lora
8
  - transformers
9
  ---
 
1
  ---
2
+ base_model: Models/Qwen2.5-Coder-14B-CPT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:Models/Qwen2.5-Coder-14B-CPT
7
  - lora
8
  - transformers
9
  ---
checkpoints/checkpoint-5000/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -16,7 +16,7 @@
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
- "lora_alpha": 16,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
  "o_proj",
35
- "k_proj"
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "Models/Qwen2.5-Coder-14B-CPT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 64,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 32,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "o_proj",
33
+ "v_proj",
34
+ "k_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoints/checkpoint-5000/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
config_resolved.yaml CHANGED
@@ -1,5 +1,5 @@
1
  run:
2
- run_dir: ./task2file/sft_devstral_24B_v2
3
  seed: 42
4
  wandb:
5
  enabled: true
@@ -8,10 +8,10 @@ wandb:
8
  name: null
9
  tags:
10
  - sft-lora
11
- - 24b-Devstral
12
  notes: null
13
  model:
14
- repo_id: ./Models/Devstral-Small-2-24B-HS-CPT
15
  revision: null
16
  base_local_dir: base_model
17
  trust_remote_code: true
@@ -64,8 +64,8 @@ data:
64
  num_proc: 4
65
  peft:
66
  enabled: true
67
- r: 8
68
- lora_alpha: 16
69
  lora_dropout: 0.05
70
  bias: none
71
  target_modules: auto
@@ -74,12 +74,12 @@ train:
74
  per_device_train_batch_size: 1
75
  per_device_eval_batch_size: 1
76
  gradient_accumulation_steps: 8
77
- learning_rate: 1e-4
78
  weight_decay: 0.0
79
  warmup_ratio: 0.08
80
  lr_scheduler_type: cosine
81
  optim: adamw_torch
82
- max_grad_norm: 0.8
83
  gradient_checkpointing: true
84
  logging_steps: 2
85
  save_strategy: steps
@@ -99,4 +99,4 @@ merge:
99
  enabled: true
100
  merged_dtype: float16
101
  max_shard_size: 2GB
102
- output_dir: ./Models/Devstral-Small-2-24B-HS-CPT-SFT_v2
 
1
  run:
2
+ run_dir: ./task2file/sft_qwen_14B_v2
3
  seed: 42
4
  wandb:
5
  enabled: true
 
8
  name: null
9
  tags:
10
  - sft-lora
11
+ - instruction-tuning
12
  notes: null
13
  model:
14
+ repo_id: ./Models/Qwen2.5-Coder-14B-CPT
15
  revision: null
16
  base_local_dir: base_model
17
  trust_remote_code: true
 
64
  num_proc: 4
65
  peft:
66
  enabled: true
67
+ r: 32
68
+ lora_alpha: 64
69
  lora_dropout: 0.05
70
  bias: none
71
  target_modules: auto
 
74
  per_device_train_batch_size: 1
75
  per_device_eval_batch_size: 1
76
  gradient_accumulation_steps: 8
77
+ learning_rate: 2e-4
78
  weight_decay: 0.0
79
  warmup_ratio: 0.08
80
  lr_scheduler_type: cosine
81
  optim: adamw_torch
82
+ max_grad_norm: 1.0
83
  gradient_checkpointing: true
84
  logging_steps: 2
85
  save_strategy: steps
 
99
  enabled: true
100
  merged_dtype: float16
101
  max_shard_size: 2GB
102
+ output_dir: ./Models/Qwen2.5-Coder-14B-CPT-SFT_v2
eval_final.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "eval_loss": 0.6706293225288391,
3
- "eval_runtime": 511.6513,
4
- "eval_samples_per_second": 4.118,
5
- "eval_steps_per_second": 4.118,
6
- "epoch": 3.2067510548523206,
7
- "perplexity": 1.955467553274469
8
  }
 
1
  {
2
+ "eval_loss": 0.7600579857826233,
3
+ "eval_runtime": 674.048,
4
+ "eval_samples_per_second": 3.126,
5
+ "eval_steps_per_second": 3.126,
6
+ "epoch": 2.151898734177215,
7
+ "perplexity": 2.138400213711816
8
  }
logs/eval.jsonl CHANGED
@@ -1,77 +1,52 @@
1
- {"ts": "2025-12-26T18:34:59", "event": "eval", "step": 100, "epoch": 0.04219409282700422, "eval_loss": 1.138856053352356, "eval_runtime": 859.7128, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451, "perplexity": 3.1231935540832674}
2
- {"ts": "2025-12-26T19:05:22", "event": "eval", "step": 200, "epoch": 0.08438818565400844, "eval_loss": 0.995743453502655, "eval_runtime": 846.8257, "eval_samples_per_second": 2.488, "eval_steps_per_second": 2.488, "perplexity": 2.7067359257317922}
3
- {"ts": "2025-12-26T19:35:57", "event": "eval", "step": 300, "epoch": 0.12658227848101267, "eval_loss": 0.9517185688018799, "eval_runtime": 860.0287, "eval_samples_per_second": 2.45, "eval_steps_per_second": 2.45, "perplexity": 2.5901571998746475}
4
- {"ts": "2025-12-26T20:06:52", "event": "eval", "step": 400, "epoch": 0.16877637130801687, "eval_loss": 0.9282881617546082, "eval_runtime": 869.6867, "eval_samples_per_second": 2.423, "eval_steps_per_second": 2.423, "perplexity": 2.5301742193066197}
5
- {"ts": "2025-12-26T20:37:22", "event": "eval", "step": 500, "epoch": 0.2109704641350211, "eval_loss": 0.9080732464790344, "eval_runtime": 857.0753, "eval_samples_per_second": 2.458, "eval_steps_per_second": 2.458, "perplexity": 2.4795404646097325}
6
- {"ts": "2025-12-26T21:07:55", "event": "eval", "step": 600, "epoch": 0.25316455696202533, "eval_loss": 0.8903881311416626, "eval_runtime": 845.9969, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491, "perplexity": 2.4360749843862655}
7
- {"ts": "2025-12-26T21:38:29", "event": "eval", "step": 700, "epoch": 0.29535864978902954, "eval_loss": 0.8730722069740295, "eval_runtime": 858.184, "eval_samples_per_second": 2.455, "eval_steps_per_second": 2.455, "perplexity": 2.3942552136153896}
8
- {"ts": "2025-12-26T22:09:04", "event": "eval", "step": 800, "epoch": 0.33755274261603374, "eval_loss": 0.8635594248771667, "eval_runtime": 865.9348, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "perplexity": 2.371587174483758}
9
- {"ts": "2025-12-26T22:39:42", "event": "eval", "step": 900, "epoch": 0.379746835443038, "eval_loss": 0.8491304516792297, "eval_runtime": 852.6211, "eval_samples_per_second": 2.471, "eval_steps_per_second": 2.471, "perplexity": 2.3376133001985813}
10
- {"ts": "2025-12-26T23:10:19", "event": "eval", "step": 1000, "epoch": 0.4219409282700422, "eval_loss": 0.8388314247131348, "eval_runtime": 847.4828, "eval_samples_per_second": 2.486, "eval_steps_per_second": 2.486, "perplexity": 2.3136617085393727}
11
- {"ts": "2025-12-26T23:41:01", "event": "eval", "step": 1100, "epoch": 0.4641350210970464, "eval_loss": 0.8283821940422058, "eval_runtime": 861.0464, "eval_samples_per_second": 2.447, "eval_steps_per_second": 2.447, "perplexity": 2.2896115950724094}
12
- {"ts": "2025-12-27T00:11:32", "event": "eval", "step": 1200, "epoch": 0.5063291139240507, "eval_loss": 0.8186545968055725, "eval_runtime": 862.1638, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444, "perplexity": 2.267447153803737}
13
- {"ts": "2025-12-27T00:42:14", "event": "eval", "step": 1300, "epoch": 0.5485232067510548, "eval_loss": 0.808323085308075, "eval_runtime": 853.577, "eval_samples_per_second": 2.468, "eval_steps_per_second": 2.468, "perplexity": 2.244141595588398}
14
- {"ts": "2025-12-27T01:12:54", "event": "eval", "step": 1400, "epoch": 0.5907172995780591, "eval_loss": 0.8009664416313171, "eval_runtime": 851.9417, "eval_samples_per_second": 2.473, "eval_steps_per_second": 2.473, "perplexity": 2.227692823570967}
15
- {"ts": "2025-12-27T01:43:40", "event": "eval", "step": 1500, "epoch": 0.6329113924050633, "eval_loss": 0.7896141409873962, "eval_runtime": 865.9069, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "perplexity": 2.2025463898941693}
16
- {"ts": "2025-12-27T02:14:07", "event": "eval", "step": 1600, "epoch": 0.6751054852320675, "eval_loss": 0.7836604714393616, "eval_runtime": 861.5352, "eval_samples_per_second": 2.446, "eval_steps_per_second": 2.446, "perplexity": 2.189472115099779}
17
- {"ts": "2025-12-27T02:44:39", "event": "eval", "step": 1700, "epoch": 0.7172995780590717, "eval_loss": 0.7783148884773254, "eval_runtime": 846.1986, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "perplexity": 2.1777993369634507}
18
- {"ts": "2025-12-27T03:15:22", "event": "eval", "step": 1800, "epoch": 0.759493670886076, "eval_loss": 0.7719914317131042, "eval_runtime": 853.1943, "eval_samples_per_second": 2.47, "eval_steps_per_second": 2.47, "perplexity": 2.16407156624064}
19
- {"ts": "2025-12-27T03:45:59", "event": "eval", "step": 1900, "epoch": 0.8016877637130801, "eval_loss": 0.7648926973342896, "eval_runtime": 865.9394, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "perplexity": 2.148763794201393}
20
- {"ts": "2025-12-27T04:16:30", "event": "eval", "step": 2000, "epoch": 0.8438818565400844, "eval_loss": 0.7587011456489563, "eval_runtime": 856.2276, "eval_samples_per_second": 2.461, "eval_steps_per_second": 2.461, "perplexity": 2.135500714003631}
21
- {"ts": "2025-12-27T04:47:14", "event": "eval", "step": 2100, "epoch": 0.8860759493670886, "eval_loss": 0.7559094429016113, "eval_runtime": 847.8311, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "perplexity": 2.1295473446786564}
22
- {"ts": "2025-12-27T05:17:56", "event": "eval", "step": 2200, "epoch": 0.9282700421940928, "eval_loss": 0.7497645616531372, "eval_runtime": 856.8766, "eval_samples_per_second": 2.459, "eval_steps_per_second": 2.459, "perplexity": 2.116501652297792}
23
- {"ts": "2025-12-27T05:48:33", "event": "eval", "step": 2300, "epoch": 0.9704641350210971, "eval_loss": 0.7464568614959717, "eval_runtime": 864.2128, "eval_samples_per_second": 2.438, "eval_steps_per_second": 2.438, "perplexity": 2.1095124648903094}
24
- {"ts": "2025-12-27T06:18:53", "event": "eval", "step": 2400, "epoch": 1.0126582278481013, "eval_loss": 0.7421699166297913, "eval_runtime": 854.2185, "eval_samples_per_second": 2.467, "eval_steps_per_second": 2.467, "perplexity": 2.100488457789446}
25
- {"ts": "2025-12-27T06:49:31", "event": "eval", "step": 2500, "epoch": 1.0548523206751055, "eval_loss": 0.741338849067688, "eval_runtime": 847.7478, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "perplexity": 2.098743535142341}
26
- {"ts": "2025-12-27T07:20:16", "event": "eval", "step": 2600, "epoch": 1.0970464135021096, "eval_loss": 0.7377332448959351, "eval_runtime": 859.6612, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451, "perplexity": 2.091189922548451}
27
- {"ts": "2025-12-27T07:51:03", "event": "eval", "step": 2700, "epoch": 1.139240506329114, "eval_loss": 0.7335711717605591, "eval_runtime": 861.9651, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444, "perplexity": 2.0825043247357775}
28
- {"ts": "2025-12-27T08:21:29", "event": "eval", "step": 2800, "epoch": 1.1814345991561181, "eval_loss": 0.7298192977905273, "eval_runtime": 849.544, "eval_samples_per_second": 2.48, "eval_steps_per_second": 2.48, "perplexity": 2.074705669900544}
29
- {"ts": "2025-12-27T08:52:09", "event": "eval", "step": 2900, "epoch": 1.2236286919831223, "eval_loss": 0.7281573414802551, "eval_runtime": 854.563, "eval_samples_per_second": 2.466, "eval_steps_per_second": 2.466, "perplexity": 2.0712604634048333}
30
- {"ts": "2025-12-27T09:23:05", "event": "eval", "step": 3000, "epoch": 1.2658227848101267, "eval_loss": 0.72515869140625, "eval_runtime": 868.0515, "eval_samples_per_second": 2.427, "eval_steps_per_second": 2.427, "perplexity": 2.0650587810476666}
31
- {"ts": "2025-12-27T09:53:39", "event": "eval", "step": 3100, "epoch": 1.3080168776371308, "eval_loss": 0.7225774526596069, "eval_runtime": 862.4006, "eval_samples_per_second": 2.443, "eval_steps_per_second": 2.443, "perplexity": 2.0597352449225896}
32
- {"ts": "2025-12-27T10:24:10", "event": "eval", "step": 3200, "epoch": 1.350210970464135, "eval_loss": 0.7200453281402588, "eval_runtime": 846.2953, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "perplexity": 2.0545263363912047}
33
- {"ts": "2025-12-27T10:54:40", "event": "eval", "step": 3300, "epoch": 1.3924050632911391, "eval_loss": 0.7173135876655579, "eval_runtime": 853.5344, "eval_samples_per_second": 2.469, "eval_steps_per_second": 2.469, "perplexity": 2.0489215625209867}
34
- {"ts": "2025-12-27T11:25:25", "event": "eval", "step": 3400, "epoch": 1.4345991561181435, "eval_loss": 0.715917706489563, "eval_runtime": 868.51, "eval_samples_per_second": 2.426, "eval_steps_per_second": 2.426, "perplexity": 2.046063506698008}
35
- {"ts": "2025-12-27T11:55:47", "event": "eval", "step": 3500, "epoch": 1.4767932489451476, "eval_loss": 0.7155047059059143, "eval_runtime": 855.8428, "eval_samples_per_second": 2.462, "eval_steps_per_second": 2.462, "perplexity": 2.0452186557495358}
36
- {"ts": "2025-12-27T12:26:22", "event": "eval", "step": 3600, "epoch": 1.518987341772152, "eval_loss": 0.7118256688117981, "eval_runtime": 851.3079, "eval_samples_per_second": 2.475, "eval_steps_per_second": 2.475, "perplexity": 2.0377080448290807}
37
- {"ts": "2025-12-27T12:57:01", "event": "eval", "step": 3700, "epoch": 1.5611814345991561, "eval_loss": 0.7099412679672241, "eval_runtime": 857.2273, "eval_samples_per_second": 2.458, "eval_steps_per_second": 2.458, "perplexity": 2.0338718017134907}
38
- {"ts": "2025-12-27T13:27:39", "event": "eval", "step": 3800, "epoch": 1.6033755274261603, "eval_loss": 0.7080941200256348, "eval_runtime": 865.6774, "eval_samples_per_second": 2.434, "eval_steps_per_second": 2.434, "perplexity": 2.030118407206169}
39
- {"ts": "2025-12-27T13:58:20", "event": "eval", "step": 3900, "epoch": 1.6455696202531644, "eval_loss": 0.7049403786659241, "eval_runtime": 854.9866, "eval_samples_per_second": 2.464, "eval_steps_per_second": 2.464, "perplexity": 2.023726024080043}
40
- {"ts": "2025-12-27T14:28:59", "event": "eval", "step": 4000, "epoch": 1.6877637130801688, "eval_loss": 0.7027890682220459, "eval_runtime": 848.7529, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "perplexity": 2.0193770408327394}
41
- {"ts": "2025-12-27T14:59:26", "event": "eval", "step": 4100, "epoch": 1.729957805907173, "eval_loss": 0.7022181153297424, "eval_runtime": 844.6405, "eval_samples_per_second": 2.495, "eval_steps_per_second": 2.495, "perplexity": 2.0182244007535304}
42
- {"ts": "2025-12-27T15:20:08", "event": "eval", "step": 4200, "epoch": 1.7721518987341773, "eval_loss": 0.6993561387062073, "eval_runtime": 542.0281, "eval_samples_per_second": 3.887, "eval_steps_per_second": 3.887, "perplexity": 2.012456547365305}
43
- {"ts": "2025-12-27T15:39:13", "event": "eval", "step": 4300, "epoch": 1.8143459915611815, "eval_loss": 0.6981000900268555, "eval_runtime": 514.4659, "eval_samples_per_second": 4.096, "eval_steps_per_second": 4.096, "perplexity": 2.0099303907966624}
44
- {"ts": "2025-12-27T15:58:13", "event": "eval", "step": 4400, "epoch": 1.8565400843881856, "eval_loss": 0.6961485147476196, "eval_runtime": 513.5724, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103, "perplexity": 2.0060116854010337}
45
- {"ts": "2025-12-27T16:17:15", "event": "eval", "step": 4500, "epoch": 1.8987341772151898, "eval_loss": 0.6938078999519348, "eval_runtime": 513.615, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102, "perplexity": 2.0013218754302557}
46
- {"ts": "2025-12-27T16:38:02", "event": "eval", "step": 4600, "epoch": 1.9409282700421941, "eval_loss": 0.6930755376815796, "eval_runtime": 617.8927, "eval_samples_per_second": 3.41, "eval_steps_per_second": 3.41, "perplexity": 1.999856719375848}
47
- {"ts": "2025-12-27T16:58:16", "event": "eval", "step": 4700, "epoch": 1.9831223628691983, "eval_loss": 0.6923081278800964, "eval_runtime": 514.7729, "eval_samples_per_second": 4.093, "eval_steps_per_second": 4.093, "perplexity": 1.9983225984528428}
48
- {"ts": "2025-12-27T17:17:24", "event": "eval", "step": 4800, "epoch": 2.0253164556962027, "eval_loss": 0.6924457550048828, "eval_runtime": 514.0427, "eval_samples_per_second": 4.099, "eval_steps_per_second": 4.099, "perplexity": 1.998597640772671}
49
- {"ts": "2025-12-27T17:36:32", "event": "eval", "step": 4900, "epoch": 2.067510548523207, "eval_loss": 0.6941288113594055, "eval_runtime": 513.4497, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 2.0019642255133236}
50
- {"ts": "2025-12-27T17:58:22", "event": "eval", "step": 5000, "epoch": 2.109704641350211, "eval_loss": 0.6908889412879944, "eval_runtime": 675.8398, "eval_samples_per_second": 3.118, "eval_steps_per_second": 3.118, "perplexity": 1.9954886172641344}
51
- {"ts": "2025-12-27T18:23:03", "event": "eval", "step": 5100, "epoch": 2.151898734177215, "eval_loss": 0.6902023553848267, "eval_runtime": 733.915, "eval_samples_per_second": 2.871, "eval_steps_per_second": 2.871, "perplexity": 1.9941190131388347}
52
- {"ts": "2025-12-27T18:55:59", "event": "eval", "step": 5200, "epoch": 2.1940928270042193, "eval_loss": 0.6915348172187805, "eval_runtime": 1167.9782, "eval_samples_per_second": 1.804, "eval_steps_per_second": 1.804, "perplexity": 1.9967778716365487}
53
- {"ts": "2025-12-27T19:32:22", "event": "eval", "step": 5300, "epoch": 2.2362869198312234, "eval_loss": 0.6898328065872192, "eval_runtime": 739.3794, "eval_samples_per_second": 2.85, "eval_steps_per_second": 2.85, "perplexity": 1.993382225003213}
54
- {"ts": "2025-12-27T19:58:17", "event": "eval", "step": 5400, "epoch": 2.278481012658228, "eval_loss": 0.6875645518302917, "eval_runtime": 861.3558, "eval_samples_per_second": 2.446, "eval_steps_per_second": 2.446, "perplexity": 1.988865850369486}
55
- {"ts": "2025-12-27T20:31:44", "event": "eval", "step": 5500, "epoch": 2.320675105485232, "eval_loss": 0.6867148876190186, "eval_runtime": 941.3545, "eval_samples_per_second": 2.238, "eval_steps_per_second": 2.238, "perplexity": 1.9871766999423568}
56
- {"ts": "2025-12-27T21:05:14", "event": "eval", "step": 5600, "epoch": 2.3628691983122363, "eval_loss": 0.6851074695587158, "eval_runtime": 938.5536, "eval_samples_per_second": 2.245, "eval_steps_per_second": 2.245, "perplexity": 1.9839850420773193}
57
- {"ts": "2025-12-27T21:38:52", "event": "eval", "step": 5700, "epoch": 2.4050632911392404, "eval_loss": 0.6841402053833008, "eval_runtime": 941.6641, "eval_samples_per_second": 2.238, "eval_steps_per_second": 2.238, "perplexity": 1.9820669322305768}
58
- {"ts": "2025-12-27T22:09:41", "event": "eval", "step": 5800, "epoch": 2.4472573839662446, "eval_loss": 0.6835155487060547, "eval_runtime": 758.407, "eval_samples_per_second": 2.778, "eval_steps_per_second": 2.778, "perplexity": 1.9808292075033642}
59
- {"ts": "2025-12-27T22:28:42", "event": "eval", "step": 5900, "epoch": 2.489451476793249, "eval_loss": 0.6820966005325317, "eval_runtime": 513.3515, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 1.9780205066890182}
60
- {"ts": "2025-12-27T22:47:43", "event": "eval", "step": 6000, "epoch": 2.5316455696202533, "eval_loss": 0.6813357472419739, "eval_runtime": 513.5491, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103, "perplexity": 1.9765160956683256}
61
- {"ts": "2025-12-27T23:06:47", "event": "eval", "step": 6100, "epoch": 2.5738396624472575, "eval_loss": 0.6812278628349304, "eval_runtime": 513.4749, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103, "perplexity": 1.9763028719032991}
62
- {"ts": "2025-12-27T23:25:56", "event": "eval", "step": 6200, "epoch": 2.6160337552742616, "eval_loss": 0.6795271039009094, "eval_runtime": 513.2393, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105, "perplexity": 1.972944513825857}
63
- {"ts": "2025-12-27T23:44:50", "event": "eval", "step": 6300, "epoch": 2.6582278481012658, "eval_loss": 0.6781066656112671, "eval_runtime": 512.3669, "eval_samples_per_second": 4.112, "eval_steps_per_second": 4.112, "perplexity": 1.9701440573037758}
64
- {"ts": "2025-12-28T00:03:48", "event": "eval", "step": 6400, "epoch": 2.70042194092827, "eval_loss": 0.6764505505561829, "eval_runtime": 512.7682, "eval_samples_per_second": 4.109, "eval_steps_per_second": 4.109, "perplexity": 1.9668839723527984}
65
- {"ts": "2025-12-28T00:22:46", "event": "eval", "step": 6500, "epoch": 2.742616033755274, "eval_loss": 0.6768895387649536, "eval_runtime": 513.0657, "eval_samples_per_second": 4.107, "eval_steps_per_second": 4.107, "perplexity": 1.9677476007721588}
66
- {"ts": "2025-12-28T00:41:51", "event": "eval", "step": 6600, "epoch": 2.7848101265822782, "eval_loss": 0.6737648844718933, "eval_runtime": 512.921, "eval_samples_per_second": 4.108, "eval_steps_per_second": 4.108, "perplexity": 1.9616086658032716}
67
- {"ts": "2025-12-28T01:00:52", "event": "eval", "step": 6700, "epoch": 2.827004219409283, "eval_loss": 0.6737436056137085, "eval_runtime": 513.2559, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105, "perplexity": 1.961566925454753}
68
- {"ts": "2025-12-28T01:19:55", "event": "eval", "step": 6800, "epoch": 2.869198312236287, "eval_loss": 0.6721681356430054, "eval_runtime": 513.1285, "eval_samples_per_second": 4.106, "eval_steps_per_second": 4.106, "perplexity": 1.9584789687983855}
69
- {"ts": "2025-12-28T01:38:47", "event": "eval", "step": 6900, "epoch": 2.911392405063291, "eval_loss": 0.6713213920593262, "eval_runtime": 513.1265, "eval_samples_per_second": 4.106, "eval_steps_per_second": 4.106, "perplexity": 1.9568213411895954}
70
- {"ts": "2025-12-28T01:57:47", "event": "eval", "step": 7000, "epoch": 2.9535864978902953, "eval_loss": 0.6706293225288391, "eval_runtime": 513.4396, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 1.955467553274469}
71
- {"ts": "2025-12-28T02:16:49", "event": "eval", "step": 7100, "epoch": 2.9957805907173, "eval_loss": 0.6692973375320435, "eval_runtime": 512.8985, "eval_samples_per_second": 4.108, "eval_steps_per_second": 4.108, "perplexity": 1.9528646337415076}
72
- {"ts": "2025-12-28T02:35:50", "event": "eval", "step": 7200, "epoch": 3.037974683544304, "eval_loss": 0.6751418709754944, "eval_runtime": 513.8972, "eval_samples_per_second": 4.1, "eval_steps_per_second": 4.1, "perplexity": 1.9643116350103986}
73
- {"ts": "2025-12-28T02:54:46", "event": "eval", "step": 7300, "epoch": 3.080168776371308, "eval_loss": 0.678839385509491, "eval_runtime": 513.7013, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102, "perplexity": 1.9715881500500663}
74
- {"ts": "2025-12-28T03:13:51", "event": "eval", "step": 7400, "epoch": 3.1223628691983123, "eval_loss": 0.676459550857544, "eval_runtime": 513.5901, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102, "perplexity": 1.9669016749809562}
75
- {"ts": "2025-12-28T03:32:54", "event": "eval", "step": 7500, "epoch": 3.1645569620253164, "eval_loss": 0.6774632334709167, "eval_runtime": 513.4064, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 1.9688768110333967}
76
- {"ts": "2025-12-28T03:51:52", "event": "eval", "step": 7600, "epoch": 3.2067510548523206, "eval_loss": 0.6755207777023315, "eval_runtime": 513.9779, "eval_samples_per_second": 4.099, "eval_steps_per_second": 4.099, "perplexity": 1.965056066928733}
77
- {"ts": "2025-12-28T04:00:24", "event": "eval", "step": 7600, "epoch": 3.2067510548523206, "eval_loss": 0.6706293225288391, "eval_runtime": 511.6513, "eval_samples_per_second": 4.118, "eval_steps_per_second": 4.118, "perplexity": 1.955467553274469}
 
1
+ {"ts": "2025-12-26T18:41:10", "event": "eval", "step": 100, "epoch": 0.04219409282700422, "eval_loss": 1.2979938983917236, "eval_runtime": 682.1979, "eval_samples_per_second": 3.089, "eval_steps_per_second": 3.089, "perplexity": 3.661943064177116}
2
+ {"ts": "2025-12-26T19:05:13", "event": "eval", "step": 200, "epoch": 0.08438818565400844, "eval_loss": 1.142486810684204, "eval_runtime": 668.2356, "eval_samples_per_second": 3.153, "eval_steps_per_second": 3.153, "perplexity": 3.134553722506413}
3
+ {"ts": "2025-12-26T19:29:29", "event": "eval", "step": 300, "epoch": 0.12658227848101267, "eval_loss": 1.0952109098434448, "eval_runtime": 677.0652, "eval_samples_per_second": 3.112, "eval_steps_per_second": 3.112, "perplexity": 2.98981319793367}
4
+ {"ts": "2025-12-26T19:53:55", "event": "eval", "step": 400, "epoch": 0.16877637130801687, "eval_loss": 1.0625108480453491, "eval_runtime": 691.0068, "eval_samples_per_second": 3.049, "eval_steps_per_second": 3.049, "perplexity": 2.893627334202045}
5
+ {"ts": "2025-12-26T20:18:00", "event": "eval", "step": 500, "epoch": 0.2109704641350211, "eval_loss": 1.042096495628357, "eval_runtime": 692.4361, "eval_samples_per_second": 3.043, "eval_steps_per_second": 3.043, "perplexity": 2.8351546774213405}
6
+ {"ts": "2025-12-26T20:42:00", "event": "eval", "step": 600, "epoch": 0.25316455696202533, "eval_loss": 1.0193854570388794, "eval_runtime": 677.9523, "eval_samples_per_second": 3.108, "eval_steps_per_second": 3.108, "perplexity": 2.7714910402016297}
7
+ {"ts": "2025-12-26T21:06:13", "event": "eval", "step": 700, "epoch": 0.29535864978902954, "eval_loss": 0.996929407119751, "eval_runtime": 668.6398, "eval_samples_per_second": 3.151, "eval_steps_per_second": 3.151, "perplexity": 2.7099478932392134}
8
+ {"ts": "2025-12-26T21:30:25", "event": "eval", "step": 800, "epoch": 0.33755274261603374, "eval_loss": 0.9800403714179993, "eval_runtime": 678.8306, "eval_samples_per_second": 3.104, "eval_steps_per_second": 3.104, "perplexity": 2.6645638119774637}
9
+ {"ts": "2025-12-26T21:54:42", "event": "eval", "step": 900, "epoch": 0.379746835443038, "eval_loss": 0.9643027186393738, "eval_runtime": 691.7929, "eval_samples_per_second": 3.046, "eval_steps_per_second": 3.046, "perplexity": 2.6229580789054108}
10
+ {"ts": "2025-12-26T22:18:39", "event": "eval", "step": 1000, "epoch": 0.4219409282700422, "eval_loss": 0.9487298727035522, "eval_runtime": 689.4288, "eval_samples_per_second": 3.056, "eval_steps_per_second": 3.056, "perplexity": 2.5824275636777196}
11
+ {"ts": "2025-12-26T22:42:41", "event": "eval", "step": 1100, "epoch": 0.4641350210970464, "eval_loss": 0.9357889294624329, "eval_runtime": 676.9573, "eval_samples_per_second": 3.112, "eval_steps_per_second": 3.112, "perplexity": 2.549223822396605}
12
+ {"ts": "2025-12-26T23:06:55", "event": "eval", "step": 1200, "epoch": 0.5063291139240507, "eval_loss": 0.9224098324775696, "eval_runtime": 669.7542, "eval_samples_per_second": 3.146, "eval_steps_per_second": 3.146, "perplexity": 2.515344651361619}
13
+ {"ts": "2025-12-26T23:31:25", "event": "eval", "step": 1300, "epoch": 0.5485232067510548, "eval_loss": 0.9068717360496521, "eval_runtime": 680.7718, "eval_samples_per_second": 3.095, "eval_steps_per_second": 3.095, "perplexity": 2.476563059931004}
14
+ {"ts": "2025-12-26T23:55:39", "event": "eval", "step": 1400, "epoch": 0.5907172995780591, "eval_loss": 0.8971880674362183, "eval_runtime": 692.8046, "eval_samples_per_second": 3.041, "eval_steps_per_second": 3.041, "perplexity": 2.452696587964245}
15
+ {"ts": "2025-12-27T00:19:35", "event": "eval", "step": 1500, "epoch": 0.6329113924050633, "eval_loss": 0.887488842010498, "eval_runtime": 686.2804, "eval_samples_per_second": 3.07, "eval_steps_per_second": 3.07, "perplexity": 2.4290223274474503}
16
+ {"ts": "2025-12-27T00:43:47", "event": "eval", "step": 1600, "epoch": 0.6751054852320675, "eval_loss": 0.8769772052764893, "eval_runtime": 677.9338, "eval_samples_per_second": 3.108, "eval_steps_per_second": 3.108, "perplexity": 2.403623054958293}
17
+ {"ts": "2025-12-27T01:08:03", "event": "eval", "step": 1700, "epoch": 0.7172995780590717, "eval_loss": 0.8708170056343079, "eval_runtime": 670.3019, "eval_samples_per_second": 3.143, "eval_steps_per_second": 3.143, "perplexity": 2.388861769986548}
18
+ {"ts": "2025-12-27T01:32:23", "event": "eval", "step": 1800, "epoch": 0.759493670886076, "eval_loss": 0.8625519275665283, "eval_runtime": 686.4271, "eval_samples_per_second": 3.07, "eval_steps_per_second": 3.07, "perplexity": 2.369199010020167}
19
+ {"ts": "2025-12-27T01:56:20", "event": "eval", "step": 1900, "epoch": 0.8016877637130801, "eval_loss": 0.8546335697174072, "eval_runtime": 688.5301, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "perplexity": 2.3505129236377402}
20
+ {"ts": "2025-12-27T02:20:26", "event": "eval", "step": 2000, "epoch": 0.8438818565400844, "eval_loss": 0.8460908532142639, "eval_runtime": 685.2518, "eval_samples_per_second": 3.075, "eval_steps_per_second": 3.075, "perplexity": 2.330518682256874}
21
+ {"ts": "2025-12-27T02:44:39", "event": "eval", "step": 2100, "epoch": 0.8860759493670886, "eval_loss": 0.8401098847389221, "eval_runtime": 669.1149, "eval_samples_per_second": 3.149, "eval_steps_per_second": 3.149, "perplexity": 2.3166215241467625}
22
+ {"ts": "2025-12-27T03:09:05", "event": "eval", "step": 2200, "epoch": 0.9282700421940928, "eval_loss": 0.8336610198020935, "eval_runtime": 674.5134, "eval_samples_per_second": 3.124, "eval_steps_per_second": 3.124, "perplexity": 2.3017300131082887}
23
+ {"ts": "2025-12-27T03:33:21", "event": "eval", "step": 2300, "epoch": 0.9704641350210971, "eval_loss": 0.8281980156898499, "eval_runtime": 688.6136, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "perplexity": 2.289189937012629}
24
+ {"ts": "2025-12-27T03:57:18", "event": "eval", "step": 2400, "epoch": 1.0126582278481013, "eval_loss": 0.8250564932823181, "eval_runtime": 691.5833, "eval_samples_per_second": 3.047, "eval_steps_per_second": 3.047, "perplexity": 2.282009679904965}
25
+ {"ts": "2025-12-27T04:21:22", "event": "eval", "step": 2500, "epoch": 1.0548523206751055, "eval_loss": 0.8249453902244568, "eval_runtime": 679.4446, "eval_samples_per_second": 3.101, "eval_steps_per_second": 3.101, "perplexity": 2.2817561557353745}
26
+ {"ts": "2025-12-27T04:45:36", "event": "eval", "step": 2600, "epoch": 1.0970464135021096, "eval_loss": 0.8211485743522644, "eval_runtime": 670.2276, "eval_samples_per_second": 3.144, "eval_steps_per_second": 3.144, "perplexity": 2.2731091736340194}
27
+ {"ts": "2025-12-27T05:10:00", "event": "eval", "step": 2700, "epoch": 1.139240506329114, "eval_loss": 0.8155058026313782, "eval_runtime": 678.284, "eval_samples_per_second": 3.106, "eval_steps_per_second": 3.106, "perplexity": 2.2603186583878263}
28
+ {"ts": "2025-12-27T05:34:19", "event": "eval", "step": 2800, "epoch": 1.1814345991561181, "eval_loss": 0.8124309182167053, "eval_runtime": 688.4759, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "perplexity": 2.2533791143885313}
29
+ {"ts": "2025-12-27T05:58:11", "event": "eval", "step": 2900, "epoch": 1.2236286919831223, "eval_loss": 0.8077136278152466, "eval_runtime": 685.6042, "eval_samples_per_second": 3.073, "eval_steps_per_second": 3.073, "perplexity": 2.2427743033735634}
30
+ {"ts": "2025-12-27T06:22:11", "event": "eval", "step": 3000, "epoch": 1.2658227848101267, "eval_loss": 0.8033165335655212, "eval_runtime": 678.7554, "eval_samples_per_second": 3.104, "eval_steps_per_second": 3.104, "perplexity": 2.232934263027478}
31
+ {"ts": "2025-12-27T06:46:29", "event": "eval", "step": 3100, "epoch": 1.3080168776371308, "eval_loss": 0.8010181784629822, "eval_runtime": 668.1688, "eval_samples_per_second": 3.153, "eval_steps_per_second": 3.153, "perplexity": 2.2278080803210654}
32
+ {"ts": "2025-12-27T07:10:54", "event": "eval", "step": 3200, "epoch": 1.350210970464135, "eval_loss": 0.797160804271698, "eval_runtime": 680.976, "eval_samples_per_second": 3.094, "eval_steps_per_second": 3.094, "perplexity": 2.2192311437906307}
33
+ {"ts": "2025-12-27T07:35:02", "event": "eval", "step": 3300, "epoch": 1.3924050632911391, "eval_loss": 0.795619547367096, "eval_runtime": 692.7157, "eval_samples_per_second": 3.042, "eval_steps_per_second": 3.042, "perplexity": 2.215813372975358}
34
+ {"ts": "2025-12-27T07:59:01", "event": "eval", "step": 3400, "epoch": 1.4345991561181435, "eval_loss": 0.7917885780334473, "eval_runtime": 686.1689, "eval_samples_per_second": 3.071, "eval_steps_per_second": 3.071, "perplexity": 2.2073408991501657}
35
+ {"ts": "2025-12-27T08:22:55", "event": "eval", "step": 3500, "epoch": 1.4767932489451476, "eval_loss": 0.7902651429176331, "eval_runtime": 672.312, "eval_samples_per_second": 3.134, "eval_steps_per_second": 3.134, "perplexity": 2.203980718670171}
36
+ {"ts": "2025-12-27T08:47:16", "event": "eval", "step": 3600, "epoch": 1.518987341772152, "eval_loss": 0.785450279712677, "eval_runtime": 675.2312, "eval_samples_per_second": 3.12, "eval_steps_per_second": 3.12, "perplexity": 2.1933943593911716}
37
+ {"ts": "2025-12-27T09:11:38", "event": "eval", "step": 3700, "epoch": 1.5611814345991561, "eval_loss": 0.7854447960853577, "eval_runtime": 687.7907, "eval_samples_per_second": 3.063, "eval_steps_per_second": 3.063, "perplexity": 2.193382331666918}
38
+ {"ts": "2025-12-27T09:35:44", "event": "eval", "step": 3800, "epoch": 1.6033755274261603, "eval_loss": 0.778353214263916, "eval_runtime": 692.5522, "eval_samples_per_second": 3.042, "eval_steps_per_second": 3.042, "perplexity": 2.1778828044355443}
39
+ {"ts": "2025-12-27T09:59:49", "event": "eval", "step": 3900, "epoch": 1.6455696202531644, "eval_loss": 0.7763351202011108, "eval_runtime": 682.0824, "eval_samples_per_second": 3.089, "eval_steps_per_second": 3.089, "perplexity": 2.173492064032179}
40
+ {"ts": "2025-12-27T10:23:55", "event": "eval", "step": 4000, "epoch": 1.6877637130801688, "eval_loss": 0.7721371650695801, "eval_runtime": 668.395, "eval_samples_per_second": 3.152, "eval_steps_per_second": 3.152, "perplexity": 2.1643869666352633}
41
+ {"ts": "2025-12-27T10:48:15", "event": "eval", "step": 4100, "epoch": 1.729957805907173, "eval_loss": 0.7690847516059875, "eval_runtime": 673.6323, "eval_samples_per_second": 3.128, "eval_steps_per_second": 3.128, "perplexity": 2.157790435509873}
42
+ {"ts": "2025-12-27T11:12:31", "event": "eval", "step": 4200, "epoch": 1.7721518987341773, "eval_loss": 0.7676366567611694, "eval_runtime": 687.9619, "eval_samples_per_second": 3.063, "eval_steps_per_second": 3.063, "perplexity": 2.1546680116326113}
43
+ {"ts": "2025-12-27T11:36:30", "event": "eval", "step": 4300, "epoch": 1.8143459915611815, "eval_loss": 0.7672964930534363, "eval_runtime": 688.4249, "eval_samples_per_second": 3.061, "eval_steps_per_second": 3.061, "perplexity": 2.1539351964184767}
44
+ {"ts": "2025-12-27T12:00:29", "event": "eval", "step": 4400, "epoch": 1.8565400843881856, "eval_loss": 0.7635221481323242, "eval_runtime": 678.243, "eval_samples_per_second": 3.107, "eval_steps_per_second": 3.107, "perplexity": 2.1458208249008255}
45
+ {"ts": "2025-12-27T12:24:47", "event": "eval", "step": 4500, "epoch": 1.8987341772151898, "eval_loss": 0.7600579857826233, "eval_runtime": 674.2593, "eval_samples_per_second": 3.125, "eval_steps_per_second": 3.125, "perplexity": 2.138400213711816}
46
+ {"ts": "2025-12-27T12:49:10", "event": "eval", "step": 4600, "epoch": 1.9409282700421941, "eval_loss": 0.7585541009902954, "eval_runtime": 679.0866, "eval_samples_per_second": 3.103, "eval_steps_per_second": 3.103, "perplexity": 2.1351867231159773}
47
+ {"ts": "2025-12-27T13:13:19", "event": "eval", "step": 4700, "epoch": 1.9831223628691983, "eval_loss": 0.7582268714904785, "eval_runtime": 690.081, "eval_samples_per_second": 3.053, "eval_steps_per_second": 3.053, "perplexity": 2.134488141337073}
48
+ {"ts": "2025-12-27T13:37:21", "event": "eval", "step": 4800, "epoch": 2.0253164556962027, "eval_loss": 0.7633068561553955, "eval_runtime": 688.8684, "eval_samples_per_second": 3.059, "eval_steps_per_second": 3.059, "perplexity": 2.145358896619808}
49
+ {"ts": "2025-12-27T14:01:30", "event": "eval", "step": 4900, "epoch": 2.067510548523207, "eval_loss": 0.7676681280136108, "eval_runtime": 676.0104, "eval_samples_per_second": 3.117, "eval_steps_per_second": 3.117, "perplexity": 2.1547358228005784}
50
+ {"ts": "2025-12-27T14:25:57", "event": "eval", "step": 5000, "epoch": 2.109704641350211, "eval_loss": 0.7635271549224854, "eval_runtime": 669.5049, "eval_samples_per_second": 3.147, "eval_steps_per_second": 3.147, "perplexity": 2.145831568602315}
51
+ {"ts": "2025-12-27T14:50:12", "event": "eval", "step": 5100, "epoch": 2.151898734177215, "eval_loss": 0.7654595971107483, "eval_runtime": 681.4966, "eval_samples_per_second": 3.092, "eval_steps_per_second": 3.092, "perplexity": 2.149982273261109}
52
+ {"ts": "2025-12-27T15:01:27", "event": "eval", "step": 5100, "epoch": 2.151898734177215, "eval_loss": 0.7600579857826233, "eval_runtime": 674.048, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "perplexity": 2.138400213711816}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log CHANGED
@@ -1,12 +1,12 @@
1
- {"time":"2025-12-26T18:08:08.66103332Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
- {"time":"2025-12-26T18:08:08.82172381Z","level":"INFO","msg":"stream: created new stream","id":"ny9q48hd"}
3
- {"time":"2025-12-26T18:08:08.821819478Z","level":"INFO","msg":"handler: started","stream_id":"ny9q48hd"}
4
- {"time":"2025-12-26T18:08:08.822049155Z","level":"INFO","msg":"stream: started","id":"ny9q48hd"}
5
- {"time":"2025-12-26T18:08:08.822072296Z","level":"INFO","msg":"writer: started","stream_id":"ny9q48hd"}
6
- {"time":"2025-12-26T18:08:08.822098276Z","level":"INFO","msg":"sender: started","stream_id":"ny9q48hd"}
7
- {"time":"2025-12-28T04:02:04.935383596Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
- {"time":"2025-12-28T04:02:05.045953421Z","level":"INFO","msg":"handler: operation stats","stats":{}}
9
- {"time":"2025-12-28T04:02:05.051806259Z","level":"INFO","msg":"stream: closing","id":"ny9q48hd"}
10
- {"time":"2025-12-28T04:02:05.051833004Z","level":"INFO","msg":"handler: closed","stream_id":"ny9q48hd"}
11
- {"time":"2025-12-28T04:02:05.051917075Z","level":"INFO","msg":"sender: closed","stream_id":"ny9q48hd"}
12
- {"time":"2025-12-28T04:02:05.051937152Z","level":"INFO","msg":"stream: closed","id":"ny9q48hd"}
 
1
+ {"time":"2025-12-26T18:15:45.044149374Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-26T18:15:45.212148231Z","level":"INFO","msg":"stream: created new stream","id":"upub1jan"}
3
+ {"time":"2025-12-26T18:15:45.212312297Z","level":"INFO","msg":"handler: started","stream_id":"upub1jan"}
4
+ {"time":"2025-12-26T18:15:45.212463318Z","level":"INFO","msg":"stream: started","id":"upub1jan"}
5
+ {"time":"2025-12-26T18:15:45.212498387Z","level":"INFO","msg":"writer: started","stream_id":"upub1jan"}
6
+ {"time":"2025-12-26T18:15:45.212503642Z","level":"INFO","msg":"sender: started","stream_id":"upub1jan"}
7
+ {"time":"2025-12-27T15:02:32.177112089Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
+ {"time":"2025-12-27T15:02:32.275585886Z","level":"INFO","msg":"handler: operation stats","stats":{}}
9
+ {"time":"2025-12-27T15:02:32.280819216Z","level":"INFO","msg":"stream: closing","id":"upub1jan"}
10
+ {"time":"2025-12-27T15:02:32.280845154Z","level":"INFO","msg":"handler: closed","stream_id":"upub1jan"}
11
+ {"time":"2025-12-27T15:02:32.280896164Z","level":"INFO","msg":"sender: closed","stream_id":"upub1jan"}
12
+ {"time":"2025-12-27T15:02:32.280915121Z","level":"INFO","msg":"stream: closed","id":"upub1jan"}
wandb/debug.log CHANGED
@@ -1,29 +1,29 @@
1
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Configure stats pid to 190322
3
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings
5
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug.log
7
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-internal.log
8
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():841] calling init triggers
9
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
- config: {'model': {'repo_id': './Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n<EOS>\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '1e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 0.8, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_devstral_24B_v2', '_wandb': {}}
11
- 2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():889] starting backend
12
- 2025-12-26 18:08:08,653 INFO MainThread:190322 [wandb_init.py:init():892] sending inform_init request
13
- 2025-12-26 18:08:08,658 INFO MainThread:190322 [wandb_init.py:init():900] backend started and connected
14
- 2025-12-26 18:08:08,661 INFO MainThread:190322 [wandb_init.py:init():970] updated telemetry
15
- 2025-12-26 18:08:08,662 INFO MainThread:190322 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
- 2025-12-26 18:08:09,021 INFO MainThread:190322 [wandb_init.py:init():1041] starting run threads in backend
17
- 2025-12-26 18:08:09,134 INFO MainThread:190322 [wandb_run.py:_console_start():2521] atexit reg
18
- 2025-12-26 18:08:09,134 INFO MainThread:190322 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
- 2025-12-26 18:08:09,135 INFO MainThread:190322 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
- 2025-12-26 18:08:09,135 INFO MainThread:190322 [wandb_run.py:_redirect():2461] Redirects installed.
21
- 2025-12-26 18:08:09,138 INFO MainThread:190322 [wandb_init.py:init():1081] run started, returning control to user process
22
- 2025-12-26 18:08:52,955 INFO MainThread:190322 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': 'Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'inference_mode': False, 'r': 8, 'target_modules': ['v_proj', 'q_proj', 'o_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 16, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'image_token_index': 10, 'projector_hidden_act': 'gelu', 'vision_feature_layer': -1, 'vision_config': {'hidden_size': 1024, 'intermediate_size': 4096, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'num_channels': 3, 'patch_size': 14, 'image_size': 1540, 'attention_dropout': 0.0, 'hidden_act': 'silu', 'head_dim': 64, 'initializer_range': 0.02, 'rope_parameters': {'rope_theta': 10000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '', 'model_type': 'pixtral', 'output_attentions': False}, 'text_config': {'vocab_size': 131072, 'max_position_embeddings': 393216, 'hidden_size': 5120, 'intermediate_size': 32768, 'num_hidden_layers': 40, 'num_attention_heads': 32, 'sliding_window': None, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': True, 'attention_dropout': 0.0, 'rope_parameters': {'beta_fast': 32.0, 'beta_slow': 1.0, 'factor': 48.0, 'llama_4_scaling_beta': 0.1, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'original_max_position_embeddings': 8192, 'rope_theta': 100000000.0, 'rope_type': 'yarn', 'type': 'yarn'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': 11, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '', 'model_type': 'ministral3', 'output_attentions': False}, 'multimodal_projector_bias': False, 'spatial_merge_size': 2, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Mistral3ForConditionalGeneration'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': 'Models/Devstral-Small-2-24B-HS-CPT', 'transformers_version': '5.0.0.dev0', 'model_type': 'mistral3', 'use_cache': False, 'output_attentions': False, 'output_dir': 'task2file/sft_devstral_24B_v2/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 0.8, 'num_train_epochs': 6.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.08, 'warmup_steps': 0.08, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True}
23
- 2025-12-26 18:08:52,965 INFO MainThread:190322 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 24022764544 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7b8940b75420>>
24
- 2025-12-26 18:08:52,965 INFO MainThread:190322 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 24022764544 None
25
- 2025-12-28 04:02:04,643 INFO MainThread:190322 [wandb_run.py:_finish():2287] finishing run sirajuddin-shaik-007/sft-training/ny9q48hd
26
- 2025-12-28 04:02:04,645 INFO MainThread:190322 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
27
- 2025-12-28 04:02:04,646 INFO MainThread:190322 [wandb_run.py:_restore():2468] restore
28
- 2025-12-28 04:02:04,646 INFO MainThread:190322 [wandb_run.py:_restore():2474] restore done
29
- 2025-12-28 04:02:05,050 INFO MainThread:190322 [wandb_run.py:_footer_sync_info():3862] logging synced files
 
1
+ 2025-12-26 18:15:44,766 INFO MainThread:194421 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Configure stats pid to 194421
3
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings
5
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_qwen_14B_v2/wandb/run-20251226_181544-upub1jan/logs/debug.log
7
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_qwen_14B_v2/wandb/run-20251226_181544-upub1jan/logs/debug-internal.log
8
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'model': {'repo_id': './Models/Qwen2.5-Coder-14B-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n<EOS>\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 32, 'lora_alpha': 64, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '2e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_qwen_14B_v2', '_wandb': {}}
11
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:init():889] starting backend
12
+ 2025-12-26 18:15:45,035 INFO MainThread:194421 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-26 18:15:45,040 INFO MainThread:194421 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-26 18:15:45,044 INFO MainThread:194421 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-26 18:15:45,045 INFO MainThread:194421 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-26 18:15:45,420 INFO MainThread:194421 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-26 18:15:45,542 INFO MainThread:194421 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 18:17:10,652 INFO MainThread:194421 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': 'Models/Qwen2.5-Coder-14B-CPT', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['o_proj', 'v_proj', 'k_proj', 'q_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': 'Models/Qwen2.5-Coder-14B-CPT', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'task2file/sft_qwen_14B_v2/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0002, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.08, 'warmup_steps': 0.08, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True}
23
+ 2025-12-26 18:17:10,660 INFO MainThread:194421 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14820365312 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7ed24f0556c0>>
24
+ 2025-12-26 18:17:10,660 INFO MainThread:194421 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14820365312 None
25
+ 2025-12-27 15:02:31,903 INFO MainThread:194421 [wandb_run.py:_finish():2287] finishing run sirajuddin-shaik-007/sft-training/upub1jan
26
+ 2025-12-27 15:02:31,905 INFO MainThread:194421 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
27
+ 2025-12-27 15:02:31,906 INFO MainThread:194421 [wandb_run.py:_restore():2468] restore
28
+ 2025-12-27 15:02:31,906 INFO MainThread:194421 [wandb_run.py:_restore():2474] restore done
29
+ 2025-12-27 15:02:32,279 INFO MainThread:194421 [wandb_run.py:_footer_sync_info():3862] logging synced files
wandb/run-20251226_181544-upub1jan/files/config.yaml ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: Models/Qwen2.5-Coder-14B-CPT
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.23.1
6
+ e:
7
+ ba9dsvvs7npkm5vvx6733495pp0yghz4:
8
+ args:
9
+ - --config
10
+ - trainer-kit/SFT-14b/config_instruct.yaml
11
+ codePath: trainer-kit/SFT-14b/run_instruct.py
12
+ codePathLocal: trainer-kit/SFT-14b/run_instruct.py
13
+ cpu_count: 12
14
+ cpu_count_logical: 24
15
+ cudaVersion: "13.0"
16
+ disk:
17
+ /:
18
+ total: "791251738624"
19
+ used: "392925650944"
20
+ email: shaiksirajuddin9949@gmail.com
21
+ executable: /workspace/llm_finetuning_env/bin/python
22
+ gpu: NVIDIA A100-SXM4-80GB
23
+ gpu_count: 2
24
+ gpu_nvidia:
25
+ - architecture: Ampere
26
+ cudaCores: 6912
27
+ memoryTotal: "85899345920"
28
+ name: NVIDIA A100-SXM4-80GB
29
+ uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba
30
+ - architecture: Ampere
31
+ cudaCores: 6912
32
+ memoryTotal: "85899345920"
33
+ name: NVIDIA A100-SXM4-80GB
34
+ uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40
35
+ host: a100-2gpu-shell-session-757d587799-mfdvv
36
+ memory:
37
+ total: "359047892992"
38
+ os: Linux-6.12.46+-x86_64-with-glibc2.35
39
+ program: /workspace/trainer-kit/SFT-14b/run_instruct.py
40
+ python: CPython 3.10.12
41
+ root: task2file/sft_qwen_14B_v2
42
+ startedAt: "2025-12-26T18:15:44.765252Z"
43
+ writerId: ba9dsvvs7npkm5vvx6733495pp0yghz4
44
+ m:
45
+ - "1": train/global_step
46
+ "6":
47
+ - 3
48
+ "7": []
49
+ - "2": '*'
50
+ "5": 1
51
+ "6":
52
+ - 1
53
+ "7": []
54
+ python_version: 3.10.12
55
+ t:
56
+ "1":
57
+ - 1
58
+ - 11
59
+ - 41
60
+ - 49
61
+ - 51
62
+ - 71
63
+ - 98
64
+ "2":
65
+ - 1
66
+ - 11
67
+ - 41
68
+ - 49
69
+ - 51
70
+ - 71
71
+ - 98
72
+ "3":
73
+ - 2
74
+ - 7
75
+ - 15
76
+ - 16
77
+ - 19
78
+ - 62
79
+ - 66
80
+ "4": 3.10.12
81
+ "5": 0.23.1
82
+ "6": 5.0.0.dev0
83
+ "9":
84
+ "1": transformers_trainer
85
+ "12": 0.23.1
86
+ "13": linux-x86_64
87
+ accelerator_config:
88
+ value:
89
+ dispatch_batches: null
90
+ even_batches: true
91
+ gradient_accumulation_kwargs: null
92
+ non_blocking: false
93
+ split_batches: false
94
+ use_seedable_sampler: true
95
+ adam_beta1:
96
+ value: 0.9
97
+ adam_beta2:
98
+ value: 0.999
99
+ adam_epsilon:
100
+ value: 1e-08
101
+ add_cross_attention:
102
+ value: false
103
+ architectures:
104
+ value:
105
+ - Qwen2ForCausalLM
106
+ attention_dropout:
107
+ value: 0
108
+ auto_find_batch_size:
109
+ value: false
110
+ average_tokens_across_devices:
111
+ value: true
112
+ batch_eval_metrics:
113
+ value: false
114
+ bf16:
115
+ value: true
116
+ bf16_full_eval:
117
+ value: false
118
+ bos_token_id:
119
+ value: 151643
120
+ chunk_size_feed_forward:
121
+ value: 0
122
+ cross_attention_hidden_size:
123
+ value: null
124
+ data:
125
+ value:
126
+ custom_template: |-
127
+ ##INSTRUCTION
128
+ {instruction}<|im_end|>
129
+ {input}<|im_end|>
130
+ {output}<|im_end|>
131
+ eval_jsonl: null
132
+ eval_split_ratio: 0.1
133
+ format_type: custom
134
+ input_field: input
135
+ instruction_field: instruction
136
+ max_length: 2048
137
+ num_proc: 4
138
+ output_field: output
139
+ shuffle: true
140
+ system_prompt: |
141
+ You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.
142
+
143
+ ## Output Format
144
+
145
+ ##OUTPUT
146
+ Explain the data flow and why each component must change:
147
+ - Flow: [Input → Processing → Output with arrows]
148
+ - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
149
+ - Explain coupling between components
150
+
151
+ ##SELECT
152
+ modify::crates/path/to/file.rs::impl::ComponentName
153
+ add::crates/another/file.rs::function::AnotherComponent
154
+ <EOS>
155
+
156
+ ## Rules
157
+
158
+ 1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
159
+ 2. Use `::` for nested items: `status::StructName::Type::Name`
160
+ 3. Always explain "must change because" and "without this"
161
+ 3. Types of components: function, struct, enum, impl, trait
162
+ 4. If there is extra information (e.g., enum variants), include that too.
163
+ 5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>
164
+
165
+ ## Example
166
+
167
+ ##TASK
168
+ Add webhook subscription support
169
+
170
+ ##OUTPUT
171
+ The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.
172
+
173
+ ##SELECT
174
+ crates/common_enums/src/enums.rs::EventClass
175
+ crates/common_enums/src/transformers.rs::SubscriptionStatus
176
+ <EOS>
177
+ train_jsonl: sft_dataset.jsonl
178
+ data_seed:
179
+ value: null
180
+ dataloader_drop_last:
181
+ value: false
182
+ dataloader_num_workers:
183
+ value: 0
184
+ dataloader_persistent_workers:
185
+ value: false
186
+ dataloader_pin_memory:
187
+ value: true
188
+ dataloader_prefetch_factor:
189
+ value: null
190
+ ddp_backend:
191
+ value: null
192
+ ddp_broadcast_buffers:
193
+ value: null
194
+ ddp_bucket_cap_mb:
195
+ value: null
196
+ ddp_find_unused_parameters:
197
+ value: null
198
+ ddp_timeout:
199
+ value: 1800
200
+ debug:
201
+ value: []
202
+ decoder_start_token_id:
203
+ value: null
204
+ deepspeed:
205
+ value: null
206
+ disable_tqdm:
207
+ value: false
208
+ do_eval:
209
+ value: true
210
+ do_predict:
211
+ value: false
212
+ do_train:
213
+ value: false
214
+ dtype:
215
+ value: bfloat16
216
+ enable_jit_checkpoint:
217
+ value: false
218
+ eos_token_id:
219
+ value: 151643
220
+ eval_accumulation_steps:
221
+ value: null
222
+ eval_delay:
223
+ value: 0
224
+ eval_do_concat_batches:
225
+ value: true
226
+ eval_on_start:
227
+ value: false
228
+ eval_steps:
229
+ value: 100
230
+ eval_strategy:
231
+ value: steps
232
+ eval_use_gather_object:
233
+ value: false
234
+ finetuning_task:
235
+ value: null
236
+ fp16:
237
+ value: false
238
+ fp16_full_eval:
239
+ value: false
240
+ fsdp:
241
+ value: []
242
+ fsdp_config:
243
+ value:
244
+ min_num_params: 0
245
+ xla: false
246
+ xla_fsdp_grad_ckpt: false
247
+ xla_fsdp_v2: false
248
+ full_determinism:
249
+ value: false
250
+ gradient_accumulation_steps:
251
+ value: 8
252
+ gradient_checkpointing:
253
+ value: false
254
+ gradient_checkpointing_kwargs:
255
+ value: null
256
+ greater_is_better:
257
+ value: false
258
+ group_by_length:
259
+ value: false
260
+ hidden_act:
261
+ value: silu
262
+ hidden_size:
263
+ value: 5120
264
+ hub_always_push:
265
+ value: false
266
+ hub_model_id:
267
+ value: null
268
+ hub_private_repo:
269
+ value: null
270
+ hub_revision:
271
+ value: null
272
+ hub_strategy:
273
+ value: every_save
274
+ hub_token:
275
+ value: <HUB_TOKEN>
276
+ id2label:
277
+ value:
278
+ "0": LABEL_0
279
+ "1": LABEL_1
280
+ ignore_data_skip:
281
+ value: false
282
+ include_for_metrics:
283
+ value: []
284
+ include_num_input_tokens_seen:
285
+ value: "no"
286
+ initializer_range:
287
+ value: 0.02
288
+ intermediate_size:
289
+ value: 13824
290
+ is_decoder:
291
+ value: false
292
+ is_encoder_decoder:
293
+ value: false
294
+ label_names:
295
+ value: null
296
+ label_smoothing_factor:
297
+ value: 0
298
+ label2id:
299
+ value:
300
+ LABEL_0: 0
301
+ LABEL_1: 1
302
+ layer_types:
303
+ value:
304
+ - full_attention
305
+ - full_attention
306
+ - full_attention
307
+ - full_attention
308
+ - full_attention
309
+ - full_attention
310
+ - full_attention
311
+ - full_attention
312
+ - full_attention
313
+ - full_attention
314
+ - full_attention
315
+ - full_attention
316
+ - full_attention
317
+ - full_attention
318
+ - full_attention
319
+ - full_attention
320
+ - full_attention
321
+ - full_attention
322
+ - full_attention
323
+ - full_attention
324
+ - full_attention
325
+ - full_attention
326
+ - full_attention
327
+ - full_attention
328
+ - full_attention
329
+ - full_attention
330
+ - full_attention
331
+ - full_attention
332
+ - full_attention
333
+ - full_attention
334
+ - full_attention
335
+ - full_attention
336
+ - full_attention
337
+ - full_attention
338
+ - full_attention
339
+ - full_attention
340
+ - full_attention
341
+ - full_attention
342
+ - full_attention
343
+ - full_attention
344
+ - full_attention
345
+ - full_attention
346
+ - full_attention
347
+ - full_attention
348
+ - full_attention
349
+ - full_attention
350
+ - full_attention
351
+ - full_attention
352
+ learning_rate:
353
+ value: 0.0002
354
+ length_column_name:
355
+ value: length
356
+ liger_kernel_config:
357
+ value: null
358
+ load_best_model_at_end:
359
+ value: true
360
+ local_rank:
361
+ value: -1
362
+ log_level:
363
+ value: passive
364
+ log_level_replica:
365
+ value: warning
366
+ log_on_each_node:
367
+ value: true
368
+ logging_dir:
369
+ value: null
370
+ logging_first_step:
371
+ value: false
372
+ logging_nan_inf_filter:
373
+ value: true
374
+ logging_steps:
375
+ value: 2
376
+ logging_strategy:
377
+ value: steps
378
+ lr_scheduler_kwargs:
379
+ value: null
380
+ lr_scheduler_type:
381
+ value: cosine
382
+ max_grad_norm:
383
+ value: 1
384
+ max_position_embeddings:
385
+ value: 32768
386
+ max_steps:
387
+ value: -1
388
+ max_window_layers:
389
+ value: 48
390
+ metric_for_best_model:
391
+ value: eval_loss
392
+ model:
393
+ value:
394
+ attn_implementation: null
395
+ base_local_dir: base_model
396
+ bnb_4bit_compute_dtype: bfloat16
397
+ bnb_4bit_quant_type: nf4
398
+ bnb_4bit_use_double_quant: false
399
+ device_map: auto
400
+ repo_id: ./Models/Qwen2.5-Coder-14B-CPT
401
+ revision: null
402
+ tokenizer_use_fast: true
403
+ torch_dtype: bfloat16
404
+ trust_remote_code: true
405
+ use_4bit: false
406
+ model/num_parameters:
407
+ value: 14820365312
408
+ model_type:
409
+ value: qwen2
410
+ neftune_noise_alpha:
411
+ value: null
412
+ num_attention_heads:
413
+ value: 40
414
+ num_hidden_layers:
415
+ value: 48
416
+ num_key_value_heads:
417
+ value: 8
418
+ num_train_epochs:
419
+ value: 6
420
+ optim:
421
+ value: adamw_torch
422
+ optim_args:
423
+ value: null
424
+ optim_target_modules:
425
+ value: null
426
+ output_attentions:
427
+ value: false
428
+ output_dir:
429
+ value: task2file/sft_qwen_14B_v2/checkpoints
430
+ output_hidden_states:
431
+ value: false
432
+ pad_token_id:
433
+ value: null
434
+ parallelism_config:
435
+ value: null
436
+ peft:
437
+ value:
438
+ bias: none
439
+ enabled: true
440
+ lora_alpha: 64
441
+ lora_dropout: 0.05
442
+ r: 32
443
+ target_modules: auto
444
+ peft_config:
445
+ value:
446
+ default:
447
+ alora_invocation_tokens: null
448
+ arrow_config: null
449
+ auto_mapping: null
450
+ base_model_name_or_path: Models/Qwen2.5-Coder-14B-CPT
451
+ bias: none
452
+ corda_config: null
453
+ ensure_weight_tying: false
454
+ eva_config: null
455
+ exclude_modules: null
456
+ fan_in_fan_out: false
457
+ inference_mode: false
458
+ init_lora_weights: true
459
+ layer_replication: null
460
+ layers_pattern: null
461
+ layers_to_transform: null
462
+ lora_alpha: 64
463
+ lora_bias: false
464
+ lora_dropout: 0.05
465
+ megatron_config: null
466
+ megatron_core: megatron.core
467
+ modules_to_save: null
468
+ peft_type: LORA
469
+ peft_version: 0.18.0
470
+ qalora_group_size: 16
471
+ r: 32
472
+ revision: null
473
+ runtime_config:
474
+ ephemeral_gpu_offload: false
475
+ target_modules:
476
+ - o_proj
477
+ - v_proj
478
+ - k_proj
479
+ - q_proj
480
+ target_parameters: null
481
+ task_type: CAUSAL_LM
482
+ trainable_token_indices: null
483
+ use_dora: false
484
+ use_qalora: false
485
+ use_rslora: false
486
+ per_device_eval_batch_size:
487
+ value: 1
488
+ per_device_train_batch_size:
489
+ value: 1
490
+ prediction_loss_only:
491
+ value: false
492
+ prefix:
493
+ value: null
494
+ problem_type:
495
+ value: null
496
+ project:
497
+ value: huggingface
498
+ push_to_hub:
499
+ value: false
500
+ remove_unused_columns:
501
+ value: false
502
+ report_to:
503
+ value:
504
+ - wandb
505
+ restore_callback_states_from_checkpoint:
506
+ value: false
507
+ resume_from_checkpoint:
508
+ value: null
509
+ return_dict:
510
+ value: true
511
+ rms_norm_eps:
512
+ value: 1e-06
513
+ rope_parameters:
514
+ value:
515
+ rope_theta: 1e+06
516
+ rope_type: default
517
+ run_dir:
518
+ value: task2file/sft_qwen_14B_v2
519
+ run_name:
520
+ value: null
521
+ save_on_each_node:
522
+ value: false
523
+ save_only_model:
524
+ value: false
525
+ save_steps:
526
+ value: 500
527
+ save_strategy:
528
+ value: steps
529
+ save_total_limit:
530
+ value: 20
531
+ seed:
532
+ value: 42
533
+ sep_token_id:
534
+ value: null
535
+ skip_memory_metrics:
536
+ value: true
537
+ sliding_window:
538
+ value: null
539
+ task_specific_params:
540
+ value: null
541
+ tf32:
542
+ value: null
543
+ tie_word_embeddings:
544
+ value: false
545
+ tokenizer_class:
546
+ value: null
547
+ torch_compile:
548
+ value: false
549
+ torch_compile_backend:
550
+ value: null
551
+ torch_compile_mode:
552
+ value: null
553
+ torch_empty_cache_steps:
554
+ value: null
555
+ trackio_space_id:
556
+ value: trackio
557
+ train:
558
+ value:
559
+ early_stopping:
560
+ enabled: true
561
+ metric: eval_loss
562
+ min_delta: 0.001
563
+ mode: min
564
+ patience: 5
565
+ eval_steps: 100
566
+ evaluation_strategy: steps
567
+ gradient_accumulation_steps: 8
568
+ gradient_checkpointing: true
569
+ learning_rate: "2e-4"
570
+ load_best_model_at_end: true
571
+ logging_steps: 2
572
+ lr_scheduler_type: cosine
573
+ max_grad_norm: 1
574
+ num_train_epochs: 6
575
+ optim: adamw_torch
576
+ per_device_eval_batch_size: 1
577
+ per_device_train_batch_size: 1
578
+ resume_from_checkpoint: auto
579
+ save_steps: 500
580
+ save_strategy: steps
581
+ save_total_limit: 20
582
+ warmup_ratio: 0.08
583
+ weight_decay: 0
584
+ transformers_version:
585
+ value: 5.0.0.dev0
586
+ use_cache:
587
+ value: false
588
+ use_cpu:
589
+ value: false
590
+ use_liger_kernel:
591
+ value: false
592
+ use_sliding_window:
593
+ value: false
594
+ vocab_size:
595
+ value: 152064
596
+ warmup_ratio:
597
+ value: 0.08
598
+ warmup_steps:
599
+ value: 0.08
600
+ weight_decay:
601
+ value: 0
wandb/run-20251226_181544-upub1jan/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20251226_181544-upub1jan/files/requirements.txt ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exceptiongroup==1.3.1
2
+ wheel==0.45.1
3
+ python-dateutil==2.9.0.post0
4
+ nvidia-ml-py==13.580.82
5
+ huggingface_hub==1.2.3
6
+ idna==3.11
7
+ click==8.3.1
8
+ numpy==2.2.6
9
+ httpx==0.28.1
10
+ tokenizers==0.22.1
11
+ sympy==1.13.1
12
+ yarl==1.22.0
13
+ async-timeout==5.0.1
14
+ datasets==4.4.2
15
+ platformdirs==4.5.1
16
+ nvidia-cuda-cupti-cu12==12.1.105
17
+ nvidia-nvtx-cu12==12.1.105
18
+ smmap==5.0.2
19
+ accelerate==1.12.0
20
+ requests==2.32.5
21
+ aiohttp==3.13.2
22
+ bitsandbytes==0.49.0
23
+ nvidia-cublas-cu12==12.1.3.1
24
+ mpmath==1.3.0
25
+ torchaudio==2.5.1+cu121
26
+ nvidia-cuda-runtime-cu12==12.1.105
27
+ typing-inspection==0.4.2
28
+ GitPython==3.1.45
29
+ xxhash==3.6.0
30
+ nvidia-cusolver-cu12==11.4.5.107
31
+ pydantic_core==2.41.5
32
+ six==1.17.0
33
+ torchvision==0.20.1+cu121
34
+ typing_extensions==4.15.0
35
+ triton==3.1.0
36
+ charset-normalizer==3.4.4
37
+ nvitop==1.6.1
38
+ wandb==0.23.1
39
+ regex==2025.11.3
40
+ pip==25.3
41
+ nvidia-cusparse-cu12==12.1.0.106
42
+ pytz==2025.2
43
+ Jinja2==3.1.6
44
+ psutil==7.2.0
45
+ pillow==12.0.0
46
+ packaging==25.0
47
+ safetensors==0.7.0
48
+ sentry-sdk==2.48.0
49
+ gitdb==4.0.12
50
+ httpcore==1.0.9
51
+ setuptools==80.9.0
52
+ nvidia-cufft-cu12==11.0.2.54
53
+ anyio==4.12.0
54
+ transformers==5.0.0.dev0
55
+ pydantic==2.12.5
56
+ fsspec==2025.10.0
57
+ filelock==3.20.0
58
+ PyYAML==6.0.3
59
+ hf-xet==1.2.0
60
+ nvidia-cudnn-cu12==9.1.0.70
61
+ tqdm==4.67.1
62
+ MarkupSafe==2.1.5
63
+ attrs==25.4.0
64
+ nvidia-cuda-nvrtc-cu12==12.1.105
65
+ peft==0.18.0
66
+ aiohappyeyeballs==2.6.1
67
+ networkx==3.4.2
68
+ nvidia-nvjitlink-cu12==12.9.86
69
+ certifi==2025.11.12
70
+ pyarrow==22.0.0
71
+ dill==0.4.0
72
+ protobuf==6.33.2
73
+ aiosignal==1.4.0
74
+ frozenlist==1.8.0
75
+ urllib3==2.6.2
76
+ propcache==0.4.1
77
+ tzdata==2025.3
78
+ pandas==2.3.3
79
+ annotated-types==0.7.0
80
+ shellingham==1.5.4
81
+ nvidia-nccl-cu12==2.21.5
82
+ multidict==6.7.0
83
+ nvidia-curand-cu12==10.3.2.106
84
+ trl==0.26.2
85
+ torch==2.5.1+cu121
86
+ h11==0.16.0
87
+ multiprocess==0.70.18
88
+ typer-slim==0.21.0
89
+ wheel==0.45.1
90
+ tomli==2.0.1
91
+ autocommand==2.2.2
92
+ jaraco.context==5.3.0
93
+ zipp==3.19.2
94
+ packaging==24.2
95
+ inflect==7.3.1
96
+ typing_extensions==4.12.2
97
+ platformdirs==4.2.2
98
+ jaraco.functools==4.0.1
99
+ jaraco.collections==5.1.0
100
+ jaraco.text==3.12.1
101
+ backports.tarfile==1.2.0
102
+ more-itertools==10.3.0
103
+ importlib_metadata==8.0.0
104
+ typeguard==4.3.0
wandb/run-20251226_181544-upub1jan/files/wandb-metadata.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.12.46+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.12",
4
+ "startedAt": "2025-12-26T18:15:44.765252Z",
5
+ "args": [
6
+ "--config",
7
+ "trainer-kit/SFT-14b/config_instruct.yaml"
8
+ ],
9
+ "program": "/workspace/trainer-kit/SFT-14b/run_instruct.py",
10
+ "codePath": "trainer-kit/SFT-14b/run_instruct.py",
11
+ "codePathLocal": "trainer-kit/SFT-14b/run_instruct.py",
12
+ "email": "shaiksirajuddin9949@gmail.com",
13
+ "root": "task2file/sft_qwen_14B_v2",
14
+ "host": "a100-2gpu-shell-session-757d587799-mfdvv",
15
+ "executable": "/workspace/llm_finetuning_env/bin/python",
16
+ "cpu_count": 12,
17
+ "cpu_count_logical": 24,
18
+ "gpu": "NVIDIA A100-SXM4-80GB",
19
+ "gpu_count": 2,
20
+ "disk": {
21
+ "/": {
22
+ "total": "791251738624",
23
+ "used": "392925650944"
24
+ }
25
+ },
26
+ "memory": {
27
+ "total": "359047892992"
28
+ },
29
+ "gpu_nvidia": [
30
+ {
31
+ "name": "NVIDIA A100-SXM4-80GB",
32
+ "memoryTotal": "85899345920",
33
+ "cudaCores": 6912,
34
+ "architecture": "Ampere",
35
+ "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba"
36
+ },
37
+ {
38
+ "name": "NVIDIA A100-SXM4-80GB",
39
+ "memoryTotal": "85899345920",
40
+ "cudaCores": 6912,
41
+ "architecture": "Ampere",
42
+ "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40"
43
+ }
44
+ ],
45
+ "cudaVersion": "13.0",
46
+ "writerId": "ba9dsvvs7npkm5vvx6733495pp0yghz4"
47
+ }
wandb/run-20251226_181544-upub1jan/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train_steps_per_second":0.192,"train_loss":0.8076860591944526,"eval/steps_per_second":3.126,"_runtime":74806,"train/learning_rate":0.00015806910080135527,"train_runtime":73982.3221,"total_flos":3.049019256156893e+18,"eval/runtime":674.048,"train/epoch":2.151898734177215,"train/global_step":5100,"eval/loss":0.7600579857826233,"_step":2602,"_wandb":{"runtime":74806},"train/loss":0.6138747930526733,"train/grad_norm":0.48821282386779785,"eval/samples_per_second":3.126,"_timestamp":1.766847687515407e+09,"train_samples_per_second":1.538}
wandb/run-20251226_181544-upub1jan/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T18:15:44.85463749Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp8sddanmm/port-194421.txt","pid":194421,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-26T18:15:44.855355082Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":194421}
3
+ {"time":"2025-12-26T18:15:44.85534748Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-194421-194516-4014985148/socket","Net":"unix"}}
4
+ {"time":"2025-12-26T18:15:45.035681883Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-26T18:15:45.043999704Z","level":"INFO","msg":"handleInformInit: received","streamId":"upub1jan","id":"1(@)"}
6
+ {"time":"2025-12-26T18:15:45.212475024Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"upub1jan","id":"1(@)"}
7
+ {"time":"2025-12-27T15:02:32.280521859Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"upub1jan","id":"1(@)"}
8
+ {"time":"2025-12-27T15:02:32.281560524Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"upub1jan","id":"1(@)"}
9
+ {"time":"2025-12-27T15:02:32.336985081Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-12-27T15:02:32.337047862Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-12-27T15:02:32.337066801Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-12-27T15:02:32.337073486Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-12-27T15:02:32.33713325Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2025-12-27T15:02:32.337139996Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2025-12-27T15:02:32.337231073Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-194421-194516-4014985148/socket","Net":"unix"}}
16
+ {"time":"2025-12-27T15:02:32.337292822Z","level":"INFO","msg":"server is closed"}
wandb/run-20251226_181544-upub1jan/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T18:15:45.044149374Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-26T18:15:45.212148231Z","level":"INFO","msg":"stream: created new stream","id":"upub1jan"}
3
+ {"time":"2025-12-26T18:15:45.212312297Z","level":"INFO","msg":"handler: started","stream_id":"upub1jan"}
4
+ {"time":"2025-12-26T18:15:45.212463318Z","level":"INFO","msg":"stream: started","id":"upub1jan"}
5
+ {"time":"2025-12-26T18:15:45.212498387Z","level":"INFO","msg":"writer: started","stream_id":"upub1jan"}
6
+ {"time":"2025-12-26T18:15:45.212503642Z","level":"INFO","msg":"sender: started","stream_id":"upub1jan"}
7
+ {"time":"2025-12-27T15:02:32.177112089Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
+ {"time":"2025-12-27T15:02:32.275585886Z","level":"INFO","msg":"handler: operation stats","stats":{}}
9
+ {"time":"2025-12-27T15:02:32.280819216Z","level":"INFO","msg":"stream: closing","id":"upub1jan"}
10
+ {"time":"2025-12-27T15:02:32.280845154Z","level":"INFO","msg":"handler: closed","stream_id":"upub1jan"}
11
+ {"time":"2025-12-27T15:02:32.280896164Z","level":"INFO","msg":"sender: closed","stream_id":"upub1jan"}
12
+ {"time":"2025-12-27T15:02:32.280915121Z","level":"INFO","msg":"stream: closed","id":"upub1jan"}
wandb/run-20251226_181544-upub1jan/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-26 18:15:44,766 INFO MainThread:194421 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Configure stats pid to 194421
3
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings
5
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_qwen_14B_v2/wandb/run-20251226_181544-upub1jan/logs/debug.log
7
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_qwen_14B_v2/wandb/run-20251226_181544-upub1jan/logs/debug-internal.log
8
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'model': {'repo_id': './Models/Qwen2.5-Coder-14B-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n<EOS>\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 32, 'lora_alpha': 64, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '2e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_qwen_14B_v2', '_wandb': {}}
11
+ 2025-12-26 18:15:44,767 INFO MainThread:194421 [wandb_init.py:init():889] starting backend
12
+ 2025-12-26 18:15:45,035 INFO MainThread:194421 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-26 18:15:45,040 INFO MainThread:194421 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-26 18:15:45,044 INFO MainThread:194421 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-26 18:15:45,045 INFO MainThread:194421 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-26 18:15:45,420 INFO MainThread:194421 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-26 18:15:45,537 INFO MainThread:194421 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-26 18:15:45,542 INFO MainThread:194421 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 18:17:10,652 INFO MainThread:194421 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': 'Models/Qwen2.5-Coder-14B-CPT', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['o_proj', 'v_proj', 'k_proj', 'q_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': 'Models/Qwen2.5-Coder-14B-CPT', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'task2file/sft_qwen_14B_v2/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0002, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.08, 'warmup_steps': 0.08, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True}
23
+ 2025-12-26 18:17:10,660 INFO MainThread:194421 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14820365312 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7ed24f0556c0>>
24
+ 2025-12-26 18:17:10,660 INFO MainThread:194421 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14820365312 None
25
+ 2025-12-27 15:02:31,903 INFO MainThread:194421 [wandb_run.py:_finish():2287] finishing run sirajuddin-shaik-007/sft-training/upub1jan
26
+ 2025-12-27 15:02:31,905 INFO MainThread:194421 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
27
+ 2025-12-27 15:02:31,906 INFO MainThread:194421 [wandb_run.py:_restore():2468] restore
28
+ 2025-12-27 15:02:31,906 INFO MainThread:194421 [wandb_run.py:_restore():2474] restore done
29
+ 2025-12-27 15:02:32,279 INFO MainThread:194421 [wandb_run.py:_footer_sync_info():3862] logging synced files