Petri99 committed on
Commit
d269c20
·
verified ·
1 Parent(s): 9f72192

Training in progress, step 4500

Browse files
Files changed (42) hide show
  1. model.safetensors +1 -1
  2. run-2/checkpoint-1000/model.safetensors +1 -1
  3. run-2/checkpoint-1000/optimizer.pt +1 -1
  4. run-2/checkpoint-1000/scheduler.pt +1 -1
  5. run-2/checkpoint-1000/trainer_state.json +25 -25
  6. run-2/checkpoint-1000/training_args.bin +1 -1
  7. run-2/checkpoint-1500/model.safetensors +1 -1
  8. run-2/checkpoint-1500/optimizer.pt +1 -1
  9. run-2/checkpoint-1500/scheduler.pt +1 -1
  10. run-2/checkpoint-1500/trainer_state.json +31 -31
  11. run-2/checkpoint-1500/training_args.bin +1 -1
  12. run-2/checkpoint-2000/model.safetensors +1 -1
  13. run-2/checkpoint-2000/optimizer.pt +1 -1
  14. run-2/checkpoint-2000/scheduler.pt +1 -1
  15. run-2/checkpoint-2000/trainer_state.json +45 -45
  16. run-2/checkpoint-2000/training_args.bin +1 -1
  17. run-2/checkpoint-2500/model.safetensors +1 -1
  18. run-2/checkpoint-2500/optimizer.pt +1 -1
  19. run-2/checkpoint-2500/scheduler.pt +1 -1
  20. run-2/checkpoint-2500/trainer_state.json +51 -51
  21. run-2/checkpoint-2500/training_args.bin +1 -1
  22. run-2/checkpoint-3000/model.safetensors +1 -1
  23. run-2/checkpoint-3000/optimizer.pt +1 -1
  24. run-2/checkpoint-3000/scheduler.pt +1 -1
  25. run-2/checkpoint-3000/trainer_state.json +65 -65
  26. run-2/checkpoint-3000/training_args.bin +1 -1
  27. run-2/checkpoint-3500/model.safetensors +1 -1
  28. run-2/checkpoint-3500/optimizer.pt +1 -1
  29. run-2/checkpoint-3500/scheduler.pt +1 -1
  30. run-2/checkpoint-3500/trainer_state.json +71 -71
  31. run-2/checkpoint-3500/training_args.bin +1 -1
  32. run-2/checkpoint-4000/model.safetensors +1 -1
  33. run-2/checkpoint-4000/optimizer.pt +1 -1
  34. run-2/checkpoint-4000/scheduler.pt +1 -1
  35. run-2/checkpoint-4000/trainer_state.json +85 -85
  36. run-2/checkpoint-4000/training_args.bin +1 -1
  37. run-2/checkpoint-4500/model.safetensors +1 -1
  38. run-2/checkpoint-4500/optimizer.pt +1 -1
  39. run-2/checkpoint-4500/scheduler.pt +1 -1
  40. run-2/checkpoint-4500/trainer_state.json +91 -91
  41. run-2/checkpoint-4500/training_args.bin +1 -1
  42. runs/Oct23_11-35-49_ec65ff5e83b6/events.out.tfevents.1761223404.ec65ff5e83b6.4546.3 +2 -2
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:427a451a2bfb7e16dbf8baa6495252b971bce9eaa780bcf2cc78cf4bfd80219c
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bd48eacc76940b767b8081cd817fd1c37797da71518ea157e83924dd4e29138
3
  size 598898116
run-2/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:252587b419a7657532c60396af051177bc1b295f2374f28d11b88bf5106578cb
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f28288ac778bb6f024ce4a73fcc5092a2413b9eeb7560420472810b120686f57
3
  size 598898116
run-2/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe42fc94266cc405f1d18e43d9ac72a32e9038140eb42bb15d2aaafc179d8d67
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b456041c32b3168b2f02bb4b47d0e388ca1212cf2326f20b01c76d5e280799
3
  size 1197886411
run-2/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b14bcab6a32d4c606a8987a9aa27253db51004ed24b536392eede06025d89b9b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbbbb32195293dd885e75215e07e363d7c22cc73f1b1d0c7c508696c26691417
3
  size 1465
run-2/checkpoint-1000/trainer_state.json CHANGED
@@ -11,53 +11,53 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  }
56
  ],
57
  "logging_steps": 200,
58
- "max_steps": 6678,
59
  "num_input_tokens_seen": 0,
60
- "num_train_epochs": 7,
61
  "save_steps": 500,
62
  "stateful_callbacks": {
63
  "TrainerControl": {
@@ -75,8 +75,8 @@
75
  "train_batch_size": 16,
76
  "trial_name": null,
77
  "trial_params": {
78
- "alpha": 0.8225267279347407,
79
- "num_train_epochs": 7,
80
- "temperature": 9
81
  }
82
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  }
56
  ],
57
  "logging_steps": 200,
58
+ "max_steps": 4770,
59
  "num_input_tokens_seen": 0,
60
+ "num_train_epochs": 5,
61
  "save_steps": 500,
62
  "stateful_callbacks": {
63
  "TrainerControl": {
 
75
  "train_batch_size": 16,
76
  "trial_name": null,
77
  "trial_params": {
78
+ "alpha": 0.7896637522578164,
79
+ "num_train_epochs": 5,
80
+ "temperature": 4
81
  }
82
  }
run-2/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3915e3a6c08b479a01866b9b7286613ca3eab642a72d1ea030f7e4dc0e9af7f2
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1905fe41fb81fe4b5d38b9e652c318ffbd46e23986515a5e820ed291c6f988d0
3
  size 598898116
run-2/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:897e2c897234a90c857a49fda7ec2e3a0a59c2669834f78571d8cd3a348183a1
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e73e459986df9de94cad4322a98423855bfa5bfddae92b272623ac8d4e3f3e0
3
  size 1197886411
run-2/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ccecd3a82a1022f6770b975fae3e619bfd11ed43f7394892908565468b74aa4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abdaff917f8d0d0fa51c87b92a586474c34f6688f66a58df19a2d3f2ab5de74c
3
  size 1465
run-2/checkpoint-1500/trainer_state.json CHANGED
@@ -11,67 +11,67 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  }
70
  ],
71
  "logging_steps": 200,
72
- "max_steps": 6678,
73
  "num_input_tokens_seen": 0,
74
- "num_train_epochs": 7,
75
  "save_steps": 500,
76
  "stateful_callbacks": {
77
  "TrainerControl": {
@@ -89,8 +89,8 @@
89
  "train_batch_size": 16,
90
  "trial_name": null,
91
  "trial_params": {
92
- "alpha": 0.8225267279347407,
93
- "num_train_epochs": 7,
94
- "temperature": 9
95
  }
96
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  }
70
  ],
71
  "logging_steps": 200,
72
+ "max_steps": 4770,
73
  "num_input_tokens_seen": 0,
74
+ "num_train_epochs": 5,
75
  "save_steps": 500,
76
  "stateful_callbacks": {
77
  "TrainerControl": {
 
89
  "train_batch_size": 16,
90
  "trial_name": null,
91
  "trial_params": {
92
+ "alpha": 0.7896637522578164,
93
+ "num_train_epochs": 5,
94
+ "temperature": 4
95
  }
96
  }
run-2/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42c6bfda229929d7019d5fd7929bbd0f1895cf4ad5ac99076d934bb2ae93cd9f
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5089196f4a13dded45220594cc91ac3c2e254c5da9950d2fb32f1376046ee70f
3
  size 598898116
run-2/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:682f16cc8d06518b950f72bb60ac4b8135e034b42ce297e492fe8594dd6df5d5
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b3b800a7fd50bb06d4ac7cc561fbd4b01e9318508644ceca01fec8f9d5b9a2c
3
  size 1197886411
run-2/checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:510783aaece3fd510395ca4990e4058484c69ee310ae76f15dbc901d1326aca0
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad6299911574d785b66d71f02acef1c78edd67929fbb86d1e12dcb5b017c4eae
3
  size 1465
run-2/checkpoint-2000/trainer_state.json CHANGED
@@ -11,97 +11,97 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
- "grad_norm": 0.3010256290435791,
73
- "learning_rate": 2.2816711590296497e-05,
74
- "loss": 0.0569,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
- "grad_norm": 22.237369537353516,
80
- "learning_rate": 2.1918238993710692e-05,
81
- "loss": 0.0672,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
- "eval_accuracy": 0.9487096774193549,
87
- "eval_loss": 0.22349213063716888,
88
- "eval_runtime": 5.967,
89
- "eval_samples_per_second": 519.525,
90
- "eval_steps_per_second": 32.512,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
- "grad_norm": 0.024833250790834427,
96
- "learning_rate": 2.1019766397124887e-05,
97
- "loss": 0.0422,
98
  "step": 2000
99
  }
100
  ],
101
  "logging_steps": 200,
102
- "max_steps": 6678,
103
  "num_input_tokens_seen": 0,
104
- "num_train_epochs": 7,
105
  "save_steps": 500,
106
  "stateful_callbacks": {
107
  "TrainerControl": {
@@ -119,8 +119,8 @@
119
  "train_batch_size": 16,
120
  "trial_name": null,
121
  "trial_params": {
122
- "alpha": 0.8225267279347407,
123
- "num_train_epochs": 7,
124
- "temperature": 9
125
  }
126
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
+ "grad_norm": 0.17968730628490448,
73
+ "learning_rate": 1.9943396226415094e-05,
74
+ "loss": 0.0487,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
+ "grad_norm": 17.465627670288086,
80
+ "learning_rate": 1.868553459119497e-05,
81
+ "loss": 0.0556,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
+ "eval_accuracy": 0.9496774193548387,
87
+ "eval_loss": 0.23183350265026093,
88
+ "eval_runtime": 6.1991,
89
+ "eval_samples_per_second": 500.07,
90
+ "eval_steps_per_second": 31.295,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
+ "grad_norm": 0.020424585789442062,
96
+ "learning_rate": 1.742767295597484e-05,
97
+ "loss": 0.0409,
98
  "step": 2000
99
  }
100
  ],
101
  "logging_steps": 200,
102
+ "max_steps": 4770,
103
  "num_input_tokens_seen": 0,
104
+ "num_train_epochs": 5,
105
  "save_steps": 500,
106
  "stateful_callbacks": {
107
  "TrainerControl": {
 
119
  "train_batch_size": 16,
120
  "trial_name": null,
121
  "trial_params": {
122
+ "alpha": 0.7896637522578164,
123
+ "num_train_epochs": 5,
124
+ "temperature": 4
125
  }
126
  }
run-2/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8a82a47d173216c2b9370bebc4c54b1526a89600167c72f17e5bebf5eb473bc
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1d0bbc4c63e7a063b5ef81cdfded47d4f3ed44ed2c1ab894a511efd6be31ead
3
  size 598898116
run-2/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3372a7012a4935ce2d35e2ae16751026df475f2d30b9a0c55df234c60a20eec
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:250d40838bad8850a5df6cc228a0857f752d202f5661c1dac483641e146782ac
3
  size 1197886411
run-2/checkpoint-2500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aecd22f284b4345050135631e140c873ba31e3dd7673266e8c31c5d23442d7fa
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7b695bc3dfc76643ff10ec841e274d6113ed0c2a13104892b36fd692fc3b1cc
3
  size 1465
run-2/checkpoint-2500/trainer_state.json CHANGED
@@ -11,111 +11,111 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
- "grad_norm": 0.3010256290435791,
73
- "learning_rate": 2.2816711590296497e-05,
74
- "loss": 0.0569,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
- "grad_norm": 22.237369537353516,
80
- "learning_rate": 2.1918238993710692e-05,
81
- "loss": 0.0672,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
- "eval_accuracy": 0.9487096774193549,
87
- "eval_loss": 0.22349213063716888,
88
- "eval_runtime": 5.967,
89
- "eval_samples_per_second": 519.525,
90
- "eval_steps_per_second": 32.512,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
- "grad_norm": 0.024833250790834427,
96
- "learning_rate": 2.1019766397124887e-05,
97
- "loss": 0.0422,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
- "grad_norm": 0.0831853449344635,
103
- "learning_rate": 2.0121293800539083e-05,
104
- "loss": 0.0181,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
- "grad_norm": 0.40406087040901184,
110
- "learning_rate": 1.922282120395328e-05,
111
- "loss": 0.0227,
112
  "step": 2400
113
  }
114
  ],
115
  "logging_steps": 200,
116
- "max_steps": 6678,
117
  "num_input_tokens_seen": 0,
118
- "num_train_epochs": 7,
119
  "save_steps": 500,
120
  "stateful_callbacks": {
121
  "TrainerControl": {
@@ -133,8 +133,8 @@
133
  "train_batch_size": 16,
134
  "trial_name": null,
135
  "trial_params": {
136
- "alpha": 0.8225267279347407,
137
- "num_train_epochs": 7,
138
- "temperature": 9
139
  }
140
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
+ "grad_norm": 0.17968730628490448,
73
+ "learning_rate": 1.9943396226415094e-05,
74
+ "loss": 0.0487,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
+ "grad_norm": 17.465627670288086,
80
+ "learning_rate": 1.868553459119497e-05,
81
+ "loss": 0.0556,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
+ "eval_accuracy": 0.9496774193548387,
87
+ "eval_loss": 0.23183350265026093,
88
+ "eval_runtime": 6.1991,
89
+ "eval_samples_per_second": 500.07,
90
+ "eval_steps_per_second": 31.295,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
+ "grad_norm": 0.020424585789442062,
96
+ "learning_rate": 1.742767295597484e-05,
97
+ "loss": 0.0409,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
+ "grad_norm": 0.036295052617788315,
103
+ "learning_rate": 1.6169811320754717e-05,
104
+ "loss": 0.0135,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
+ "grad_norm": 6.396214008331299,
110
+ "learning_rate": 1.491194968553459e-05,
111
+ "loss": 0.0199,
112
  "step": 2400
113
  }
114
  ],
115
  "logging_steps": 200,
116
+ "max_steps": 4770,
117
  "num_input_tokens_seen": 0,
118
+ "num_train_epochs": 5,
119
  "save_steps": 500,
120
  "stateful_callbacks": {
121
  "TrainerControl": {
 
133
  "train_batch_size": 16,
134
  "trial_name": null,
135
  "trial_params": {
136
+ "alpha": 0.7896637522578164,
137
+ "num_train_epochs": 5,
138
+ "temperature": 4
139
  }
140
  }
run-2/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-3000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:421e6e1d1d91430612090fb1be9d5e8333bf379a64e5d667df7c5530a4633d42
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6144b67e0c970718b1d25507b5a35543beaa7bd44ad4eaa02f23769e108f4d40
3
  size 598898116
run-2/checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08ebda7b8eec1ded62c9fb938c186fa33249a9851bf4d670e9afb02acfe6ecc2
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24339245e57fbea92595807384396c41fe482046841a3d51a43b5c8d6a0daf05
3
  size 1197886411
run-2/checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43e6f223028ac1a86229136858f55c765f0b15a620a310ae54accd6015fa868b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fae9e5d4cea97d8b9da2a50e7f1db2a81314b175343651f02628e7aadce229e
3
  size 1465
run-2/checkpoint-3000/trainer_state.json CHANGED
@@ -11,141 +11,141 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
- "grad_norm": 0.3010256290435791,
73
- "learning_rate": 2.2816711590296497e-05,
74
- "loss": 0.0569,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
- "grad_norm": 22.237369537353516,
80
- "learning_rate": 2.1918238993710692e-05,
81
- "loss": 0.0672,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
- "eval_accuracy": 0.9487096774193549,
87
- "eval_loss": 0.22349213063716888,
88
- "eval_runtime": 5.967,
89
- "eval_samples_per_second": 519.525,
90
- "eval_steps_per_second": 32.512,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
- "grad_norm": 0.024833250790834427,
96
- "learning_rate": 2.1019766397124887e-05,
97
- "loss": 0.0422,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
- "grad_norm": 0.0831853449344635,
103
- "learning_rate": 2.0121293800539083e-05,
104
- "loss": 0.0181,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
- "grad_norm": 0.40406087040901184,
110
- "learning_rate": 1.922282120395328e-05,
111
- "loss": 0.0227,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
- "grad_norm": 0.11021151393651962,
117
- "learning_rate": 1.8324348607367477e-05,
118
- "loss": 0.0109,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
- "grad_norm": 0.11338778585195541,
124
- "learning_rate": 1.7425876010781672e-05,
125
- "loss": 0.0124,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
- "eval_accuracy": 0.9570967741935484,
131
- "eval_loss": 0.19864021241664886,
132
- "eval_runtime": 5.9289,
133
- "eval_samples_per_second": 522.867,
134
- "eval_steps_per_second": 32.721,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
- "grad_norm": 0.006569345481693745,
140
- "learning_rate": 1.6527403414195867e-05,
141
- "loss": 0.0124,
142
  "step": 3000
143
  }
144
  ],
145
  "logging_steps": 200,
146
- "max_steps": 6678,
147
  "num_input_tokens_seen": 0,
148
- "num_train_epochs": 7,
149
  "save_steps": 500,
150
  "stateful_callbacks": {
151
  "TrainerControl": {
@@ -163,8 +163,8 @@
163
  "train_batch_size": 16,
164
  "trial_name": null,
165
  "trial_params": {
166
- "alpha": 0.8225267279347407,
167
- "num_train_epochs": 7,
168
- "temperature": 9
169
  }
170
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
+ "grad_norm": 0.17968730628490448,
73
+ "learning_rate": 1.9943396226415094e-05,
74
+ "loss": 0.0487,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
+ "grad_norm": 17.465627670288086,
80
+ "learning_rate": 1.868553459119497e-05,
81
+ "loss": 0.0556,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
+ "eval_accuracy": 0.9496774193548387,
87
+ "eval_loss": 0.23183350265026093,
88
+ "eval_runtime": 6.1991,
89
+ "eval_samples_per_second": 500.07,
90
+ "eval_steps_per_second": 31.295,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
+ "grad_norm": 0.020424585789442062,
96
+ "learning_rate": 1.742767295597484e-05,
97
+ "loss": 0.0409,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
+ "grad_norm": 0.036295052617788315,
103
+ "learning_rate": 1.6169811320754717e-05,
104
+ "loss": 0.0135,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
+ "grad_norm": 6.396214008331299,
110
+ "learning_rate": 1.491194968553459e-05,
111
+ "loss": 0.0199,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
+ "grad_norm": 0.07038699835538864,
117
+ "learning_rate": 1.3654088050314464e-05,
118
+ "loss": 0.0084,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
+ "grad_norm": 0.09982823580503464,
124
+ "learning_rate": 1.239622641509434e-05,
125
+ "loss": 0.008,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
+ "eval_accuracy": 0.9609677419354838,
131
+ "eval_loss": 0.19234703481197357,
132
+ "eval_runtime": 8.1342,
133
+ "eval_samples_per_second": 381.109,
134
+ "eval_steps_per_second": 23.85,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
+ "grad_norm": 0.007687446661293507,
140
+ "learning_rate": 1.1138364779874214e-05,
141
+ "loss": 0.0041,
142
  "step": 3000
143
  }
144
  ],
145
  "logging_steps": 200,
146
+ "max_steps": 4770,
147
  "num_input_tokens_seen": 0,
148
+ "num_train_epochs": 5,
149
  "save_steps": 500,
150
  "stateful_callbacks": {
151
  "TrainerControl": {
 
163
  "train_batch_size": 16,
164
  "trial_name": null,
165
  "trial_params": {
166
+ "alpha": 0.7896637522578164,
167
+ "num_train_epochs": 5,
168
+ "temperature": 4
169
  }
170
  }
run-2/checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-3500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58ecae6961514268e83d2f5fd0cb8e27363b2b46050ce222f6dd900bd14fe93f
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2687be008f3100c5d4839c8c99c609f93d017b4216060a8dc8b46d2ab36c5011
3
  size 598898116
run-2/checkpoint-3500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01ef5da9fa079769162dab0ad2ac147f86075a718091c0483747e7dbbddcf101
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f171274b6618bef8a7a4e12bfdc1d5f37e19a48112ee1d8bcf1f49ea8f0c74
3
  size 1197886411
run-2/checkpoint-3500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f92df06f6106d35d079c72ede9546a796410c219fbd8f2d5cd73439bc238b93d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe421286099f53ea00a29a2d1d78d57e96dcfee55d59a8d0b496851ddc6108f0
3
  size 1465
run-2/checkpoint-3500/trainer_state.json CHANGED
@@ -11,155 +11,155 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
- "grad_norm": 0.3010256290435791,
73
- "learning_rate": 2.2816711590296497e-05,
74
- "loss": 0.0569,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
- "grad_norm": 22.237369537353516,
80
- "learning_rate": 2.1918238993710692e-05,
81
- "loss": 0.0672,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
- "eval_accuracy": 0.9487096774193549,
87
- "eval_loss": 0.22349213063716888,
88
- "eval_runtime": 5.967,
89
- "eval_samples_per_second": 519.525,
90
- "eval_steps_per_second": 32.512,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
- "grad_norm": 0.024833250790834427,
96
- "learning_rate": 2.1019766397124887e-05,
97
- "loss": 0.0422,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
- "grad_norm": 0.0831853449344635,
103
- "learning_rate": 2.0121293800539083e-05,
104
- "loss": 0.0181,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
- "grad_norm": 0.40406087040901184,
110
- "learning_rate": 1.922282120395328e-05,
111
- "loss": 0.0227,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
- "grad_norm": 0.11021151393651962,
117
- "learning_rate": 1.8324348607367477e-05,
118
- "loss": 0.0109,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
- "grad_norm": 0.11338778585195541,
124
- "learning_rate": 1.7425876010781672e-05,
125
- "loss": 0.0124,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
- "eval_accuracy": 0.9570967741935484,
131
- "eval_loss": 0.19864021241664886,
132
- "eval_runtime": 5.9289,
133
- "eval_samples_per_second": 522.867,
134
- "eval_steps_per_second": 32.721,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
- "grad_norm": 0.006569345481693745,
140
- "learning_rate": 1.6527403414195867e-05,
141
- "loss": 0.0124,
142
  "step": 3000
143
  },
144
  {
145
  "epoch": 3.3542976939203353,
146
- "grad_norm": 0.11313613504171371,
147
- "learning_rate": 1.5628930817610063e-05,
148
- "loss": 0.0048,
149
  "step": 3200
150
  },
151
  {
152
  "epoch": 3.5639412997903563,
153
- "grad_norm": 0.007506044581532478,
154
- "learning_rate": 1.473045822102426e-05,
155
- "loss": 0.0027,
156
  "step": 3400
157
  }
158
  ],
159
  "logging_steps": 200,
160
- "max_steps": 6678,
161
  "num_input_tokens_seen": 0,
162
- "num_train_epochs": 7,
163
  "save_steps": 500,
164
  "stateful_callbacks": {
165
  "TrainerControl": {
@@ -177,8 +177,8 @@
177
  "train_batch_size": 16,
178
  "trial_name": null,
179
  "trial_params": {
180
- "alpha": 0.8225267279347407,
181
- "num_train_epochs": 7,
182
- "temperature": 9
183
  }
184
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
+ "grad_norm": 0.17968730628490448,
73
+ "learning_rate": 1.9943396226415094e-05,
74
+ "loss": 0.0487,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
+ "grad_norm": 17.465627670288086,
80
+ "learning_rate": 1.868553459119497e-05,
81
+ "loss": 0.0556,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
+ "eval_accuracy": 0.9496774193548387,
87
+ "eval_loss": 0.23183350265026093,
88
+ "eval_runtime": 6.1991,
89
+ "eval_samples_per_second": 500.07,
90
+ "eval_steps_per_second": 31.295,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
+ "grad_norm": 0.020424585789442062,
96
+ "learning_rate": 1.742767295597484e-05,
97
+ "loss": 0.0409,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
+ "grad_norm": 0.036295052617788315,
103
+ "learning_rate": 1.6169811320754717e-05,
104
+ "loss": 0.0135,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
+ "grad_norm": 6.396214008331299,
110
+ "learning_rate": 1.491194968553459e-05,
111
+ "loss": 0.0199,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
+ "grad_norm": 0.07038699835538864,
117
+ "learning_rate": 1.3654088050314464e-05,
118
+ "loss": 0.0084,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
+ "grad_norm": 0.09982823580503464,
124
+ "learning_rate": 1.239622641509434e-05,
125
+ "loss": 0.008,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
+ "eval_accuracy": 0.9609677419354838,
131
+ "eval_loss": 0.19234703481197357,
132
+ "eval_runtime": 8.1342,
133
+ "eval_samples_per_second": 381.109,
134
+ "eval_steps_per_second": 23.85,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
+ "grad_norm": 0.007687446661293507,
140
+ "learning_rate": 1.1138364779874214e-05,
141
+ "loss": 0.0041,
142
  "step": 3000
143
  },
144
  {
145
  "epoch": 3.3542976939203353,
146
+ "grad_norm": 2.6632392406463623,
147
+ "learning_rate": 9.880503144654089e-06,
148
+ "loss": 0.0055,
149
  "step": 3200
150
  },
151
  {
152
  "epoch": 3.5639412997903563,
153
+ "grad_norm": 0.014160693623125553,
154
+ "learning_rate": 8.622641509433963e-06,
155
+ "loss": 0.002,
156
  "step": 3400
157
  }
158
  ],
159
  "logging_steps": 200,
160
+ "max_steps": 4770,
161
  "num_input_tokens_seen": 0,
162
+ "num_train_epochs": 5,
163
  "save_steps": 500,
164
  "stateful_callbacks": {
165
  "TrainerControl": {
 
177
  "train_batch_size": 16,
178
  "trial_name": null,
179
  "trial_params": {
180
+ "alpha": 0.7896637522578164,
181
+ "num_train_epochs": 5,
182
+ "temperature": 4
183
  }
184
  }
run-2/checkpoint-3500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-4000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c1122692227db61377f6408a01441b8a26953e4222c28dc855d05c8b72cc3e9
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e85cb2426b5b984e4089fb5675451860f62cc1aaeef32ac93b49caa68908ba4
3
  size 598898116
run-2/checkpoint-4000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc44051023cbe38abbdd62e72590b373f04116c122fdfb8cae6e9f5550b9aba1
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cffde57c8ec2e5043b3899eb91a9379813dc4ad6ea80c57fb3555a093f7453a
3
  size 1197886411
run-2/checkpoint-4000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70f3bb2e5d2ab9fc873b44953a5bb880c518edc68b4dc5867b3bdfeba44d8b2c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8ebfbc57a5ee18fb08554d9eccaa7709c1407d3c3ebd69b471935f08a8b07fe
3
  size 1465
run-2/checkpoint-4000/trainer_state.json CHANGED
@@ -11,185 +11,185 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
- "grad_norm": 0.3010256290435791,
73
- "learning_rate": 2.2816711590296497e-05,
74
- "loss": 0.0569,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
- "grad_norm": 22.237369537353516,
80
- "learning_rate": 2.1918238993710692e-05,
81
- "loss": 0.0672,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
- "eval_accuracy": 0.9487096774193549,
87
- "eval_loss": 0.22349213063716888,
88
- "eval_runtime": 5.967,
89
- "eval_samples_per_second": 519.525,
90
- "eval_steps_per_second": 32.512,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
- "grad_norm": 0.024833250790834427,
96
- "learning_rate": 2.1019766397124887e-05,
97
- "loss": 0.0422,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
- "grad_norm": 0.0831853449344635,
103
- "learning_rate": 2.0121293800539083e-05,
104
- "loss": 0.0181,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
- "grad_norm": 0.40406087040901184,
110
- "learning_rate": 1.922282120395328e-05,
111
- "loss": 0.0227,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
- "grad_norm": 0.11021151393651962,
117
- "learning_rate": 1.8324348607367477e-05,
118
- "loss": 0.0109,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
- "grad_norm": 0.11338778585195541,
124
- "learning_rate": 1.7425876010781672e-05,
125
- "loss": 0.0124,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
- "eval_accuracy": 0.9570967741935484,
131
- "eval_loss": 0.19864021241664886,
132
- "eval_runtime": 5.9289,
133
- "eval_samples_per_second": 522.867,
134
- "eval_steps_per_second": 32.721,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
- "grad_norm": 0.006569345481693745,
140
- "learning_rate": 1.6527403414195867e-05,
141
- "loss": 0.0124,
142
  "step": 3000
143
  },
144
  {
145
  "epoch": 3.3542976939203353,
146
- "grad_norm": 0.11313613504171371,
147
- "learning_rate": 1.5628930817610063e-05,
148
- "loss": 0.0048,
149
  "step": 3200
150
  },
151
  {
152
  "epoch": 3.5639412997903563,
153
- "grad_norm": 0.007506044581532478,
154
- "learning_rate": 1.473045822102426e-05,
155
- "loss": 0.0027,
156
  "step": 3400
157
  },
158
  {
159
  "epoch": 3.7735849056603774,
160
- "grad_norm": 1.1819171905517578,
161
- "learning_rate": 1.3831985624438455e-05,
162
- "loss": 0.0084,
163
  "step": 3600
164
  },
165
  {
166
  "epoch": 3.9832285115303985,
167
- "grad_norm": 0.009302722290158272,
168
- "learning_rate": 1.2933513027852652e-05,
169
- "loss": 0.0161,
170
  "step": 3800
171
  },
172
  {
173
  "epoch": 4.0,
174
- "eval_accuracy": 0.9580645161290322,
175
- "eval_loss": 0.19435587525367737,
176
- "eval_runtime": 6.4961,
177
- "eval_samples_per_second": 477.21,
178
- "eval_steps_per_second": 29.864,
179
  "step": 3816
180
  },
181
  {
182
  "epoch": 4.1928721174004195,
183
- "grad_norm": 0.007662074640393257,
184
- "learning_rate": 1.2035040431266847e-05,
185
- "loss": 0.006,
186
  "step": 4000
187
  }
188
  ],
189
  "logging_steps": 200,
190
- "max_steps": 6678,
191
  "num_input_tokens_seen": 0,
192
- "num_train_epochs": 7,
193
  "save_steps": 500,
194
  "stateful_callbacks": {
195
  "TrainerControl": {
@@ -207,8 +207,8 @@
207
  "train_batch_size": 16,
208
  "trial_name": null,
209
  "trial_params": {
210
- "alpha": 0.8225267279347407,
211
- "num_train_epochs": 7,
212
- "temperature": 9
213
  }
214
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
+ "grad_norm": 0.17968730628490448,
73
+ "learning_rate": 1.9943396226415094e-05,
74
+ "loss": 0.0487,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
+ "grad_norm": 17.465627670288086,
80
+ "learning_rate": 1.868553459119497e-05,
81
+ "loss": 0.0556,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
+ "eval_accuracy": 0.9496774193548387,
87
+ "eval_loss": 0.23183350265026093,
88
+ "eval_runtime": 6.1991,
89
+ "eval_samples_per_second": 500.07,
90
+ "eval_steps_per_second": 31.295,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
+ "grad_norm": 0.020424585789442062,
96
+ "learning_rate": 1.742767295597484e-05,
97
+ "loss": 0.0409,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
+ "grad_norm": 0.036295052617788315,
103
+ "learning_rate": 1.6169811320754717e-05,
104
+ "loss": 0.0135,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
+ "grad_norm": 6.396214008331299,
110
+ "learning_rate": 1.491194968553459e-05,
111
+ "loss": 0.0199,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
+ "grad_norm": 0.07038699835538864,
117
+ "learning_rate": 1.3654088050314464e-05,
118
+ "loss": 0.0084,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
+ "grad_norm": 0.09982823580503464,
124
+ "learning_rate": 1.239622641509434e-05,
125
+ "loss": 0.008,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
+ "eval_accuracy": 0.9609677419354838,
131
+ "eval_loss": 0.19234703481197357,
132
+ "eval_runtime": 8.1342,
133
+ "eval_samples_per_second": 381.109,
134
+ "eval_steps_per_second": 23.85,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
+ "grad_norm": 0.007687446661293507,
140
+ "learning_rate": 1.1138364779874214e-05,
141
+ "loss": 0.0041,
142
  "step": 3000
143
  },
144
  {
145
  "epoch": 3.3542976939203353,
146
+ "grad_norm": 2.6632392406463623,
147
+ "learning_rate": 9.880503144654089e-06,
148
+ "loss": 0.0055,
149
  "step": 3200
150
  },
151
  {
152
  "epoch": 3.5639412997903563,
153
+ "grad_norm": 0.014160693623125553,
154
+ "learning_rate": 8.622641509433963e-06,
155
+ "loss": 0.002,
156
  "step": 3400
157
  },
158
  {
159
  "epoch": 3.7735849056603774,
160
+ "grad_norm": 0.059565939009189606,
161
+ "learning_rate": 7.364779874213837e-06,
162
+ "loss": 0.0017,
163
  "step": 3600
164
  },
165
  {
166
  "epoch": 3.9832285115303985,
167
+ "grad_norm": 0.0034850463271141052,
168
+ "learning_rate": 6.1069182389937105e-06,
169
+ "loss": 0.0075,
170
  "step": 3800
171
  },
172
  {
173
  "epoch": 4.0,
174
+ "eval_accuracy": 0.96,
175
+ "eval_loss": 0.190092071890831,
176
+ "eval_runtime": 6.4445,
177
+ "eval_samples_per_second": 481.034,
178
+ "eval_steps_per_second": 30.103,
179
  "step": 3816
180
  },
181
  {
182
  "epoch": 4.1928721174004195,
183
+ "grad_norm": 0.012302987277507782,
184
+ "learning_rate": 4.849056603773585e-06,
185
+ "loss": 0.0031,
186
  "step": 4000
187
  }
188
  ],
189
  "logging_steps": 200,
190
+ "max_steps": 4770,
191
  "num_input_tokens_seen": 0,
192
+ "num_train_epochs": 5,
193
  "save_steps": 500,
194
  "stateful_callbacks": {
195
  "TrainerControl": {
 
207
  "train_batch_size": 16,
208
  "trial_name": null,
209
  "trial_params": {
210
+ "alpha": 0.7896637522578164,
211
+ "num_train_epochs": 5,
212
+ "temperature": 4
213
  }
214
  }
run-2/checkpoint-4000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
run-2/checkpoint-4500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e48c541c411c0d8efb8e3c1b61fc7de06887da919d62db0ea13c3bed3cfa7748
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bd48eacc76940b767b8081cd817fd1c37797da71518ea157e83924dd4e29138
3
  size 598898116
run-2/checkpoint-4500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:029725a9d1262b07d91acdb6f6e29471df363a15d37b1b6d107bd4b4ffd3d12e
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae27d802154d955bcbee1b027965678c6ef789f5b7390766c5c16ea9999afdd
3
  size 1197886411
run-2/checkpoint-4500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c34269efc585af4ecdad67c6f9c6240f640ea478e4ed920162045172313d660
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d29df8255baf50ad67160d39d36cfb8aa791577efd7c6a3108d4d01a125b92e
3
  size 1465
run-2/checkpoint-4500/trainer_state.json CHANGED
@@ -11,199 +11,199 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
- "grad_norm": 19.045116424560547,
15
- "learning_rate": 2.9106019766397126e-05,
16
- "loss": 3.1735,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
- "grad_norm": 14.753777503967285,
22
- "learning_rate": 2.820754716981132e-05,
23
- "loss": 0.6335,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
- "grad_norm": 7.134149074554443,
29
- "learning_rate": 2.730907457322552e-05,
30
- "loss": 0.3331,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
- "grad_norm": 11.017728805541992,
36
- "learning_rate": 2.6410601976639712e-05,
37
- "loss": 0.2265,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9341935483870968,
43
- "eval_loss": 0.25923657417297363,
44
- "eval_runtime": 6.2485,
45
- "eval_samples_per_second": 496.116,
46
- "eval_steps_per_second": 31.047,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
- "grad_norm": 17.758880615234375,
52
- "learning_rate": 2.551212938005391e-05,
53
- "loss": 0.1617,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
- "grad_norm": 2.7743964195251465,
59
- "learning_rate": 2.4613656783468106e-05,
60
- "loss": 0.0715,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
- "grad_norm": 0.491705060005188,
66
- "learning_rate": 2.3715184186882298e-05,
67
- "loss": 0.059,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
- "grad_norm": 0.3010256290435791,
73
- "learning_rate": 2.2816711590296497e-05,
74
- "loss": 0.0569,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
- "grad_norm": 22.237369537353516,
80
- "learning_rate": 2.1918238993710692e-05,
81
- "loss": 0.0672,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
- "eval_accuracy": 0.9487096774193549,
87
- "eval_loss": 0.22349213063716888,
88
- "eval_runtime": 5.967,
89
- "eval_samples_per_second": 519.525,
90
- "eval_steps_per_second": 32.512,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
- "grad_norm": 0.024833250790834427,
96
- "learning_rate": 2.1019766397124887e-05,
97
- "loss": 0.0422,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
- "grad_norm": 0.0831853449344635,
103
- "learning_rate": 2.0121293800539083e-05,
104
- "loss": 0.0181,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
- "grad_norm": 0.40406087040901184,
110
- "learning_rate": 1.922282120395328e-05,
111
- "loss": 0.0227,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
- "grad_norm": 0.11021151393651962,
117
- "learning_rate": 1.8324348607367477e-05,
118
- "loss": 0.0109,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
- "grad_norm": 0.11338778585195541,
124
- "learning_rate": 1.7425876010781672e-05,
125
- "loss": 0.0124,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
- "eval_accuracy": 0.9570967741935484,
131
- "eval_loss": 0.19864021241664886,
132
- "eval_runtime": 5.9289,
133
- "eval_samples_per_second": 522.867,
134
- "eval_steps_per_second": 32.721,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
- "grad_norm": 0.006569345481693745,
140
- "learning_rate": 1.6527403414195867e-05,
141
- "loss": 0.0124,
142
  "step": 3000
143
  },
144
  {
145
  "epoch": 3.3542976939203353,
146
- "grad_norm": 0.11313613504171371,
147
- "learning_rate": 1.5628930817610063e-05,
148
- "loss": 0.0048,
149
  "step": 3200
150
  },
151
  {
152
  "epoch": 3.5639412997903563,
153
- "grad_norm": 0.007506044581532478,
154
- "learning_rate": 1.473045822102426e-05,
155
- "loss": 0.0027,
156
  "step": 3400
157
  },
158
  {
159
  "epoch": 3.7735849056603774,
160
- "grad_norm": 1.1819171905517578,
161
- "learning_rate": 1.3831985624438455e-05,
162
- "loss": 0.0084,
163
  "step": 3600
164
  },
165
  {
166
  "epoch": 3.9832285115303985,
167
- "grad_norm": 0.009302722290158272,
168
- "learning_rate": 1.2933513027852652e-05,
169
- "loss": 0.0161,
170
  "step": 3800
171
  },
172
  {
173
  "epoch": 4.0,
174
- "eval_accuracy": 0.9580645161290322,
175
- "eval_loss": 0.19435587525367737,
176
- "eval_runtime": 6.4961,
177
- "eval_samples_per_second": 477.21,
178
- "eval_steps_per_second": 29.864,
179
  "step": 3816
180
  },
181
  {
182
  "epoch": 4.1928721174004195,
183
- "grad_norm": 0.007662074640393257,
184
- "learning_rate": 1.2035040431266847e-05,
185
- "loss": 0.006,
186
  "step": 4000
187
  },
188
  {
189
  "epoch": 4.40251572327044,
190
- "grad_norm": 0.0036032176576554775,
191
- "learning_rate": 1.1136567834681041e-05,
192
- "loss": 0.0007,
193
  "step": 4200
194
  },
195
  {
196
  "epoch": 4.612159329140461,
197
- "grad_norm": 0.00412218039855361,
198
- "learning_rate": 1.0238095238095238e-05,
199
- "loss": 0.0047,
200
  "step": 4400
201
  }
202
  ],
203
  "logging_steps": 200,
204
- "max_steps": 6678,
205
  "num_input_tokens_seen": 0,
206
- "num_train_epochs": 7,
207
  "save_steps": 500,
208
  "stateful_callbacks": {
209
  "TrainerControl": {
@@ -221,8 +221,8 @@
221
  "train_batch_size": 16,
222
  "trial_name": null,
223
  "trial_params": {
224
- "alpha": 0.8225267279347407,
225
- "num_train_epochs": 7,
226
- "temperature": 9
227
  }
228
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.20964360587002095,
14
+ "grad_norm": 19.058637619018555,
15
+ "learning_rate": 2.8748427672955975e-05,
16
+ "loss": 3.1779,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4192872117400419,
21
+ "grad_norm": 16.188446044921875,
22
+ "learning_rate": 2.749056603773585e-05,
23
+ "loss": 0.6362,
24
  "step": 400
25
  },
26
  {
27
  "epoch": 0.6289308176100629,
28
+ "grad_norm": 7.167396545410156,
29
+ "learning_rate": 2.6232704402515723e-05,
30
+ "loss": 0.3306,
31
  "step": 600
32
  },
33
  {
34
  "epoch": 0.8385744234800838,
35
+ "grad_norm": 11.347878456115723,
36
+ "learning_rate": 2.49748427672956e-05,
37
+ "loss": 0.2273,
38
  "step": 800
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9325806451612904,
43
+ "eval_loss": 0.2720068097114563,
44
+ "eval_runtime": 6.3485,
45
+ "eval_samples_per_second": 488.302,
46
+ "eval_steps_per_second": 30.558,
47
  "step": 954
48
  },
49
  {
50
  "epoch": 1.0482180293501049,
51
+ "grad_norm": 15.08842658996582,
52
+ "learning_rate": 2.3716981132075474e-05,
53
+ "loss": 0.1632,
54
  "step": 1000
55
  },
56
  {
57
  "epoch": 1.2578616352201257,
58
+ "grad_norm": 1.7787141799926758,
59
+ "learning_rate": 2.2459119496855346e-05,
60
+ "loss": 0.0698,
61
  "step": 1200
62
  },
63
  {
64
  "epoch": 1.4675052410901468,
65
+ "grad_norm": 0.3887692987918854,
66
+ "learning_rate": 2.120125786163522e-05,
67
+ "loss": 0.0551,
68
  "step": 1400
69
  },
70
  {
71
  "epoch": 1.6771488469601676,
72
+ "grad_norm": 0.17968730628490448,
73
+ "learning_rate": 1.9943396226415094e-05,
74
+ "loss": 0.0487,
75
  "step": 1600
76
  },
77
  {
78
  "epoch": 1.8867924528301887,
79
+ "grad_norm": 17.465627670288086,
80
+ "learning_rate": 1.868553459119497e-05,
81
+ "loss": 0.0556,
82
  "step": 1800
83
  },
84
  {
85
  "epoch": 2.0,
86
+ "eval_accuracy": 0.9496774193548387,
87
+ "eval_loss": 0.23183350265026093,
88
+ "eval_runtime": 6.1991,
89
+ "eval_samples_per_second": 500.07,
90
+ "eval_steps_per_second": 31.295,
91
  "step": 1908
92
  },
93
  {
94
  "epoch": 2.0964360587002098,
95
+ "grad_norm": 0.020424585789442062,
96
+ "learning_rate": 1.742767295597484e-05,
97
+ "loss": 0.0409,
98
  "step": 2000
99
  },
100
  {
101
  "epoch": 2.3060796645702304,
102
+ "grad_norm": 0.036295052617788315,
103
+ "learning_rate": 1.6169811320754717e-05,
104
+ "loss": 0.0135,
105
  "step": 2200
106
  },
107
  {
108
  "epoch": 2.5157232704402515,
109
+ "grad_norm": 6.396214008331299,
110
+ "learning_rate": 1.491194968553459e-05,
111
+ "loss": 0.0199,
112
  "step": 2400
113
  },
114
  {
115
  "epoch": 2.7253668763102725,
116
+ "grad_norm": 0.07038699835538864,
117
+ "learning_rate": 1.3654088050314464e-05,
118
+ "loss": 0.0084,
119
  "step": 2600
120
  },
121
  {
122
  "epoch": 2.9350104821802936,
123
+ "grad_norm": 0.09982823580503464,
124
+ "learning_rate": 1.239622641509434e-05,
125
+ "loss": 0.008,
126
  "step": 2800
127
  },
128
  {
129
  "epoch": 3.0,
130
+ "eval_accuracy": 0.9609677419354838,
131
+ "eval_loss": 0.19234703481197357,
132
+ "eval_runtime": 8.1342,
133
+ "eval_samples_per_second": 381.109,
134
+ "eval_steps_per_second": 23.85,
135
  "step": 2862
136
  },
137
  {
138
  "epoch": 3.1446540880503147,
139
+ "grad_norm": 0.007687446661293507,
140
+ "learning_rate": 1.1138364779874214e-05,
141
+ "loss": 0.0041,
142
  "step": 3000
143
  },
144
  {
145
  "epoch": 3.3542976939203353,
146
+ "grad_norm": 2.6632392406463623,
147
+ "learning_rate": 9.880503144654089e-06,
148
+ "loss": 0.0055,
149
  "step": 3200
150
  },
151
  {
152
  "epoch": 3.5639412997903563,
153
+ "grad_norm": 0.014160693623125553,
154
+ "learning_rate": 8.622641509433963e-06,
155
+ "loss": 0.002,
156
  "step": 3400
157
  },
158
  {
159
  "epoch": 3.7735849056603774,
160
+ "grad_norm": 0.059565939009189606,
161
+ "learning_rate": 7.364779874213837e-06,
162
+ "loss": 0.0017,
163
  "step": 3600
164
  },
165
  {
166
  "epoch": 3.9832285115303985,
167
+ "grad_norm": 0.0034850463271141052,
168
+ "learning_rate": 6.1069182389937105e-06,
169
+ "loss": 0.0075,
170
  "step": 3800
171
  },
172
  {
173
  "epoch": 4.0,
174
+ "eval_accuracy": 0.96,
175
+ "eval_loss": 0.190092071890831,
176
+ "eval_runtime": 6.4445,
177
+ "eval_samples_per_second": 481.034,
178
+ "eval_steps_per_second": 30.103,
179
  "step": 3816
180
  },
181
  {
182
  "epoch": 4.1928721174004195,
183
+ "grad_norm": 0.012302987277507782,
184
+ "learning_rate": 4.849056603773585e-06,
185
+ "loss": 0.0031,
186
  "step": 4000
187
  },
188
  {
189
  "epoch": 4.40251572327044,
190
+ "grad_norm": 0.00425491388887167,
191
+ "learning_rate": 3.591194968553459e-06,
192
+ "loss": 0.0019,
193
  "step": 4200
194
  },
195
  {
196
  "epoch": 4.612159329140461,
197
+ "grad_norm": 0.012302517890930176,
198
+ "learning_rate": 2.3333333333333336e-06,
199
+ "loss": 0.0032,
200
  "step": 4400
201
  }
202
  ],
203
  "logging_steps": 200,
204
+ "max_steps": 4770,
205
  "num_input_tokens_seen": 0,
206
+ "num_train_epochs": 5,
207
  "save_steps": 500,
208
  "stateful_callbacks": {
209
  "TrainerControl": {
 
221
  "train_batch_size": 16,
222
  "trial_name": null,
223
  "trial_params": {
224
+ "alpha": 0.7896637522578164,
225
+ "num_train_epochs": 5,
226
+ "temperature": 4
227
  }
228
  }
run-2/checkpoint-4500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71f85117fc3c233d4584172d7a3335ae3c759791cdd7fc1ffafeb612dabc2e87
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:210679227dd2b3037b48d4414e3a7d05adcbca6f5fde35131afb41f4714ec72e
3
  size 5905
runs/Oct23_11-35-49_ec65ff5e83b6/events.out.tfevents.1761223404.ec65ff5e83b6.4546.3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a562947f04c23bd291b79623d520681287061f2bef9f3d4be20863fa8005822
3
- size 15564
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:103cd3a099da6cb61ba71307470855a8f7c8ad777f367a272dd9e3bc7c1d9c44
3
+ size 18997