{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 177,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.8946975544095039,
      "epoch": 0.17167381974248927,
      "grad_norm": 3.2923269271850586,
      "learning_rate": 6e-06,
      "loss": 0.9271,
      "mean_token_accuracy": 0.7838601619005203,
      "num_tokens": 646810.0,
      "step": 10
    },
    {
      "entropy": 0.8304631188511848,
      "epoch": 0.34334763948497854,
      "grad_norm": 1.4505877494812012,
      "learning_rate": 9.994965332706574e-06,
      "loss": 0.8398,
      "mean_token_accuracy": 0.8009490087628365,
      "num_tokens": 1291203.0,
      "step": 20
    },
    {
      "entropy": 0.768204678595066,
      "epoch": 0.5150214592274678,
      "grad_norm": 1.083687663078308,
      "learning_rate": 9.938441702975689e-06,
      "loss": 0.7803,
      "mean_token_accuracy": 0.8115046098828316,
      "num_tokens": 1941398.0,
      "step": 30
    },
    {
      "entropy": 0.7275782853364945,
      "epoch": 0.6866952789699571,
      "grad_norm": 0.7802343368530273,
      "learning_rate": 9.819814303479268e-06,
      "loss": 0.7392,
      "mean_token_accuracy": 0.8186714142560959,
      "num_tokens": 2586314.0,
      "step": 40
    },
    {
      "entropy": 0.7106381312012673,
      "epoch": 0.8583690987124464,
      "grad_norm": 0.7659251689910889,
      "learning_rate": 9.640574942595195e-06,
      "loss": 0.7229,
      "mean_token_accuracy": 0.8213993713259697,
      "num_tokens": 3239144.0,
      "step": 50
    },
    {
      "entropy": 0.7049121647267729,
      "epoch": 1.0171673819742488,
      "grad_norm": 0.7232816815376282,
      "learning_rate": 9.40297765928369e-06,
      "loss": 0.7128,
      "mean_token_accuracy": 0.8236586129343187,
      "num_tokens": 3846086.0,
      "step": 60
    },
    {
      "entropy": 0.66157948076725,
      "epoch": 1.1888412017167382,
      "grad_norm": 0.6707538962364197,
      "learning_rate": 9.110010377239552e-06,
      "loss": 0.6685,
      "mean_token_accuracy": 0.832102257013321,
      "num_tokens": 4500217.0,
      "step": 70
    },
    {
      "entropy": 0.6357302084565163,
      "epoch": 1.3605150214592274,
      "grad_norm": 0.695083737373352,
      "learning_rate": 8.765357330018056e-06,
      "loss": 0.6488,
      "mean_token_accuracy": 0.8356550931930542,
      "num_tokens": 5142939.0,
      "step": 80
    },
    {
      "entropy": 0.6433110848069191,
      "epoch": 1.5321888412017168,
      "grad_norm": 0.6362840533256531,
      "learning_rate": 8.373352729660373e-06,
      "loss": 0.6497,
      "mean_token_accuracy": 0.8368361875414848,
      "num_tokens": 5795423.0,
      "step": 90
    },
    {
      "entropy": 0.6335700437426567,
      "epoch": 1.703862660944206,
      "grad_norm": 0.608460545539856,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.646,
      "mean_token_accuracy": 0.8386218667030334,
      "num_tokens": 6441560.0,
      "step": 100
    },
    {
      "entropy": 0.6372291177511216,
      "epoch": 1.8755364806866952,
      "grad_norm": 0.5837402939796448,
      "learning_rate": 7.467541090321735e-06,
      "loss": 0.6432,
      "mean_token_accuracy": 0.8375322207808494,
      "num_tokens": 7088692.0,
      "step": 110
    },
    {
      "entropy": 0.6296658838117445,
      "epoch": 2.0343347639484977,
      "grad_norm": 0.6868466138839722,
      "learning_rate": 6.965125158269619e-06,
      "loss": 0.6355,
      "mean_token_accuracy": 0.8384278191102518,
      "num_tokens": 7689634.0,
      "step": 120
    },
    {
      "entropy": 0.5824477970600128,
      "epoch": 2.2060085836909873,
      "grad_norm": 0.6492967009544373,
      "learning_rate": 6.437996637160086e-06,
      "loss": 0.5903,
      "mean_token_accuracy": 0.8480394512414933,
      "num_tokens": 8335925.0,
      "step": 130
    },
    {
      "entropy": 0.5835598841309547,
      "epoch": 2.3776824034334765,
      "grad_norm": 0.6394692063331604,
      "learning_rate": 5.892784473993184e-06,
      "loss": 0.5887,
      "mean_token_accuracy": 0.8477904468774795,
      "num_tokens": 8984053.0,
      "step": 140
    },
    {
      "entropy": 0.5792419567704201,
      "epoch": 2.5493562231759657,
      "grad_norm": 0.6243847608566284,
      "learning_rate": 5.336345028060199e-06,
      "loss": 0.5894,
      "mean_token_accuracy": 0.8487418398261071,
      "num_tokens": 9632178.0,
      "step": 150
    },
    {
      "entropy": 0.5750040769577026,
      "epoch": 2.721030042918455,
      "grad_norm": 0.6083446741104126,
      "learning_rate": 4.775675848247427e-06,
      "loss": 0.5818,
      "mean_token_accuracy": 0.8494960919022561,
      "num_tokens": 10281754.0,
      "step": 160
    },
    {
      "entropy": 0.584269268810749,
      "epoch": 2.8927038626609445,
      "grad_norm": 0.6467194557189941,
      "learning_rate": 4.217827674798845e-06,
      "loss": 0.5897,
      "mean_token_accuracy": 0.848649799823761,
      "num_tokens": 10930722.0,
      "step": 170
    }
  ],
  "logging_steps": 10,
  "max_steps": 295,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.385177303604265e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}