Jinrui commited on
Commit
618d223
·
verified ·
1 Parent(s): 7efb092

Upload results/scaling_law/owt/qwen3_hyp/owt_scaling_v3/qwen3_hyp_p686m_t1_32_suite-owt_scaling_v3_family-qwen3_geometry_id-hyp_variant-base_init_slope-std/attempt1_20260209_230103/trainer_state.json with huggingface_hub

Browse files
results/scaling_law/owt/qwen3_hyp/owt_scaling_v3/qwen3_hyp_p686m_t1_32_suite-owt_scaling_v3_family-qwen3_geometry_id-hyp_variant-base_init_slope-std/attempt1_20260209_230103/trainer_state.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.030113980362736582,
6
+ "eval_steps": 128,
7
+ "global_step": 1001,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.003008389646627031,
14
+ "grad_norm": 1.2120198011398315,
15
+ "learning_rate": 9.844506277446577e-05,
16
+ "loss": 8.1046,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0038507387476825997,
21
+ "eval_loss": 6.402813911437988,
22
+ "eval_runtime": 28.6031,
23
+ "eval_samples_per_second": 148.166,
24
+ "eval_steps_per_second": 4.65,
25
+ "step": 128
26
+ },
27
+ {
28
+ "epoch": 0.006016779293254062,
29
+ "grad_norm": 1.0962634086608887,
30
+ "learning_rate": 9.207842527714767e-05,
31
+ "loss": 6.2709,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.007701477495365199,
36
+ "eval_loss": 5.732907772064209,
37
+ "eval_runtime": 28.591,
38
+ "eval_samples_per_second": 148.229,
39
+ "eval_steps_per_second": 4.652,
40
+ "step": 256
41
+ },
42
+ {
43
+ "epoch": 0.009025168939881093,
44
+ "grad_norm": 0.6911008954048157,
45
+ "learning_rate": 8.142447989440618e-05,
46
+ "loss": 5.7647,
47
+ "step": 300
48
+ },
49
+ {
50
+ "epoch": 0.0115522162430478,
51
+ "eval_loss": 5.367537975311279,
52
+ "eval_runtime": 28.6593,
53
+ "eval_samples_per_second": 147.875,
54
+ "eval_steps_per_second": 4.641,
55
+ "step": 384
56
+ },
57
+ {
58
+ "epoch": 0.012033558586508125,
59
+ "grad_norm": 0.732850968837738,
60
+ "learning_rate": 6.756874120406714e-05,
61
+ "loss": 5.4545,
62
+ "step": 400
63
+ },
64
+ {
65
+ "epoch": 0.015041948233135156,
66
+ "grad_norm": 0.6058043837547302,
67
+ "learning_rate": 5.192294972051992e-05,
68
+ "loss": 5.2435,
69
+ "step": 500
70
+ },
71
+ {
72
+ "epoch": 0.015402954990730399,
73
+ "eval_loss": 5.149702072143555,
74
+ "eval_runtime": 28.626,
75
+ "eval_samples_per_second": 148.047,
76
+ "eval_steps_per_second": 4.646,
77
+ "step": 512
78
+ },
79
+ {
80
+ "epoch": 0.018050337879762186,
81
+ "grad_norm": 0.6564653515815735,
82
+ "learning_rate": 3.608123176287685e-05,
83
+ "loss": 5.1083,
84
+ "step": 600
85
+ },
86
+ {
87
+ "epoch": 0.019253693738413,
88
+ "eval_loss": 5.0159125328063965,
89
+ "eval_runtime": 28.6214,
90
+ "eval_samples_per_second": 148.071,
91
+ "eval_steps_per_second": 4.647,
92
+ "step": 640
93
+ },
94
+ {
95
+ "epoch": 0.021058727526389216,
96
+ "grad_norm": 0.6653856635093689,
97
+ "learning_rate": 2.165767630597752e-05,
98
+ "loss": 5.0093,
99
+ "step": 700
100
+ },
101
+ {
102
+ "epoch": 0.0231044324860956,
103
+ "eval_loss": 4.945389747619629,
104
+ "eval_runtime": 28.6219,
105
+ "eval_samples_per_second": 148.068,
106
+ "eval_steps_per_second": 4.647,
107
+ "step": 768
108
+ },
109
+ {
110
+ "epoch": 0.02406711717301625,
111
+ "grad_norm": 0.5155009031295776,
112
+ "learning_rate": 1.0121877866225781e-05,
113
+ "loss": 4.9512,
114
+ "step": 800
115
+ },
116
+ {
117
+ "epoch": 0.026955171233778198,
118
+ "eval_loss": 4.9168829917907715,
119
+ "eval_runtime": 28.6624,
120
+ "eval_samples_per_second": 147.859,
121
+ "eval_steps_per_second": 4.64,
122
+ "step": 896
123
+ },
124
+ {
125
+ "epoch": 0.02707550681964328,
126
+ "grad_norm": 0.5433395504951477,
127
+ "learning_rate": 2.6492017119189417e-06,
128
+ "loss": 4.9169,
129
+ "step": 900
130
+ },
131
+ {
132
+ "epoch": 0.030083896466270313,
133
+ "grad_norm": 0.5387678742408752,
134
+ "learning_rate": 1.0276520816976387e-09,
135
+ "loss": 4.9135,
136
+ "step": 1000
137
+ },
138
+ {
139
+ "epoch": 0.030113980362736582,
140
+ "step": 1001,
141
+ "total_flos": 1.3865860380418376e+18,
142
+ "train_loss": 5.57313633631993,
143
+ "train_runtime": 3600.2509,
144
+ "train_samples_per_second": 35.589,
145
+ "train_steps_per_second": 0.278
146
+ }
147
+ ],
148
+ "logging_steps": 100,
149
+ "max_steps": 1001,
150
+ "num_input_tokens_seen": 0,
151
+ "num_train_epochs": 1,
152
+ "save_steps": 256,
153
+ "stateful_callbacks": {
154
+ "TrainerControl": {
155
+ "args": {
156
+ "should_epoch_stop": false,
157
+ "should_evaluate": false,
158
+ "should_log": false,
159
+ "should_save": true,
160
+ "should_training_stop": true
161
+ },
162
+ "attributes": {}
163
+ }
164
+ },
165
+ "total_flos": 1.3865860380418376e+18,
166
+ "train_batch_size": 4,
167
+ "trial_name": null,
168
+ "trial_params": null
169
+ }