robertou2 commited on
Commit
7d86599
·
verified ·
1 Parent(s): 3e6dd15

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +2 -2
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +6 -2106
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfb5f28cdd9c5e8b5812c333c96294c2058106b03b65ffcd21936c8dd4b38c0c
3
  size 738232680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d2263abc52214918cb0243613829263c5eb866b6277704478283989215eeae3
3
  size 738232680
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3369b904cd237f36a8d77bf651116368a1a19b5f3db96249b7e6a9f9bb45e3d8
3
- size 1107512523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21ed021ca7796b559e38f4de1c6f075d653c3c73fdf265bca596f2ba21ee61c8
3
+ size 1476611275
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb8b281d3670780618442404eb45c98293a0d43e46f8e5ca3eb87cb4663d60e7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d416d016b635652b44c8f24b86395735e0658c83adbca0c05503d6f290df3a8
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fa59c9185701213ec25411dab80244c017dba754eeac7bca5fb0c59c13e7e9c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f91f56974adaa6f012d64b7fe0783f94fe00a197b4ccc0cf01788db9b8df0028
3
  size 1465
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 200,
3
- "best_metric": 0.0016098986379802227,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-200",
5
- "epoch": 10.526315789473685,
6
  "eval_steps": 1,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -908,2106 +908,6 @@
908
  "eval_samples_per_second": 33.566,
909
  "eval_steps_per_second": 4.475,
910
  "step": 60
911
- },
912
- {
913
- "epoch": 3.2105263157894735,
914
- "grad_norm": 2.3808937072753906,
915
- "learning_rate": 0.0004561103900854401,
916
- "loss": 0.5372,
917
- "step": 61
918
- },
919
- {
920
- "epoch": 3.2105263157894735,
921
- "eval_loss": 0.535223662853241,
922
- "eval_runtime": 0.8966,
923
- "eval_samples_per_second": 33.459,
924
- "eval_steps_per_second": 4.461,
925
- "step": 61
926
- },
927
- {
928
- "epoch": 3.263157894736842,
929
- "grad_norm": 1.8272178173065186,
930
- "learning_rate": 0.0004542005660466094,
931
- "loss": 0.5399,
932
- "step": 62
933
- },
934
- {
935
- "epoch": 3.263157894736842,
936
- "eval_loss": 0.5316082239151001,
937
- "eval_runtime": 0.8994,
938
- "eval_samples_per_second": 33.354,
939
- "eval_steps_per_second": 4.447,
940
- "step": 62
941
- },
942
- {
943
- "epoch": 3.3157894736842106,
944
- "grad_norm": 2.0635435581207275,
945
- "learning_rate": 0.0004522542485937369,
946
- "loss": 0.5531,
947
- "step": 63
948
- },
949
- {
950
- "epoch": 3.3157894736842106,
951
- "eval_loss": 0.5134085416793823,
952
- "eval_runtime": 0.8937,
953
- "eval_samples_per_second": 33.567,
954
- "eval_steps_per_second": 4.476,
955
- "step": 63
956
- },
957
- {
958
- "epoch": 3.3684210526315788,
959
- "grad_norm": 2.268183708190918,
960
- "learning_rate": 0.0004502717855601809,
961
- "loss": 0.5291,
962
- "step": 64
963
- },
964
- {
965
- "epoch": 3.3684210526315788,
966
- "eval_loss": 0.5419598817825317,
967
- "eval_runtime": 0.8959,
968
- "eval_samples_per_second": 33.486,
969
- "eval_steps_per_second": 4.465,
970
- "step": 64
971
- },
972
- {
973
- "epoch": 3.4210526315789473,
974
- "grad_norm": 1.8800358772277832,
975
- "learning_rate": 0.0004482535312390058,
976
- "loss": 0.5501,
977
- "step": 65
978
- },
979
- {
980
- "epoch": 3.4210526315789473,
981
- "eval_loss": 0.5209227800369263,
982
- "eval_runtime": 0.8927,
983
- "eval_samples_per_second": 33.606,
984
- "eval_steps_per_second": 4.481,
985
- "step": 65
986
- },
987
- {
988
- "epoch": 3.473684210526316,
989
- "grad_norm": 3.1507558822631836,
990
- "learning_rate": 0.00044619984631966527,
991
- "loss": 0.5309,
992
- "step": 66
993
- },
994
- {
995
- "epoch": 3.473684210526316,
996
- "eval_loss": 0.536996603012085,
997
- "eval_runtime": 0.8951,
998
- "eval_samples_per_second": 33.517,
999
- "eval_steps_per_second": 4.469,
1000
- "step": 66
1001
- },
1002
- {
1003
- "epoch": 3.526315789473684,
1004
- "grad_norm": 3.5700478553771973,
1005
- "learning_rate": 0.0004441110978235418,
1006
- "loss": 0.7223,
1007
- "step": 67
1008
- },
1009
- {
1010
- "epoch": 3.526315789473684,
1011
- "eval_loss": 0.5140640139579773,
1012
- "eval_runtime": 0.8962,
1013
- "eval_samples_per_second": 33.474,
1014
- "eval_steps_per_second": 4.463,
1015
- "step": 67
1016
- },
1017
- {
1018
- "epoch": 3.5789473684210527,
1019
- "grad_norm": 1.758971929550171,
1020
- "learning_rate": 0.0004419876590383554,
1021
- "loss": 0.6927,
1022
- "step": 68
1023
- },
1024
- {
1025
- "epoch": 3.5789473684210527,
1026
- "eval_loss": 0.47072505950927734,
1027
- "eval_runtime": 0.9127,
1028
- "eval_samples_per_second": 32.87,
1029
- "eval_steps_per_second": 4.383,
1030
- "step": 68
1031
- },
1032
- {
1033
- "epoch": 3.6315789473684212,
1034
- "grad_norm": 1.5274709463119507,
1035
- "learning_rate": 0.00043982990945145146,
1036
- "loss": 0.4762,
1037
- "step": 69
1038
- },
1039
- {
1040
- "epoch": 3.6315789473684212,
1041
- "eval_loss": 0.4518219828605652,
1042
- "eval_runtime": 0.8967,
1043
- "eval_samples_per_second": 33.456,
1044
- "eval_steps_per_second": 4.461,
1045
- "step": 69
1046
- },
1047
- {
1048
- "epoch": 3.6842105263157894,
1049
- "grad_norm": 1.7685797214508057,
1050
- "learning_rate": 0.0004376382346819819,
1051
- "loss": 0.5629,
1052
- "step": 70
1053
- },
1054
- {
1055
- "epoch": 3.6842105263157894,
1056
- "eval_loss": 0.40707579255104065,
1057
- "eval_runtime": 0.8934,
1058
- "eval_samples_per_second": 33.581,
1059
- "eval_steps_per_second": 4.478,
1060
- "step": 70
1061
- },
1062
- {
1063
- "epoch": 3.736842105263158,
1064
- "grad_norm": 1.6618574857711792,
1065
- "learning_rate": 0.00043541302641198946,
1066
- "loss": 0.5877,
1067
- "step": 71
1068
- },
1069
- {
1070
- "epoch": 3.736842105263158,
1071
- "eval_loss": 0.3780651390552521,
1072
- "eval_runtime": 0.9024,
1073
- "eval_samples_per_second": 33.246,
1074
- "eval_steps_per_second": 4.433,
1075
- "step": 71
1076
- },
1077
- {
1078
- "epoch": 3.7894736842105265,
1079
- "grad_norm": 1.542702317237854,
1080
- "learning_rate": 0.00043315468231640834,
1081
- "loss": 0.5222,
1082
- "step": 72
1083
- },
1084
- {
1085
- "epoch": 3.7894736842105265,
1086
- "eval_loss": 0.3732970356941223,
1087
- "eval_runtime": 0.9166,
1088
- "eval_samples_per_second": 32.73,
1089
- "eval_steps_per_second": 4.364,
1090
- "step": 72
1091
- },
1092
- {
1093
- "epoch": 3.8421052631578947,
1094
- "grad_norm": 1.8039391040802002,
1095
- "learning_rate": 0.00043086360599199516,
1096
- "loss": 0.5238,
1097
- "step": 73
1098
- },
1099
- {
1100
- "epoch": 3.8421052631578947,
1101
- "eval_loss": 0.3568810820579529,
1102
- "eval_runtime": 0.9031,
1103
- "eval_samples_per_second": 33.218,
1104
- "eval_steps_per_second": 4.429,
1105
- "step": 73
1106
- },
1107
- {
1108
- "epoch": 3.8947368421052633,
1109
- "grad_norm": 1.6215863227844238,
1110
- "learning_rate": 0.0004285402068852002,
1111
- "loss": 0.6504,
1112
- "step": 74
1113
- },
1114
- {
1115
- "epoch": 3.8947368421052633,
1116
- "eval_loss": 0.3885921835899353,
1117
- "eval_runtime": 0.896,
1118
- "eval_samples_per_second": 33.483,
1119
- "eval_steps_per_second": 4.464,
1120
- "step": 74
1121
- },
1122
- {
1123
- "epoch": 3.9473684210526314,
1124
- "grad_norm": 1.5152952671051025,
1125
- "learning_rate": 0.00042618490021899383,
1126
- "loss": 0.5694,
1127
- "step": 75
1128
- },
1129
- {
1130
- "epoch": 3.9473684210526314,
1131
- "eval_loss": 0.38745489716529846,
1132
- "eval_runtime": 0.8939,
1133
- "eval_samples_per_second": 33.562,
1134
- "eval_steps_per_second": 4.475,
1135
- "step": 75
1136
- },
1137
- {
1138
- "epoch": 4.0,
1139
- "grad_norm": 2.6989200115203857,
1140
- "learning_rate": 0.00042379810691866064,
1141
- "loss": 0.5849,
1142
- "step": 76
1143
- },
1144
- {
1145
- "epoch": 4.0,
1146
- "eval_loss": 0.42535698413848877,
1147
- "eval_runtime": 0.9073,
1148
- "eval_samples_per_second": 33.066,
1149
- "eval_steps_per_second": 4.409,
1150
- "step": 76
1151
- },
1152
- {
1153
- "epoch": 4.052631578947368,
1154
- "grad_norm": 1.7381691932678223,
1155
- "learning_rate": 0.00042138025353657407,
1156
- "loss": 0.3779,
1157
- "step": 77
1158
- },
1159
- {
1160
- "epoch": 4.052631578947368,
1161
- "eval_loss": 0.37115439772605896,
1162
- "eval_runtime": 0.9112,
1163
- "eval_samples_per_second": 32.922,
1164
- "eval_steps_per_second": 4.39,
1165
- "step": 77
1166
- },
1167
- {
1168
- "epoch": 4.105263157894737,
1169
- "grad_norm": 2.188385248184204,
1170
- "learning_rate": 0.00041893177217596633,
1171
- "loss": 0.44,
1172
- "step": 78
1173
- },
1174
- {
1175
- "epoch": 4.105263157894737,
1176
- "eval_loss": 0.2926563322544098,
1177
- "eval_runtime": 0.8982,
1178
- "eval_samples_per_second": 33.401,
1179
- "eval_steps_per_second": 4.453,
1180
- "step": 78
1181
- },
1182
- {
1183
- "epoch": 4.157894736842105,
1184
- "grad_norm": 2.3652961254119873,
1185
- "learning_rate": 0.0004164531004137049,
1186
- "loss": 0.3639,
1187
- "step": 79
1188
- },
1189
- {
1190
- "epoch": 4.157894736842105,
1191
- "eval_loss": 0.2751067876815796,
1192
- "eval_runtime": 0.9146,
1193
- "eval_samples_per_second": 32.8,
1194
- "eval_steps_per_second": 4.373,
1195
- "step": 79
1196
- },
1197
- {
1198
- "epoch": 4.2105263157894735,
1199
- "grad_norm": 2.165874719619751,
1200
- "learning_rate": 0.0004139446812220924,
1201
- "loss": 0.2683,
1202
- "step": 80
1203
- },
1204
- {
1205
- "epoch": 4.2105263157894735,
1206
- "eval_loss": 0.2685202360153198,
1207
- "eval_runtime": 0.9124,
1208
- "eval_samples_per_second": 32.881,
1209
- "eval_steps_per_second": 4.384,
1210
- "step": 80
1211
- },
1212
- {
1213
- "epoch": 4.2631578947368425,
1214
- "grad_norm": 1.7391912937164307,
1215
- "learning_rate": 0.0004114069628897006,
1216
- "loss": 0.2993,
1217
- "step": 81
1218
- },
1219
- {
1220
- "epoch": 4.2631578947368425,
1221
- "eval_loss": 0.33646491169929504,
1222
- "eval_runtime": 0.8952,
1223
- "eval_samples_per_second": 33.51,
1224
- "eval_steps_per_second": 4.468,
1225
- "step": 81
1226
- },
1227
- {
1228
- "epoch": 4.315789473684211,
1229
- "grad_norm": 3.65714693069458,
1230
- "learning_rate": 0.0004088403989412559,
1231
- "loss": 0.4252,
1232
- "step": 82
1233
- },
1234
- {
1235
- "epoch": 4.315789473684211,
1236
- "eval_loss": 0.2839888632297516,
1237
- "eval_runtime": 0.9057,
1238
- "eval_samples_per_second": 33.123,
1239
- "eval_steps_per_second": 4.416,
1240
- "step": 82
1241
- },
1242
- {
1243
- "epoch": 4.368421052631579,
1244
- "grad_norm": 2.1762771606445312,
1245
- "learning_rate": 0.00040624544805658794,
1246
- "loss": 0.3304,
1247
- "step": 83
1248
- },
1249
- {
1250
- "epoch": 4.368421052631579,
1251
- "eval_loss": 0.27002134919166565,
1252
- "eval_runtime": 0.8939,
1253
- "eval_samples_per_second": 33.562,
1254
- "eval_steps_per_second": 4.475,
1255
- "step": 83
1256
- },
1257
- {
1258
- "epoch": 4.421052631578947,
1259
- "grad_norm": 2.1018354892730713,
1260
- "learning_rate": 0.00040362257398865713,
1261
- "loss": 0.4506,
1262
- "step": 84
1263
- },
1264
- {
1265
- "epoch": 4.421052631578947,
1266
- "eval_loss": 0.2557659149169922,
1267
- "eval_runtime": 0.8969,
1268
- "eval_samples_per_second": 33.45,
1269
- "eval_steps_per_second": 4.46,
1270
- "step": 84
1271
- },
1272
- {
1273
- "epoch": 4.473684210526316,
1274
- "grad_norm": 1.7509180307388306,
1275
- "learning_rate": 0.00040097224548067613,
1276
- "loss": 0.3731,
1277
- "step": 85
1278
- },
1279
- {
1280
- "epoch": 4.473684210526316,
1281
- "eval_loss": 0.26859304308891296,
1282
- "eval_runtime": 0.9009,
1283
- "eval_samples_per_second": 33.299,
1284
- "eval_steps_per_second": 4.44,
1285
- "step": 85
1286
- },
1287
- {
1288
- "epoch": 4.526315789473684,
1289
- "grad_norm": 1.971816897392273,
1290
- "learning_rate": 0.0003982949361823388,
1291
- "loss": 0.38,
1292
- "step": 86
1293
- },
1294
- {
1295
- "epoch": 4.526315789473684,
1296
- "eval_loss": 0.2624681293964386,
1297
- "eval_runtime": 0.8949,
1298
- "eval_samples_per_second": 33.524,
1299
- "eval_steps_per_second": 4.47,
1300
- "step": 86
1301
- },
1302
- {
1303
- "epoch": 4.578947368421053,
1304
- "grad_norm": 1.4714068174362183,
1305
- "learning_rate": 0.0003955911245651726,
1306
- "loss": 0.3944,
1307
- "step": 87
1308
- },
1309
- {
1310
- "epoch": 4.578947368421053,
1311
- "eval_loss": 0.23652420938014984,
1312
- "eval_runtime": 0.8952,
1313
- "eval_samples_per_second": 33.511,
1314
- "eval_steps_per_second": 4.468,
1315
- "step": 87
1316
- },
1317
- {
1318
- "epoch": 4.631578947368421,
1319
- "grad_norm": 2.6970834732055664,
1320
- "learning_rate": 0.0003928612938370292,
1321
- "loss": 0.3374,
1322
- "step": 88
1323
- },
1324
- {
1325
- "epoch": 4.631578947368421,
1326
- "eval_loss": 0.2716277241706848,
1327
- "eval_runtime": 0.8932,
1328
- "eval_samples_per_second": 33.588,
1329
- "eval_steps_per_second": 4.478,
1330
- "step": 88
1331
- },
1332
- {
1333
- "epoch": 4.684210526315789,
1334
- "grad_norm": 1.9066615104675293,
1335
- "learning_rate": 0.00039010593185572867,
1336
- "loss": 0.2442,
1337
- "step": 89
1338
- },
1339
- {
1340
- "epoch": 4.684210526315789,
1341
- "eval_loss": 0.2999991476535797,
1342
- "eval_runtime": 0.8939,
1343
- "eval_samples_per_second": 33.559,
1344
- "eval_steps_per_second": 4.475,
1345
- "step": 89
1346
- },
1347
- {
1348
- "epoch": 4.7368421052631575,
1349
- "grad_norm": 2.6232354640960693,
1350
- "learning_rate": 0.00038732553104187296,
1351
- "loss": 0.2857,
1352
- "step": 90
1353
- },
1354
- {
1355
- "epoch": 4.7368421052631575,
1356
- "eval_loss": 0.2302989959716797,
1357
- "eval_runtime": 0.8938,
1358
- "eval_samples_per_second": 33.564,
1359
- "eval_steps_per_second": 4.475,
1360
- "step": 90
1361
- },
1362
- {
1363
- "epoch": 4.7894736842105265,
1364
- "grad_norm": 2.0710129737854004,
1365
- "learning_rate": 0.0003845205882908432,
1366
- "loss": 0.4195,
1367
- "step": 91
1368
- },
1369
- {
1370
- "epoch": 4.7894736842105265,
1371
- "eval_loss": 0.21816590428352356,
1372
- "eval_runtime": 0.9251,
1373
- "eval_samples_per_second": 32.429,
1374
- "eval_steps_per_second": 4.324,
1375
- "step": 91
1376
- },
1377
- {
1378
- "epoch": 4.842105263157895,
1379
- "grad_norm": 1.8006062507629395,
1380
- "learning_rate": 0.0003816916048839979,
1381
- "loss": 0.2859,
1382
- "step": 92
1383
- },
1384
- {
1385
- "epoch": 4.842105263157895,
1386
- "eval_loss": 0.21071405708789825,
1387
- "eval_runtime": 0.8965,
1388
- "eval_samples_per_second": 33.462,
1389
- "eval_steps_per_second": 4.462,
1390
- "step": 92
1391
- },
1392
- {
1393
- "epoch": 4.894736842105263,
1394
- "grad_norm": 1.6352888345718384,
1395
- "learning_rate": 0.0003788390863990875,
1396
- "loss": 0.4275,
1397
- "step": 93
1398
- },
1399
- {
1400
- "epoch": 4.894736842105263,
1401
- "eval_loss": 0.20206846296787262,
1402
- "eval_runtime": 0.9052,
1403
- "eval_samples_per_second": 33.144,
1404
- "eval_steps_per_second": 4.419,
1405
- "step": 93
1406
- },
1407
- {
1408
- "epoch": 4.947368421052632,
1409
- "grad_norm": 1.6399378776550293,
1410
- "learning_rate": 0.00037596354261990007,
1411
- "loss": 0.389,
1412
- "step": 94
1413
- },
1414
- {
1415
- "epoch": 4.947368421052632,
1416
- "eval_loss": 0.19467315077781677,
1417
- "eval_runtime": 0.8973,
1418
- "eval_samples_per_second": 33.435,
1419
- "eval_steps_per_second": 4.458,
1420
- "step": 94
1421
- },
1422
- {
1423
- "epoch": 5.0,
1424
- "grad_norm": 1.5680173635482788,
1425
- "learning_rate": 0.0003730654874451569,
1426
- "loss": 0.395,
1427
- "step": 95
1428
- },
1429
- {
1430
- "epoch": 5.0,
1431
- "eval_loss": 0.19546455144882202,
1432
- "eval_runtime": 0.91,
1433
- "eval_samples_per_second": 32.968,
1434
- "eval_steps_per_second": 4.396,
1435
- "step": 95
1436
- },
1437
- {
1438
- "epoch": 5.052631578947368,
1439
- "grad_norm": 1.0308386087417603,
1440
- "learning_rate": 0.00037014543879667093,
1441
- "loss": 0.1384,
1442
- "step": 96
1443
- },
1444
- {
1445
- "epoch": 5.052631578947368,
1446
- "eval_loss": 0.18969732522964478,
1447
- "eval_runtime": 0.9021,
1448
- "eval_samples_per_second": 33.258,
1449
- "eval_steps_per_second": 4.434,
1450
- "step": 96
1451
- },
1452
- {
1453
- "epoch": 5.105263157894737,
1454
- "grad_norm": 1.4042502641677856,
1455
- "learning_rate": 0.0003672039185267878,
1456
- "loss": 0.2291,
1457
- "step": 97
1458
- },
1459
- {
1460
- "epoch": 5.105263157894737,
1461
- "eval_loss": 0.16800740361213684,
1462
- "eval_runtime": 0.8938,
1463
- "eval_samples_per_second": 33.563,
1464
- "eval_steps_per_second": 4.475,
1465
- "step": 97
1466
- },
1467
- {
1468
- "epoch": 5.157894736842105,
1469
- "grad_norm": 1.6313552856445312,
1470
- "learning_rate": 0.00036424145232512333,
1471
- "loss": 0.1736,
1472
- "step": 98
1473
- },
1474
- {
1475
- "epoch": 5.157894736842105,
1476
- "eval_loss": 0.16714099049568176,
1477
- "eval_runtime": 0.9009,
1478
- "eval_samples_per_second": 33.301,
1479
- "eval_steps_per_second": 4.44,
1480
- "step": 98
1481
- },
1482
- {
1483
- "epoch": 5.2105263157894735,
1484
- "grad_norm": 1.8922698497772217,
1485
- "learning_rate": 0.0003612585696246158,
1486
- "loss": 0.1677,
1487
- "step": 99
1488
- },
1489
- {
1490
- "epoch": 5.2105263157894735,
1491
- "eval_loss": 0.179762065410614,
1492
- "eval_runtime": 0.9039,
1493
- "eval_samples_per_second": 33.188,
1494
- "eval_steps_per_second": 4.425,
1495
- "step": 99
1496
- },
1497
- {
1498
- "epoch": 5.2631578947368425,
1499
- "grad_norm": 2.409526824951172,
1500
- "learning_rate": 0.0003582558035069091,
1501
- "loss": 0.2379,
1502
- "step": 100
1503
- },
1504
- {
1505
- "epoch": 5.2631578947368425,
1506
- "eval_loss": 0.1902371197938919,
1507
- "eval_runtime": 0.9097,
1508
- "eval_samples_per_second": 32.98,
1509
- "eval_steps_per_second": 4.397,
1510
- "step": 100
1511
- },
1512
- {
1513
- "epoch": 5.315789473684211,
1514
- "grad_norm": 2.084869146347046,
1515
- "learning_rate": 0.0003552336906070838,
1516
- "loss": 0.2165,
1517
- "step": 101
1518
- },
1519
- {
1520
- "epoch": 5.315789473684211,
1521
- "eval_loss": 0.17252177000045776,
1522
- "eval_runtime": 0.8948,
1523
- "eval_samples_per_second": 33.528,
1524
- "eval_steps_per_second": 4.47,
1525
- "step": 101
1526
- },
1527
- {
1528
- "epoch": 5.368421052631579,
1529
- "grad_norm": 1.655718207359314,
1530
- "learning_rate": 0.000352192771017753,
1531
- "loss": 0.223,
1532
- "step": 102
1533
- },
1534
- {
1535
- "epoch": 5.368421052631579,
1536
- "eval_loss": 0.18867380917072296,
1537
- "eval_runtime": 0.8956,
1538
- "eval_samples_per_second": 33.495,
1539
- "eval_steps_per_second": 4.466,
1540
- "step": 102
1541
- },
1542
- {
1543
- "epoch": 5.421052631578947,
1544
- "grad_norm": 2.672633409500122,
1545
- "learning_rate": 0.0003491335881925407,
1546
- "loss": 0.161,
1547
- "step": 103
1548
- },
1549
- {
1550
- "epoch": 5.421052631578947,
1551
- "eval_loss": 0.1944020837545395,
1552
- "eval_runtime": 0.8924,
1553
- "eval_samples_per_second": 33.616,
1554
- "eval_steps_per_second": 4.482,
1555
- "step": 103
1556
- },
1557
- {
1558
- "epoch": 5.473684210526316,
1559
- "grad_norm": 1.9712008237838745,
1560
- "learning_rate": 0.0003460566888489593,
1561
- "loss": 0.2525,
1562
- "step": 104
1563
- },
1564
- {
1565
- "epoch": 5.473684210526316,
1566
- "eval_loss": 0.17671068012714386,
1567
- "eval_runtime": 0.897,
1568
- "eval_samples_per_second": 33.446,
1569
- "eval_steps_per_second": 4.459,
1570
- "step": 104
1571
- },
1572
- {
1573
- "epoch": 5.526315789473684,
1574
- "grad_norm": 2.2153072357177734,
1575
- "learning_rate": 0.00034296262287070335,
1576
- "loss": 0.2105,
1577
- "step": 105
1578
- },
1579
- {
1580
- "epoch": 5.526315789473684,
1581
- "eval_loss": 0.1715732216835022,
1582
- "eval_runtime": 0.8951,
1583
- "eval_samples_per_second": 33.514,
1584
- "eval_steps_per_second": 4.469,
1585
- "step": 105
1586
- },
1587
- {
1588
- "epoch": 5.578947368421053,
1589
- "grad_norm": 1.8106168508529663,
1590
- "learning_rate": 0.0003398519432093782,
1591
- "loss": 0.259,
1592
- "step": 106
1593
- },
1594
- {
1595
- "epoch": 5.578947368421053,
1596
- "eval_loss": 0.1465868353843689,
1597
- "eval_runtime": 0.9077,
1598
- "eval_samples_per_second": 33.051,
1599
- "eval_steps_per_second": 4.407,
1600
- "step": 106
1601
- },
1602
- {
1603
- "epoch": 5.631578947368421,
1604
- "grad_norm": 2.1159439086914062,
1605
- "learning_rate": 0.0003367252057856802,
1606
- "loss": 0.2065,
1607
- "step": 107
1608
- },
1609
- {
1610
- "epoch": 5.631578947368421,
1611
- "eval_loss": 0.14219093322753906,
1612
- "eval_runtime": 0.9049,
1613
- "eval_samples_per_second": 33.154,
1614
- "eval_steps_per_second": 4.42,
1615
- "step": 107
1616
- },
1617
- {
1618
- "epoch": 5.684210526315789,
1619
- "grad_norm": 1.4467761516571045,
1620
- "learning_rate": 0.00033358296939004547,
1621
- "loss": 0.2083,
1622
- "step": 108
1623
- },
1624
- {
1625
- "epoch": 5.684210526315789,
1626
- "eval_loss": 0.1406753957271576,
1627
- "eval_runtime": 0.8954,
1628
- "eval_samples_per_second": 33.505,
1629
- "eval_steps_per_second": 4.467,
1630
- "step": 108
1631
- },
1632
- {
1633
- "epoch": 5.7368421052631575,
1634
- "grad_norm": 1.3671239614486694,
1635
- "learning_rate": 0.00033042579558278717,
1636
- "loss": 0.1825,
1637
- "step": 109
1638
- },
1639
- {
1640
- "epoch": 5.7368421052631575,
1641
- "eval_loss": 0.13007155060768127,
1642
- "eval_runtime": 0.8998,
1643
- "eval_samples_per_second": 33.342,
1644
- "eval_steps_per_second": 4.446,
1645
- "step": 109
1646
- },
1647
- {
1648
- "epoch": 5.7894736842105265,
1649
- "grad_norm": 1.479944109916687,
1650
- "learning_rate": 0.00032725424859373687,
1651
- "loss": 0.2244,
1652
- "step": 110
1653
- },
1654
- {
1655
- "epoch": 5.7894736842105265,
1656
- "eval_loss": 0.12692232429981232,
1657
- "eval_runtime": 0.901,
1658
- "eval_samples_per_second": 33.298,
1659
- "eval_steps_per_second": 4.44,
1660
- "step": 110
1661
- },
1662
- {
1663
- "epoch": 5.842105263157895,
1664
- "grad_norm": 1.5173969268798828,
1665
- "learning_rate": 0.0003240688952214085,
1666
- "loss": 0.2273,
1667
- "step": 111
1668
- },
1669
- {
1670
- "epoch": 5.842105263157895,
1671
- "eval_loss": 0.12454597651958466,
1672
- "eval_runtime": 0.8987,
1673
- "eval_samples_per_second": 33.382,
1674
- "eval_steps_per_second": 4.451,
1675
- "step": 111
1676
- },
1677
- {
1678
- "epoch": 5.894736842105263,
1679
- "grad_norm": 2.7870988845825195,
1680
- "learning_rate": 0.00032087030473170445,
1681
- "loss": 0.2101,
1682
- "step": 112
1683
- },
1684
- {
1685
- "epoch": 5.894736842105263,
1686
- "eval_loss": 0.12002909928560257,
1687
- "eval_runtime": 0.893,
1688
- "eval_samples_per_second": 33.593,
1689
- "eval_steps_per_second": 4.479,
1690
- "step": 112
1691
- },
1692
- {
1693
- "epoch": 5.947368421052632,
1694
- "grad_norm": 1.3659342527389526,
1695
- "learning_rate": 0.00031765904875617973,
1696
- "loss": 0.1882,
1697
- "step": 113
1698
- },
1699
- {
1700
- "epoch": 5.947368421052632,
1701
- "eval_loss": 0.10573837906122208,
1702
- "eval_runtime": 0.8956,
1703
- "eval_samples_per_second": 33.496,
1704
- "eval_steps_per_second": 4.466,
1705
- "step": 113
1706
- },
1707
- {
1708
- "epoch": 6.0,
1709
- "grad_norm": 1.8464044332504272,
1710
- "learning_rate": 0.00031443570118988356,
1711
- "loss": 0.2285,
1712
- "step": 114
1713
- },
1714
- {
1715
- "epoch": 6.0,
1716
- "eval_loss": 0.10221625119447708,
1717
- "eval_runtime": 0.8955,
1718
- "eval_samples_per_second": 33.501,
1719
- "eval_steps_per_second": 4.467,
1720
- "step": 114
1721
- },
1722
- {
1723
- "epoch": 6.052631578947368,
1724
- "grad_norm": 1.3894392251968384,
1725
- "learning_rate": 0.00031120083808879663,
1726
- "loss": 0.1115,
1727
- "step": 115
1728
- },
1729
- {
1730
- "epoch": 6.052631578947368,
1731
- "eval_loss": 0.09458151459693909,
1732
- "eval_runtime": 0.8981,
1733
- "eval_samples_per_second": 33.405,
1734
- "eval_steps_per_second": 4.454,
1735
- "step": 115
1736
- },
1737
- {
1738
- "epoch": 6.105263157894737,
1739
- "grad_norm": 0.933142364025116,
1740
- "learning_rate": 0.0003079550375668821,
1741
- "loss": 0.0888,
1742
- "step": 116
1743
- },
1744
- {
1745
- "epoch": 6.105263157894737,
1746
- "eval_loss": 0.09364737570285797,
1747
- "eval_runtime": 0.9403,
1748
- "eval_samples_per_second": 31.905,
1749
- "eval_steps_per_second": 4.254,
1750
- "step": 116
1751
- },
1752
- {
1753
- "epoch": 6.157894736842105,
1754
- "grad_norm": 0.9676756262779236,
1755
- "learning_rate": 0.00030469887969276877,
1756
- "loss": 0.0785,
1757
- "step": 117
1758
- },
1759
- {
1760
- "epoch": 6.157894736842105,
1761
- "eval_loss": 0.10635325312614441,
1762
- "eval_runtime": 0.903,
1763
- "eval_samples_per_second": 33.224,
1764
- "eval_steps_per_second": 4.43,
1765
- "step": 117
1766
- },
1767
- {
1768
- "epoch": 6.2105263157894735,
1769
- "grad_norm": 1.249068260192871,
1770
- "learning_rate": 0.00030143294638608487,
1771
- "loss": 0.0938,
1772
- "step": 118
1773
- },
1774
- {
1775
- "epoch": 6.2105263157894735,
1776
- "eval_loss": 0.10837359726428986,
1777
- "eval_runtime": 0.9003,
1778
- "eval_samples_per_second": 33.323,
1779
- "eval_steps_per_second": 4.443,
1780
- "step": 118
1781
- },
1782
- {
1783
- "epoch": 6.2631578947368425,
1784
- "grad_norm": 2.1446304321289062,
1785
- "learning_rate": 0.00029815782131346137,
1786
- "loss": 0.1436,
1787
- "step": 119
1788
- },
1789
- {
1790
- "epoch": 6.2631578947368425,
1791
- "eval_loss": 0.1047668606042862,
1792
- "eval_runtime": 0.9066,
1793
- "eval_samples_per_second": 33.09,
1794
- "eval_steps_per_second": 4.412,
1795
- "step": 119
1796
- },
1797
- {
1798
- "epoch": 6.315789473684211,
1799
- "grad_norm": 1.329365611076355,
1800
- "learning_rate": 0.0002948740897842223,
1801
- "loss": 0.1319,
1802
- "step": 120
1803
- },
1804
- {
1805
- "epoch": 6.315789473684211,
1806
- "eval_loss": 0.10011889785528183,
1807
- "eval_runtime": 0.9034,
1808
- "eval_samples_per_second": 33.206,
1809
- "eval_steps_per_second": 4.428,
1810
- "step": 120
1811
- },
1812
- {
1813
- "epoch": 6.368421052631579,
1814
- "grad_norm": 1.4938923120498657,
1815
- "learning_rate": 0.00029158233864578256,
1816
- "loss": 0.1027,
1817
- "step": 121
1818
- },
1819
- {
1820
- "epoch": 6.368421052631579,
1821
- "eval_loss": 0.09962069243192673,
1822
- "eval_runtime": 0.8983,
1823
- "eval_samples_per_second": 33.396,
1824
- "eval_steps_per_second": 4.453,
1825
- "step": 121
1826
- },
1827
- {
1828
- "epoch": 6.421052631578947,
1829
- "grad_norm": 1.295058250427246,
1830
- "learning_rate": 0.00028828315617877,
1831
- "loss": 0.0763,
1832
- "step": 122
1833
- },
1834
- {
1835
- "epoch": 6.421052631578947,
1836
- "eval_loss": 0.10031073540449142,
1837
- "eval_runtime": 0.9155,
1838
- "eval_samples_per_second": 32.768,
1839
- "eval_steps_per_second": 4.369,
1840
- "step": 122
1841
- },
1842
- {
1843
- "epoch": 6.473684210526316,
1844
- "grad_norm": 1.8959721326828003,
1845
- "learning_rate": 0.0002849771319918922,
1846
- "loss": 0.1292,
1847
- "step": 123
1848
- },
1849
- {
1850
- "epoch": 6.473684210526316,
1851
- "eval_loss": 0.11003147065639496,
1852
- "eval_runtime": 0.919,
1853
- "eval_samples_per_second": 32.644,
1854
- "eval_steps_per_second": 4.353,
1855
- "step": 123
1856
- },
1857
- {
1858
- "epoch": 6.526315789473684,
1859
- "grad_norm": 1.3598809242248535,
1860
- "learning_rate": 0.00028166485691656423,
1861
- "loss": 0.1272,
1862
- "step": 124
1863
- },
1864
- {
1865
- "epoch": 6.526315789473684,
1866
- "eval_loss": 0.10435277968645096,
1867
- "eval_runtime": 0.8989,
1868
- "eval_samples_per_second": 33.374,
1869
- "eval_steps_per_second": 4.45,
1870
- "step": 124
1871
- },
1872
- {
1873
- "epoch": 6.578947368421053,
1874
- "grad_norm": 1.4015425443649292,
1875
- "learning_rate": 0.00027834692290132053,
1876
- "loss": 0.1348,
1877
- "step": 125
1878
- },
1879
- {
1880
- "epoch": 6.578947368421053,
1881
- "eval_loss": 0.10004603117704391,
1882
- "eval_runtime": 0.9016,
1883
- "eval_samples_per_second": 33.273,
1884
- "eval_steps_per_second": 4.436,
1885
- "step": 125
1886
- },
1887
- {
1888
- "epoch": 6.631578947368421,
1889
- "grad_norm": 1.1642837524414062,
1890
- "learning_rate": 0.0002750239229060246,
1891
- "loss": 0.1121,
1892
- "step": 126
1893
- },
1894
- {
1895
- "epoch": 6.631578947368421,
1896
- "eval_loss": 0.09635353088378906,
1897
- "eval_runtime": 0.9239,
1898
- "eval_samples_per_second": 32.47,
1899
- "eval_steps_per_second": 4.329,
1900
- "step": 126
1901
- },
1902
- {
1903
- "epoch": 6.684210526315789,
1904
- "grad_norm": 1.363749384880066,
1905
- "learning_rate": 0.0002716964507958994,
1906
- "loss": 0.1415,
1907
- "step": 127
1908
- },
1909
- {
1910
- "epoch": 6.684210526315789,
1911
- "eval_loss": 0.07641066610813141,
1912
- "eval_runtime": 0.912,
1913
- "eval_samples_per_second": 32.894,
1914
- "eval_steps_per_second": 4.386,
1915
- "step": 127
1916
- },
1917
- {
1918
- "epoch": 6.7368421052631575,
1919
- "grad_norm": 1.292934775352478,
1920
- "learning_rate": 0.0002683651012353955,
1921
- "loss": 0.1513,
1922
- "step": 128
1923
- },
1924
- {
1925
- "epoch": 6.7368421052631575,
1926
- "eval_loss": 0.07172319293022156,
1927
- "eval_runtime": 0.8976,
1928
- "eval_samples_per_second": 33.423,
1929
- "eval_steps_per_second": 4.456,
1930
- "step": 128
1931
- },
1932
- {
1933
- "epoch": 6.7894736842105265,
1934
- "grad_norm": 1.0078333616256714,
1935
- "learning_rate": 0.0002650304695819168,
1936
- "loss": 0.1185,
1937
- "step": 129
1938
- },
1939
- {
1940
- "epoch": 6.7894736842105265,
1941
- "eval_loss": 0.06412829458713531,
1942
- "eval_runtime": 0.8976,
1943
- "eval_samples_per_second": 33.424,
1944
- "eval_steps_per_second": 4.456,
1945
- "step": 129
1946
- },
1947
- {
1948
- "epoch": 6.842105263157895,
1949
- "grad_norm": 1.173531413078308,
1950
- "learning_rate": 0.00026169315177942135,
1951
- "loss": 0.158,
1952
- "step": 130
1953
- },
1954
- {
1955
- "epoch": 6.842105263157895,
1956
- "eval_loss": 0.05317940190434456,
1957
- "eval_runtime": 0.895,
1958
- "eval_samples_per_second": 33.52,
1959
- "eval_steps_per_second": 4.469,
1960
- "step": 130
1961
- },
1962
- {
1963
- "epoch": 6.894736842105263,
1964
- "grad_norm": 1.5312238931655884,
1965
- "learning_rate": 0.0002583537442519187,
1966
- "loss": 0.1335,
1967
- "step": 131
1968
- },
1969
- {
1970
- "epoch": 6.894736842105263,
1971
- "eval_loss": 0.04491396248340607,
1972
- "eval_runtime": 0.9089,
1973
- "eval_samples_per_second": 33.006,
1974
- "eval_steps_per_second": 4.401,
1975
- "step": 131
1976
- },
1977
- {
1978
- "epoch": 6.947368421052632,
1979
- "grad_norm": 1.399732232093811,
1980
- "learning_rate": 0.00025501284379688067,
1981
- "loss": 0.1462,
1982
- "step": 132
1983
- },
1984
- {
1985
- "epoch": 6.947368421052632,
1986
- "eval_loss": 0.050081584602594376,
1987
- "eval_runtime": 0.9127,
1988
- "eval_samples_per_second": 32.871,
1989
- "eval_steps_per_second": 4.383,
1990
- "step": 132
1991
- },
1992
- {
1993
- "epoch": 7.0,
1994
- "grad_norm": 1.5357416868209839,
1995
- "learning_rate": 0.0002516710474785856,
1996
- "loss": 0.1136,
1997
- "step": 133
1998
- },
1999
- {
2000
- "epoch": 7.0,
2001
- "eval_loss": 0.05439286679029465,
2002
- "eval_runtime": 0.8974,
2003
- "eval_samples_per_second": 33.429,
2004
- "eval_steps_per_second": 4.457,
2005
- "step": 133
2006
- },
2007
- {
2008
- "epoch": 7.052631578947368,
2009
- "grad_norm": 0.9891072511672974,
2010
- "learning_rate": 0.0002483289525214145,
2011
- "loss": 0.0741,
2012
- "step": 134
2013
- },
2014
- {
2015
- "epoch": 7.052631578947368,
2016
- "eval_loss": 0.05073266103863716,
2017
- "eval_runtime": 0.8953,
2018
- "eval_samples_per_second": 33.508,
2019
- "eval_steps_per_second": 4.468,
2020
- "step": 134
2021
- },
2022
- {
2023
- "epoch": 7.105263157894737,
2024
- "grad_norm": 0.9686666131019592,
2025
- "learning_rate": 0.00024498715620311935,
2026
- "loss": 0.0518,
2027
- "step": 135
2028
- },
2029
- {
2030
- "epoch": 7.105263157894737,
2031
- "eval_loss": 0.05563385412096977,
2032
- "eval_runtime": 0.9008,
2033
- "eval_samples_per_second": 33.305,
2034
- "eval_steps_per_second": 4.441,
2035
- "step": 135
2036
- },
2037
- {
2038
- "epoch": 7.157894736842105,
2039
- "grad_norm": 1.2277772426605225,
2040
- "learning_rate": 0.00024164625574808144,
2041
- "loss": 0.0436,
2042
- "step": 136
2043
- },
2044
- {
2045
- "epoch": 7.157894736842105,
2046
- "eval_loss": 0.058188486844301224,
2047
- "eval_runtime": 0.9014,
2048
- "eval_samples_per_second": 33.283,
2049
- "eval_steps_per_second": 4.438,
2050
- "step": 136
2051
- },
2052
- {
2053
- "epoch": 7.2105263157894735,
2054
- "grad_norm": 1.3704907894134521,
2055
- "learning_rate": 0.00023830684822057877,
2056
- "loss": 0.1041,
2057
- "step": 137
2058
- },
2059
- {
2060
- "epoch": 7.2105263157894735,
2061
- "eval_loss": 0.06476210802793503,
2062
- "eval_runtime": 0.9007,
2063
- "eval_samples_per_second": 33.306,
2064
- "eval_steps_per_second": 4.441,
2065
- "step": 137
2066
- },
2067
- {
2068
- "epoch": 7.2631578947368425,
2069
- "grad_norm": 16.16583251953125,
2070
- "learning_rate": 0.00023496953041808325,
2071
- "loss": 0.0492,
2072
- "step": 138
2073
- },
2074
- {
2075
- "epoch": 7.2631578947368425,
2076
- "eval_loss": 0.07608657330274582,
2077
- "eval_runtime": 0.8935,
2078
- "eval_samples_per_second": 33.576,
2079
- "eval_steps_per_second": 4.477,
2080
- "step": 138
2081
- },
2082
- {
2083
- "epoch": 7.315789473684211,
2084
- "grad_norm": 1.200278639793396,
2085
- "learning_rate": 0.0002316348987646045,
2086
- "loss": 0.0716,
2087
- "step": 139
2088
- },
2089
- {
2090
- "epoch": 7.315789473684211,
2091
- "eval_loss": 0.07832919806241989,
2092
- "eval_runtime": 0.8933,
2093
- "eval_samples_per_second": 33.584,
2094
- "eval_steps_per_second": 4.478,
2095
- "step": 139
2096
- },
2097
- {
2098
- "epoch": 7.368421052631579,
2099
- "grad_norm": 1.10837984085083,
2100
- "learning_rate": 0.00022830354920410064,
2101
- "loss": 0.0867,
2102
- "step": 140
2103
- },
2104
- {
2105
- "epoch": 7.368421052631579,
2106
- "eval_loss": 0.06757114827632904,
2107
- "eval_runtime": 0.8916,
2108
- "eval_samples_per_second": 33.649,
2109
- "eval_steps_per_second": 4.487,
2110
- "step": 140
2111
- },
2112
- {
2113
- "epoch": 7.421052631578947,
2114
- "grad_norm": 0.8154372572898865,
2115
- "learning_rate": 0.0002249760770939754,
2116
- "loss": 0.0596,
2117
- "step": 141
2118
- },
2119
- {
2120
- "epoch": 7.421052631578947,
2121
- "eval_loss": 0.06439080089330673,
2122
- "eval_runtime": 0.8926,
2123
- "eval_samples_per_second": 33.61,
2124
- "eval_steps_per_second": 4.481,
2125
- "step": 141
2126
- },
2127
- {
2128
- "epoch": 7.473684210526316,
2129
- "grad_norm": 1.3668967485427856,
2130
- "learning_rate": 0.0002216530770986795,
2131
- "loss": 0.0742,
2132
- "step": 142
2133
- },
2134
- {
2135
- "epoch": 7.473684210526316,
2136
- "eval_loss": 0.05956079065799713,
2137
- "eval_runtime": 0.9129,
2138
- "eval_samples_per_second": 32.861,
2139
- "eval_steps_per_second": 4.381,
2140
- "step": 142
2141
- },
2142
- {
2143
- "epoch": 7.526315789473684,
2144
- "grad_norm": 1.1893479824066162,
2145
- "learning_rate": 0.0002183351430834358,
2146
- "loss": 0.0885,
2147
- "step": 143
2148
- },
2149
- {
2150
- "epoch": 7.526315789473684,
2151
- "eval_loss": 0.05919176712632179,
2152
- "eval_runtime": 0.9013,
2153
- "eval_samples_per_second": 33.285,
2154
- "eval_steps_per_second": 4.438,
2155
- "step": 143
2156
- },
2157
- {
2158
- "epoch": 7.578947368421053,
2159
- "grad_norm": 0.9393155574798584,
2160
- "learning_rate": 0.0002150228680081079,
2161
- "loss": 0.069,
2162
- "step": 144
2163
- },
2164
- {
2165
- "epoch": 7.578947368421053,
2166
- "eval_loss": 0.055469710379838943,
2167
- "eval_runtime": 0.8928,
2168
- "eval_samples_per_second": 33.603,
2169
- "eval_steps_per_second": 4.48,
2170
- "step": 144
2171
- },
2172
- {
2173
- "epoch": 7.631578947368421,
2174
- "grad_norm": 1.1879485845565796,
2175
- "learning_rate": 0.00021171684382123,
2176
- "loss": 0.0636,
2177
- "step": 145
2178
- },
2179
- {
2180
- "epoch": 7.631578947368421,
2181
- "eval_loss": 0.048830099403858185,
2182
- "eval_runtime": 0.8962,
2183
- "eval_samples_per_second": 33.476,
2184
- "eval_steps_per_second": 4.463,
2185
- "step": 145
2186
- },
2187
- {
2188
- "epoch": 7.684210526315789,
2189
- "grad_norm": 1.3696624040603638,
2190
- "learning_rate": 0.0002084176613542175,
2191
- "loss": 0.0769,
2192
- "step": 146
2193
- },
2194
- {
2195
- "epoch": 7.684210526315789,
2196
- "eval_loss": 0.04780884087085724,
2197
- "eval_runtime": 0.8944,
2198
- "eval_samples_per_second": 33.543,
2199
- "eval_steps_per_second": 4.472,
2200
- "step": 146
2201
- },
2202
- {
2203
- "epoch": 7.7368421052631575,
2204
- "grad_norm": 0.8504798412322998,
2205
- "learning_rate": 0.00020512591021577773,
2206
- "loss": 0.0452,
2207
- "step": 147
2208
- },
2209
- {
2210
- "epoch": 7.7368421052631575,
2211
- "eval_loss": 0.05237739533185959,
2212
- "eval_runtime": 0.8936,
2213
- "eval_samples_per_second": 33.572,
2214
- "eval_steps_per_second": 4.476,
2215
- "step": 147
2216
- },
2217
- {
2218
- "epoch": 7.7894736842105265,
2219
- "grad_norm": 1.4475505352020264,
2220
- "learning_rate": 0.00020184217868653867,
2221
- "loss": 0.0855,
2222
- "step": 148
2223
- },
2224
- {
2225
- "epoch": 7.7894736842105265,
2226
- "eval_loss": 0.04543802887201309,
2227
- "eval_runtime": 0.896,
2228
- "eval_samples_per_second": 33.484,
2229
- "eval_steps_per_second": 4.465,
2230
- "step": 148
2231
- },
2232
- {
2233
- "epoch": 7.842105263157895,
2234
- "grad_norm": 1.5789515972137451,
2235
- "learning_rate": 0.0001985670536139151,
2236
- "loss": 0.0874,
2237
- "step": 149
2238
- },
2239
- {
2240
- "epoch": 7.842105263157895,
2241
- "eval_loss": 0.0420089028775692,
2242
- "eval_runtime": 0.9085,
2243
- "eval_samples_per_second": 33.022,
2244
- "eval_steps_per_second": 4.403,
2245
- "step": 149
2246
- },
2247
- {
2248
- "epoch": 7.894736842105263,
2249
- "grad_norm": 0.9716910719871521,
2250
- "learning_rate": 0.0001953011203072312,
2251
- "loss": 0.0741,
2252
- "step": 150
2253
- },
2254
- {
2255
- "epoch": 7.894736842105263,
2256
- "eval_loss": 0.053930822759866714,
2257
- "eval_runtime": 0.8925,
2258
- "eval_samples_per_second": 33.612,
2259
- "eval_steps_per_second": 4.482,
2260
- "step": 150
2261
- },
2262
- {
2263
- "epoch": 7.947368421052632,
2264
- "grad_norm": 1.258216142654419,
2265
- "learning_rate": 0.00019204496243311792,
2266
- "loss": 0.0988,
2267
- "step": 151
2268
- },
2269
- {
2270
- "epoch": 7.947368421052632,
2271
- "eval_loss": 0.050727710127830505,
2272
- "eval_runtime": 0.9129,
2273
- "eval_samples_per_second": 32.861,
2274
- "eval_steps_per_second": 4.381,
2275
- "step": 151
2276
- },
2277
- {
2278
- "epoch": 8.0,
2279
- "grad_norm": 1.6167078018188477,
2280
- "learning_rate": 0.00018879916191120349,
2281
- "loss": 0.1526,
2282
- "step": 152
2283
- },
2284
- {
2285
- "epoch": 8.0,
2286
- "eval_loss": 0.044940169900655746,
2287
- "eval_runtime": 0.9149,
2288
- "eval_samples_per_second": 32.792,
2289
- "eval_steps_per_second": 4.372,
2290
- "step": 152
2291
- },
2292
- {
2293
- "epoch": 8.052631578947368,
2294
- "grad_norm": 0.7703630328178406,
2295
- "learning_rate": 0.00018556429881011656,
2296
- "loss": 0.029,
2297
- "step": 153
2298
- },
2299
- {
2300
- "epoch": 8.052631578947368,
2301
- "eval_loss": 0.04256557673215866,
2302
- "eval_runtime": 0.8993,
2303
- "eval_samples_per_second": 33.359,
2304
- "eval_steps_per_second": 4.448,
2305
- "step": 153
2306
- },
2307
- {
2308
- "epoch": 8.105263157894736,
2309
- "grad_norm": 0.7948728799819946,
2310
- "learning_rate": 0.0001823409512438203,
2311
- "loss": 0.0294,
2312
- "step": 154
2313
- },
2314
- {
2315
- "epoch": 8.105263157894736,
2316
- "eval_loss": 0.0398668609559536,
2317
- "eval_runtime": 0.9221,
2318
- "eval_samples_per_second": 32.536,
2319
- "eval_steps_per_second": 4.338,
2320
- "step": 154
2321
- },
2322
- {
2323
- "epoch": 8.157894736842104,
2324
- "grad_norm": 0.5918542146682739,
2325
- "learning_rate": 0.00017912969526829559,
2326
- "loss": 0.0219,
2327
- "step": 155
2328
- },
2329
- {
2330
- "epoch": 8.157894736842104,
2331
- "eval_loss": 0.03863578289747238,
2332
- "eval_runtime": 0.8936,
2333
- "eval_samples_per_second": 33.573,
2334
- "eval_steps_per_second": 4.476,
2335
- "step": 155
2336
- },
2337
- {
2338
- "epoch": 8.210526315789474,
2339
- "grad_norm": 0.5533296465873718,
2340
- "learning_rate": 0.00017593110477859153,
2341
- "loss": 0.0238,
2342
- "step": 156
2343
- },
2344
- {
2345
- "epoch": 8.210526315789474,
2346
- "eval_loss": 0.03713521733880043,
2347
- "eval_runtime": 0.8943,
2348
- "eval_samples_per_second": 33.547,
2349
- "eval_steps_per_second": 4.473,
2350
- "step": 156
2351
- },
2352
- {
2353
- "epoch": 8.263157894736842,
2354
- "grad_norm": 0.5387775897979736,
2355
- "learning_rate": 0.00017274575140626317,
2356
- "loss": 0.0332,
2357
- "step": 157
2358
- },
2359
- {
2360
- "epoch": 8.263157894736842,
2361
- "eval_loss": 0.0393383763730526,
2362
- "eval_runtime": 0.8986,
2363
- "eval_samples_per_second": 33.384,
2364
- "eval_steps_per_second": 4.451,
2365
- "step": 157
2366
- },
2367
- {
2368
- "epoch": 8.31578947368421,
2369
- "grad_norm": 2.716648578643799,
2370
- "learning_rate": 0.00016957420441721284,
2371
- "loss": 0.0508,
2372
- "step": 158
2373
- },
2374
- {
2375
- "epoch": 8.31578947368421,
2376
- "eval_loss": 0.039231013506650925,
2377
- "eval_runtime": 0.911,
2378
- "eval_samples_per_second": 32.932,
2379
- "eval_steps_per_second": 4.391,
2380
- "step": 158
2381
- },
2382
- {
2383
- "epoch": 8.368421052631579,
2384
- "grad_norm": 0.6262527704238892,
2385
- "learning_rate": 0.00016641703060995457,
2386
- "loss": 0.0376,
2387
- "step": 159
2388
- },
2389
- {
2390
- "epoch": 8.368421052631579,
2391
- "eval_loss": 0.03573182597756386,
2392
- "eval_runtime": 0.9178,
2393
- "eval_samples_per_second": 32.685,
2394
- "eval_steps_per_second": 4.358,
2395
- "step": 159
2396
- },
2397
- {
2398
- "epoch": 8.421052631578947,
2399
- "grad_norm": 1.1656262874603271,
2400
- "learning_rate": 0.00016327479421431983,
2401
- "loss": 0.0613,
2402
- "step": 160
2403
- },
2404
- {
2405
- "epoch": 8.421052631578947,
2406
- "eval_loss": 0.029768355190753937,
2407
- "eval_runtime": 0.8926,
2408
- "eval_samples_per_second": 33.61,
2409
- "eval_steps_per_second": 4.481,
2410
- "step": 160
2411
- },
2412
- {
2413
- "epoch": 8.473684210526315,
2414
- "grad_norm": 0.9731020927429199,
2415
- "learning_rate": 0.00016014805679062183,
2416
- "loss": 0.0755,
2417
- "step": 161
2418
- },
2419
- {
2420
- "epoch": 8.473684210526315,
2421
- "eval_loss": 0.022336162626743317,
2422
- "eval_runtime": 0.9064,
2423
- "eval_samples_per_second": 33.097,
2424
- "eval_steps_per_second": 4.413,
2425
- "step": 161
2426
- },
2427
- {
2428
- "epoch": 8.526315789473685,
2429
- "grad_norm": 0.9505934119224548,
2430
- "learning_rate": 0.0001570373771292967,
2431
- "loss": 0.0592,
2432
- "step": 162
2433
- },
2434
- {
2435
- "epoch": 8.526315789473685,
2436
- "eval_loss": 0.019842755049467087,
2437
- "eval_runtime": 0.9099,
2438
- "eval_samples_per_second": 32.97,
2439
- "eval_steps_per_second": 4.396,
2440
- "step": 162
2441
- },
2442
- {
2443
- "epoch": 8.578947368421053,
2444
- "grad_norm": 0.709037184715271,
2445
- "learning_rate": 0.00015394331115104075,
2446
- "loss": 0.0386,
2447
- "step": 163
2448
- },
2449
- {
2450
- "epoch": 8.578947368421053,
2451
- "eval_loss": 0.019390322268009186,
2452
- "eval_runtime": 0.8955,
2453
- "eval_samples_per_second": 33.501,
2454
- "eval_steps_per_second": 4.467,
2455
- "step": 163
2456
- },
2457
- {
2458
- "epoch": 8.631578947368421,
2459
- "grad_norm": 1.0144383907318115,
2460
- "learning_rate": 0.00015086641180745932,
2461
- "loss": 0.0392,
2462
- "step": 164
2463
- },
2464
- {
2465
- "epoch": 8.631578947368421,
2466
- "eval_loss": 0.018627820536494255,
2467
- "eval_runtime": 0.8942,
2468
- "eval_samples_per_second": 33.549,
2469
- "eval_steps_per_second": 4.473,
2470
- "step": 164
2471
- },
2472
- {
2473
- "epoch": 8.68421052631579,
2474
- "grad_norm": 1.3723385334014893,
2475
- "learning_rate": 0.00014780722898224708,
2476
- "loss": 0.0286,
2477
- "step": 165
2478
- },
2479
- {
2480
- "epoch": 8.68421052631579,
2481
- "eval_loss": 0.015356449410319328,
2482
- "eval_runtime": 0.8932,
2483
- "eval_samples_per_second": 33.586,
2484
- "eval_steps_per_second": 4.478,
2485
- "step": 165
2486
- },
2487
- {
2488
- "epoch": 8.736842105263158,
2489
- "grad_norm": 0.6372384428977966,
2490
- "learning_rate": 0.0001447663093929163,
2491
- "loss": 0.0425,
2492
- "step": 166
2493
- },
2494
- {
2495
- "epoch": 8.736842105263158,
2496
- "eval_loss": 0.015127343125641346,
2497
- "eval_runtime": 0.9004,
2498
- "eval_samples_per_second": 33.319,
2499
- "eval_steps_per_second": 4.443,
2500
- "step": 166
2501
- },
2502
- {
2503
- "epoch": 8.789473684210526,
2504
- "grad_norm": 0.7628927826881409,
2505
- "learning_rate": 0.00014174419649309089,
2506
- "loss": 0.0218,
2507
- "step": 167
2508
- },
2509
- {
2510
- "epoch": 8.789473684210526,
2511
- "eval_loss": 0.015446596778929234,
2512
- "eval_runtime": 0.8932,
2513
- "eval_samples_per_second": 33.586,
2514
- "eval_steps_per_second": 4.478,
2515
- "step": 167
2516
- },
2517
- {
2518
- "epoch": 8.842105263157894,
2519
- "grad_norm": 0.7694376111030579,
2520
- "learning_rate": 0.00013874143037538418,
2521
- "loss": 0.0251,
2522
- "step": 168
2523
- },
2524
- {
2525
- "epoch": 8.842105263157894,
2526
- "eval_loss": 0.01555707585066557,
2527
- "eval_runtime": 0.9268,
2528
- "eval_samples_per_second": 32.368,
2529
- "eval_steps_per_second": 4.316,
2530
- "step": 168
2531
- },
2532
- {
2533
- "epoch": 8.894736842105264,
2534
- "grad_norm": 0.7292389869689941,
2535
- "learning_rate": 0.0001357585476748766,
2536
- "loss": 0.0345,
2537
- "step": 169
2538
- },
2539
- {
2540
- "epoch": 8.894736842105264,
2541
- "eval_loss": 0.014117183163762093,
2542
- "eval_runtime": 0.8989,
2543
- "eval_samples_per_second": 33.374,
2544
- "eval_steps_per_second": 4.45,
2545
- "step": 169
2546
- },
2547
- {
2548
- "epoch": 8.947368421052632,
2549
- "grad_norm": 0.7417434453964233,
2550
- "learning_rate": 0.00013279608147321223,
2551
- "loss": 0.0355,
2552
- "step": 170
2553
- },
2554
- {
2555
- "epoch": 8.947368421052632,
2556
- "eval_loss": 0.01502351462841034,
2557
- "eval_runtime": 0.8919,
2558
- "eval_samples_per_second": 33.634,
2559
- "eval_steps_per_second": 4.485,
2560
- "step": 170
2561
- },
2562
- {
2563
- "epoch": 9.0,
2564
- "grad_norm": 1.0023473501205444,
2565
- "learning_rate": 0.00012985456120332905,
2566
- "loss": 0.0463,
2567
- "step": 171
2568
- },
2569
- {
2570
- "epoch": 9.0,
2571
- "eval_loss": 0.015432776883244514,
2572
- "eval_runtime": 0.8928,
2573
- "eval_samples_per_second": 33.604,
2574
- "eval_steps_per_second": 4.481,
2575
- "step": 171
2576
- },
2577
- {
2578
- "epoch": 9.052631578947368,
2579
- "grad_norm": 0.9472024440765381,
2580
- "learning_rate": 0.00012693451255484312,
2581
- "loss": 0.0164,
2582
- "step": 172
2583
- },
2584
- {
2585
- "epoch": 9.052631578947368,
2586
- "eval_loss": 0.015417199581861496,
2587
- "eval_runtime": 0.9209,
2588
- "eval_samples_per_second": 32.576,
2589
- "eval_steps_per_second": 4.344,
2590
- "step": 172
2591
- },
2592
- {
2593
- "epoch": 9.105263157894736,
2594
- "grad_norm": 0.48799633979797363,
2595
- "learning_rate": 0.00012403645738009997,
2596
- "loss": 0.0112,
2597
- "step": 173
2598
- },
2599
- {
2600
- "epoch": 9.105263157894736,
2601
- "eval_loss": 0.015746938064694405,
2602
- "eval_runtime": 0.903,
2603
- "eval_samples_per_second": 33.221,
2604
- "eval_steps_per_second": 4.43,
2605
- "step": 173
2606
- },
2607
- {
2608
- "epoch": 9.157894736842104,
2609
- "grad_norm": 0.38101622462272644,
2610
- "learning_rate": 0.00012116091360091261,
2611
- "loss": 0.0107,
2612
- "step": 174
2613
- },
2614
- {
2615
- "epoch": 9.157894736842104,
2616
- "eval_loss": 0.016201062127947807,
2617
- "eval_runtime": 0.9098,
2618
- "eval_samples_per_second": 32.973,
2619
- "eval_steps_per_second": 4.396,
2620
- "step": 174
2621
- },
2622
- {
2623
- "epoch": 9.210526315789474,
2624
- "grad_norm": 0.5602852702140808,
2625
- "learning_rate": 0.00011830839511600211,
2626
- "loss": 0.0214,
2627
- "step": 175
2628
- },
2629
- {
2630
- "epoch": 9.210526315789474,
2631
- "eval_loss": 0.01637989468872547,
2632
- "eval_runtime": 0.8978,
2633
- "eval_samples_per_second": 33.416,
2634
- "eval_steps_per_second": 4.455,
2635
- "step": 175
2636
- },
2637
- {
2638
- "epoch": 9.263157894736842,
2639
- "grad_norm": 0.48026910424232483,
2640
- "learning_rate": 0.00011547941170915685,
2641
- "loss": 0.0159,
2642
- "step": 176
2643
- },
2644
- {
2645
- "epoch": 9.263157894736842,
2646
- "eval_loss": 0.01590169034898281,
2647
- "eval_runtime": 0.8929,
2648
- "eval_samples_per_second": 33.599,
2649
- "eval_steps_per_second": 4.48,
2650
- "step": 176
2651
- },
2652
- {
2653
- "epoch": 9.31578947368421,
2654
- "grad_norm": 0.42628395557403564,
2655
- "learning_rate": 0.00011267446895812702,
2656
- "loss": 0.0103,
2657
- "step": 177
2658
- },
2659
- {
2660
- "epoch": 9.31578947368421,
2661
- "eval_loss": 0.016489733010530472,
2662
- "eval_runtime": 0.9067,
2663
- "eval_samples_per_second": 33.087,
2664
- "eval_steps_per_second": 4.412,
2665
- "step": 177
2666
- },
2667
- {
2668
- "epoch": 9.368421052631579,
2669
- "grad_norm": 0.31815841794013977,
2670
- "learning_rate": 0.0001098940681442713,
2671
- "loss": 0.0127,
2672
- "step": 178
2673
- },
2674
- {
2675
- "epoch": 9.368421052631579,
2676
- "eval_loss": 0.016672790050506592,
2677
- "eval_runtime": 0.9121,
2678
- "eval_samples_per_second": 32.892,
2679
- "eval_steps_per_second": 4.386,
2680
- "step": 178
2681
- },
2682
- {
2683
- "epoch": 9.421052631578947,
2684
- "grad_norm": 0.9146761894226074,
2685
- "learning_rate": 0.00010713870616297092,
2686
- "loss": 0.0272,
2687
- "step": 179
2688
- },
2689
- {
2690
- "epoch": 9.421052631578947,
2691
- "eval_loss": 0.016623031347990036,
2692
- "eval_runtime": 0.8922,
2693
- "eval_samples_per_second": 33.624,
2694
- "eval_steps_per_second": 4.483,
2695
- "step": 179
2696
- },
2697
- {
2698
- "epoch": 9.473684210526315,
2699
- "grad_norm": 0.7005583643913269,
2700
- "learning_rate": 0.00010440887543482746,
2701
- "loss": 0.0316,
2702
- "step": 180
2703
- },
2704
- {
2705
- "epoch": 9.473684210526315,
2706
- "eval_loss": 0.01275827456265688,
2707
- "eval_runtime": 0.8927,
2708
- "eval_samples_per_second": 33.606,
2709
- "eval_steps_per_second": 4.481,
2710
- "step": 180
2711
- },
2712
- {
2713
- "epoch": 9.526315789473685,
2714
- "grad_norm": 0.650211751461029,
2715
- "learning_rate": 0.0001017050638176612,
2716
- "loss": 0.026,
2717
- "step": 181
2718
- },
2719
- {
2720
- "epoch": 9.526315789473685,
2721
- "eval_loss": 0.00972173921763897,
2722
- "eval_runtime": 0.8929,
2723
- "eval_samples_per_second": 33.597,
2724
- "eval_steps_per_second": 4.48,
2725
- "step": 181
2726
- },
2727
- {
2728
- "epoch": 9.578947368421053,
2729
- "grad_norm": 0.6491077542304993,
2730
- "learning_rate": 9.902775451932386e-05,
2731
- "loss": 0.0118,
2732
- "step": 182
2733
- },
2734
- {
2735
- "epoch": 9.578947368421053,
2736
- "eval_loss": 0.008100698702037334,
2737
- "eval_runtime": 0.8982,
2738
- "eval_samples_per_second": 33.399,
2739
- "eval_steps_per_second": 4.453,
2740
- "step": 182
2741
- },
2742
- {
2743
- "epoch": 9.631578947368421,
2744
- "grad_norm": 0.5663555264472961,
2745
- "learning_rate": 9.637742601134286e-05,
2746
- "loss": 0.0179,
2747
- "step": 183
2748
- },
2749
- {
2750
- "epoch": 9.631578947368421,
2751
- "eval_loss": 0.007195114623755217,
2752
- "eval_runtime": 0.8943,
2753
- "eval_samples_per_second": 33.547,
2754
- "eval_steps_per_second": 4.473,
2755
- "step": 183
2756
- },
2757
- {
2758
- "epoch": 9.68421052631579,
2759
- "grad_norm": 0.45350518822669983,
2760
- "learning_rate": 9.375455194341214e-05,
2761
- "loss": 0.0133,
2762
- "step": 184
2763
- },
2764
- {
2765
- "epoch": 9.68421052631579,
2766
- "eval_loss": 0.005673492327332497,
2767
- "eval_runtime": 0.8957,
2768
- "eval_samples_per_second": 33.493,
2769
- "eval_steps_per_second": 4.466,
2770
- "step": 184
2771
- },
2772
- {
2773
- "epoch": 9.736842105263158,
2774
- "grad_norm": 0.4562082886695862,
2775
- "learning_rate": 9.11596010587441e-05,
2776
- "loss": 0.0116,
2777
- "step": 185
2778
- },
2779
- {
2780
- "epoch": 9.736842105263158,
2781
- "eval_loss": 0.005512699484825134,
2782
- "eval_runtime": 0.8942,
2783
- "eval_samples_per_second": 33.551,
2784
- "eval_steps_per_second": 4.473,
2785
- "step": 185
2786
- },
2787
- {
2788
- "epoch": 9.789473684210526,
2789
- "grad_norm": 0.4965287446975708,
2790
- "learning_rate": 8.85930371102994e-05,
2791
- "loss": 0.0175,
2792
- "step": 186
2793
- },
2794
- {
2795
- "epoch": 9.789473684210526,
2796
- "eval_loss": 0.005058939103037119,
2797
- "eval_runtime": 0.8924,
2798
- "eval_samples_per_second": 33.619,
2799
- "eval_steps_per_second": 4.482,
2800
- "step": 186
2801
- },
2802
- {
2803
- "epoch": 9.842105263157894,
2804
- "grad_norm": 0.4823167324066162,
2805
- "learning_rate": 8.605531877790762e-05,
2806
- "loss": 0.0156,
2807
- "step": 187
2808
- },
2809
- {
2810
- "epoch": 9.842105263157894,
2811
- "eval_loss": 0.004006177186965942,
2812
- "eval_runtime": 0.8937,
2813
- "eval_samples_per_second": 33.568,
2814
- "eval_steps_per_second": 4.476,
2815
- "step": 187
2816
- },
2817
- {
2818
- "epoch": 9.894736842105264,
2819
- "grad_norm": 0.5879040360450745,
2820
- "learning_rate": 8.354689958629513e-05,
2821
- "loss": 0.0147,
2822
- "step": 188
2823
- },
2824
- {
2825
- "epoch": 9.894736842105264,
2826
- "eval_loss": 0.003014415269717574,
2827
- "eval_runtime": 0.8965,
2828
- "eval_samples_per_second": 33.465,
2829
- "eval_steps_per_second": 4.462,
2830
- "step": 188
2831
- },
2832
- {
2833
- "epoch": 9.947368421052632,
2834
- "grad_norm": 0.4576377868652344,
2835
- "learning_rate": 8.106822782403376e-05,
2836
- "loss": 0.0095,
2837
- "step": 189
2838
- },
2839
- {
2840
- "epoch": 9.947368421052632,
2841
- "eval_loss": 0.002746094949543476,
2842
- "eval_runtime": 0.8942,
2843
- "eval_samples_per_second": 33.55,
2844
- "eval_steps_per_second": 4.473,
2845
- "step": 189
2846
- },
2847
- {
2848
- "epoch": 10.0,
2849
- "grad_norm": 0.3874748647212982,
2850
- "learning_rate": 7.861974646342596e-05,
2851
- "loss": 0.0065,
2852
- "step": 190
2853
- },
2854
- {
2855
- "epoch": 10.0,
2856
- "eval_loss": 0.0022730662021785975,
2857
- "eval_runtime": 0.8918,
2858
- "eval_samples_per_second": 33.639,
2859
- "eval_steps_per_second": 4.485,
2860
- "step": 190
2861
- },
2862
- {
2863
- "epoch": 10.052631578947368,
2864
- "grad_norm": 0.19529208540916443,
2865
- "learning_rate": 7.620189308133943e-05,
2866
- "loss": 0.0038,
2867
- "step": 191
2868
- },
2869
- {
2870
- "epoch": 10.052631578947368,
2871
- "eval_loss": 0.0020791899878531694,
2872
- "eval_runtime": 0.8957,
2873
- "eval_samples_per_second": 33.495,
2874
- "eval_steps_per_second": 4.466,
2875
- "step": 191
2876
- },
2877
- {
2878
- "epoch": 10.105263157894736,
2879
- "grad_norm": 0.11527393758296967,
2880
- "learning_rate": 7.381509978100626e-05,
2881
- "loss": 0.0022,
2882
- "step": 192
2883
- },
2884
- {
2885
- "epoch": 10.105263157894736,
2886
- "eval_loss": 0.002016394166275859,
2887
- "eval_runtime": 0.8996,
2888
- "eval_samples_per_second": 33.347,
2889
- "eval_steps_per_second": 4.446,
2890
- "step": 192
2891
- },
2892
- {
2893
- "epoch": 10.157894736842104,
2894
- "grad_norm": 0.15251131355762482,
2895
- "learning_rate": 7.145979311479986e-05,
2896
- "loss": 0.003,
2897
- "step": 193
2898
- },
2899
- {
2900
- "epoch": 10.157894736842104,
2901
- "eval_loss": 0.0021317771170288324,
2902
- "eval_runtime": 0.8932,
2903
- "eval_samples_per_second": 33.585,
2904
- "eval_steps_per_second": 4.478,
2905
- "step": 193
2906
- },
2907
- {
2908
- "epoch": 10.210526315789474,
2909
- "grad_norm": 0.16482071578502655,
2910
- "learning_rate": 6.913639400800489e-05,
2911
- "loss": 0.0024,
2912
- "step": 194
2913
- },
2914
- {
2915
- "epoch": 10.210526315789474,
2916
- "eval_loss": 0.0021966167259961367,
2917
- "eval_runtime": 0.8945,
2918
- "eval_samples_per_second": 33.537,
2919
- "eval_steps_per_second": 4.472,
2920
- "step": 194
2921
- },
2922
- {
2923
- "epoch": 10.263157894736842,
2924
- "grad_norm": 0.14208117127418518,
2925
- "learning_rate": 6.684531768359173e-05,
2926
- "loss": 0.002,
2927
- "step": 195
2928
- },
2929
- {
2930
- "epoch": 10.263157894736842,
2931
- "eval_loss": 0.0022034423891454935,
2932
- "eval_runtime": 0.8952,
2933
- "eval_samples_per_second": 33.511,
2934
- "eval_steps_per_second": 4.468,
2935
- "step": 195
2936
- },
2937
- {
2938
- "epoch": 10.31578947368421,
2939
- "grad_norm": 0.11844911426305771,
2940
- "learning_rate": 6.458697358801061e-05,
2941
- "loss": 0.0018,
2942
- "step": 196
2943
- },
2944
- {
2945
- "epoch": 10.31578947368421,
2946
- "eval_loss": 0.002191495383158326,
2947
- "eval_runtime": 0.8926,
2948
- "eval_samples_per_second": 33.611,
2949
- "eval_steps_per_second": 4.481,
2950
- "step": 196
2951
- },
2952
- {
2953
- "epoch": 10.368421052631579,
2954
- "grad_norm": 0.25322437286376953,
2955
- "learning_rate": 6.236176531801813e-05,
2956
- "loss": 0.0049,
2957
- "step": 197
2958
- },
2959
- {
2960
- "epoch": 10.368421052631579,
2961
- "eval_loss": 0.0022686992306262255,
2962
- "eval_runtime": 0.8949,
2963
- "eval_samples_per_second": 33.525,
2964
- "eval_steps_per_second": 4.47,
2965
- "step": 197
2966
- },
2967
- {
2968
- "epoch": 10.421052631578947,
2969
- "grad_norm": 0.29156965017318726,
2970
- "learning_rate": 6.017009054854858e-05,
2971
- "loss": 0.0045,
2972
- "step": 198
2973
- },
2974
- {
2975
- "epoch": 10.421052631578947,
2976
- "eval_loss": 0.002286201808601618,
2977
- "eval_runtime": 0.8929,
2978
- "eval_samples_per_second": 33.597,
2979
- "eval_steps_per_second": 4.48,
2980
- "step": 198
2981
- },
2982
- {
2983
- "epoch": 10.473684210526315,
2984
- "grad_norm": 0.3855668306350708,
2985
- "learning_rate": 5.801234096164468e-05,
2986
- "loss": 0.0034,
2987
- "step": 199
2988
- },
2989
- {
2990
- "epoch": 10.473684210526315,
2991
- "eval_loss": 0.0018616730812937021,
2992
- "eval_runtime": 0.894,
2993
- "eval_samples_per_second": 33.558,
2994
- "eval_steps_per_second": 4.474,
2995
- "step": 199
2996
- },
2997
- {
2998
- "epoch": 10.526315789473685,
2999
- "grad_norm": 0.2883719205856323,
3000
- "learning_rate": 5.58889021764582e-05,
3001
- "loss": 0.0044,
3002
- "step": 200
3003
- },
3004
- {
3005
- "epoch": 10.526315789473685,
3006
- "eval_loss": 0.0016098986379802227,
3007
- "eval_runtime": 0.8994,
3008
- "eval_samples_per_second": 33.357,
3009
- "eval_steps_per_second": 4.448,
3010
- "step": 200
3011
  }
3012
  ],
3013
  "logging_steps": 1,
@@ -3027,7 +927,7 @@
3027
  "attributes": {}
3028
  }
3029
  },
3030
- "total_flos": 8525733259253760.0,
3031
  "train_batch_size": 1,
3032
  "trial_name": null,
3033
  "trial_params": null
 
1
  {
2
+ "best_global_step": 60,
3
+ "best_metric": 0.5306870341300964,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-60",
5
+ "epoch": 3.1578947368421053,
6
  "eval_steps": 1,
7
+ "global_step": 60,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
908
  "eval_samples_per_second": 33.566,
909
  "eval_steps_per_second": 4.475,
910
  "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  }
912
  ],
913
  "logging_steps": 1,
 
927
  "attributes": {}
928
  }
929
  },
930
+ "total_flos": 2557484965533696.0,
931
  "train_batch_size": 1,
932
  "trial_name": null,
933
  "trial_params": null