{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1205,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008298755186721991,
      "grad_norm": 23.66355396613384,
      "learning_rate": 8.264462809917357e-08,
      "loss": 1.4427,
      "step": 1
    },
    {
      "epoch": 0.004149377593360996,
      "grad_norm": 23.666427649458576,
      "learning_rate": 4.132231404958678e-07,
      "loss": 1.4251,
      "step": 5
    },
    {
      "epoch": 0.008298755186721992,
      "grad_norm": 8.13472184690251,
      "learning_rate": 8.264462809917356e-07,
      "loss": 1.3136,
      "step": 10
    },
    {
      "epoch": 0.012448132780082987,
      "grad_norm": 10.227426110416495,
      "learning_rate": 1.2396694214876035e-06,
      "loss": 1.1782,
      "step": 15
    },
    {
      "epoch": 0.016597510373443983,
      "grad_norm": 2.9983433935088764,
      "learning_rate": 1.6528925619834712e-06,
      "loss": 1.0153,
      "step": 20
    },
    {
      "epoch": 0.02074688796680498,
      "grad_norm": 2.7990610124740405,
      "learning_rate": 2.066115702479339e-06,
      "loss": 0.9803,
      "step": 25
    },
    {
      "epoch": 0.024896265560165973,
      "grad_norm": 2.4757391776519464,
      "learning_rate": 2.479338842975207e-06,
      "loss": 0.9414,
      "step": 30
    },
    {
      "epoch": 0.029045643153526972,
      "grad_norm": 2.369524052256136,
      "learning_rate": 2.8925619834710743e-06,
      "loss": 0.9136,
      "step": 35
    },
    {
      "epoch": 0.03319502074688797,
      "grad_norm": 2.2685836127457777,
      "learning_rate": 3.3057851239669424e-06,
      "loss": 0.9046,
      "step": 40
    },
    {
      "epoch": 0.03734439834024896,
      "grad_norm": 2.280617723317066,
      "learning_rate": 3.71900826446281e-06,
      "loss": 0.8837,
      "step": 45
    },
    {
      "epoch": 0.04149377593360996,
      "grad_norm": 2.5321104543831545,
      "learning_rate": 4.132231404958678e-06,
      "loss": 0.8832,
      "step": 50
    },
    {
      "epoch": 0.04564315352697095,
      "grad_norm": 2.431703080743525,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.8805,
      "step": 55
    },
    {
      "epoch": 0.04979253112033195,
      "grad_norm": 2.3196841399300054,
      "learning_rate": 4.958677685950414e-06,
      "loss": 0.8701,
      "step": 60
    },
    {
      "epoch": 0.05394190871369295,
      "grad_norm": 2.3618202415732195,
      "learning_rate": 5.371900826446281e-06,
      "loss": 0.8579,
      "step": 65
    },
    {
      "epoch": 0.058091286307053944,
      "grad_norm": 2.3717003204055405,
      "learning_rate": 5.785123966942149e-06,
      "loss": 0.8505,
      "step": 70
    },
    {
      "epoch": 0.06224066390041494,
      "grad_norm": 2.524204535368378,
      "learning_rate": 6.198347107438017e-06,
      "loss": 0.8402,
      "step": 75
    },
    {
      "epoch": 0.06639004149377593,
      "grad_norm": 2.4015926775695804,
      "learning_rate": 6.611570247933885e-06,
      "loss": 0.8402,
      "step": 80
    },
    {
      "epoch": 0.07053941908713693,
      "grad_norm": 2.388794422413219,
      "learning_rate": 7.0247933884297525e-06,
      "loss": 0.8355,
      "step": 85
    },
    {
      "epoch": 0.07468879668049792,
      "grad_norm": 2.4148605854925704,
      "learning_rate": 7.43801652892562e-06,
      "loss": 0.8256,
      "step": 90
    },
    {
      "epoch": 0.07883817427385892,
      "grad_norm": 2.4479657967512414,
      "learning_rate": 7.851239669421489e-06,
      "loss": 0.8126,
      "step": 95
    },
    {
      "epoch": 0.08298755186721991,
      "grad_norm": 2.4560082465511863,
      "learning_rate": 8.264462809917356e-06,
      "loss": 0.8226,
      "step": 100
    },
    {
      "epoch": 0.08713692946058091,
      "grad_norm": 2.3573417388565976,
      "learning_rate": 8.677685950413224e-06,
      "loss": 0.8175,
      "step": 105
    },
    {
      "epoch": 0.0912863070539419,
      "grad_norm": 2.468662387879913,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.8158,
      "step": 110
    },
    {
      "epoch": 0.0954356846473029,
      "grad_norm": 2.5046573038316837,
      "learning_rate": 9.50413223140496e-06,
      "loss": 0.8116,
      "step": 115
    },
    {
      "epoch": 0.0995850622406639,
      "grad_norm": 2.539643690091556,
      "learning_rate": 9.917355371900828e-06,
      "loss": 0.8005,
      "step": 120
    },
    {
      "epoch": 0.1037344398340249,
      "grad_norm": 2.911760533276573,
      "learning_rate": 9.999664033241933e-06,
      "loss": 0.7939,
      "step": 125
    },
    {
      "epoch": 0.1078838174273859,
      "grad_norm": 2.582851539753205,
      "learning_rate": 9.9982992456671e-06,
      "loss": 0.8055,
      "step": 130
    },
    {
      "epoch": 0.11203319502074689,
      "grad_norm": 2.4256847202133356,
      "learning_rate": 9.99588492570789e-06,
      "loss": 0.7868,
      "step": 135
    },
    {
      "epoch": 0.11618257261410789,
      "grad_norm": 2.363704333423417,
      "learning_rate": 9.992421580318146e-06,
      "loss": 0.7931,
      "step": 140
    },
    {
      "epoch": 0.12033195020746888,
      "grad_norm": 2.5814735842129646,
      "learning_rate": 9.98790993672386e-06,
      "loss": 0.792,
      "step": 145
    },
    {
      "epoch": 0.12448132780082988,
      "grad_norm": 2.3697758205953496,
      "learning_rate": 9.982350942270482e-06,
      "loss": 0.7803,
      "step": 150
    },
    {
      "epoch": 0.12863070539419086,
      "grad_norm": 2.715505233595166,
      "learning_rate": 9.975745764224003e-06,
      "loss": 0.7801,
      "step": 155
    },
    {
      "epoch": 0.13278008298755187,
      "grad_norm": 2.408139672943199,
      "learning_rate": 9.968095789525844e-06,
      "loss": 0.7752,
      "step": 160
    },
    {
      "epoch": 0.13692946058091288,
      "grad_norm": 2.6010481805450576,
      "learning_rate": 9.959402624501636e-06,
      "loss": 0.7643,
      "step": 165
    },
    {
      "epoch": 0.14107883817427386,
      "grad_norm": 2.296000230707304,
      "learning_rate": 9.949668094523923e-06,
      "loss": 0.7693,
      "step": 170
    },
    {
      "epoch": 0.14522821576763487,
      "grad_norm": 2.3353134794630943,
      "learning_rate": 9.938894243628876e-06,
      "loss": 0.76,
      "step": 175
    },
    {
      "epoch": 0.14937759336099585,
      "grad_norm": 2.4548337376467226,
      "learning_rate": 9.927083334087095e-06,
      "loss": 0.7553,
      "step": 180
    },
    {
      "epoch": 0.15352697095435686,
      "grad_norm": 2.4799784321191325,
      "learning_rate": 9.914237845928574e-06,
      "loss": 0.7466,
      "step": 185
    },
    {
      "epoch": 0.15767634854771784,
      "grad_norm": 2.3334152911020754,
      "learning_rate": 9.900360476421953e-06,
      "loss": 0.7469,
      "step": 190
    },
    {
      "epoch": 0.16182572614107885,
      "grad_norm": 2.8513960305920065,
      "learning_rate": 9.885454139508156e-06,
      "loss": 0.7478,
      "step": 195
    },
    {
      "epoch": 0.16597510373443983,
      "grad_norm": 2.3540844011309288,
      "learning_rate": 9.869521965188516e-06,
      "loss": 0.7419,
      "step": 200
    },
    {
      "epoch": 0.17012448132780084,
      "grad_norm": 2.4949727451490458,
      "learning_rate": 9.852567298867557e-06,
      "loss": 0.7443,
      "step": 205
    },
    {
      "epoch": 0.17427385892116182,
      "grad_norm": 2.4359748708393196,
      "learning_rate": 9.83459370065053e-06,
      "loss": 0.715,
      "step": 210
    },
    {
      "epoch": 0.17842323651452283,
      "grad_norm": 2.194244707564717,
      "learning_rate": 9.815604944595856e-06,
      "loss": 0.732,
      "step": 215
    },
    {
      "epoch": 0.1825726141078838,
      "grad_norm": 2.3813196851718668,
      "learning_rate": 9.79560501792268e-06,
      "loss": 0.7133,
      "step": 220
    },
    {
      "epoch": 0.18672199170124482,
      "grad_norm": 2.323752421674217,
      "learning_rate": 9.774598120173625e-06,
      "loss": 0.7241,
      "step": 225
    },
    {
      "epoch": 0.1908713692946058,
      "grad_norm": 2.3526306711964797,
      "learning_rate": 9.752588662332986e-06,
      "loss": 0.7043,
      "step": 230
    },
    {
      "epoch": 0.1950207468879668,
      "grad_norm": 2.2642342063534096,
      "learning_rate": 9.729581265900524e-06,
      "loss": 0.7103,
      "step": 235
    },
    {
      "epoch": 0.1991701244813278,
      "grad_norm": 2.516182372123314,
      "learning_rate": 9.70558076192105e-06,
      "loss": 0.7267,
      "step": 240
    },
    {
      "epoch": 0.2033195020746888,
      "grad_norm": 2.370762773316826,
      "learning_rate": 9.680592189970015e-06,
      "loss": 0.7123,
      "step": 245
    },
    {
      "epoch": 0.2074688796680498,
      "grad_norm": 2.207456321652819,
      "learning_rate": 9.654620797095307e-06,
      "loss": 0.6962,
      "step": 250
    },
    {
      "epoch": 0.21161825726141079,
      "grad_norm": 2.2364777487963643,
      "learning_rate": 9.627672036715484e-06,
      "loss": 0.6898,
      "step": 255
    },
    {
      "epoch": 0.2157676348547718,
      "grad_norm": 2.3651072953613754,
      "learning_rate": 9.599751567474695e-06,
      "loss": 0.6759,
      "step": 260
    },
    {
      "epoch": 0.21991701244813278,
      "grad_norm": 2.450633099288444,
      "learning_rate": 9.570865252054462e-06,
      "loss": 0.7034,
      "step": 265
    },
    {
      "epoch": 0.22406639004149378,
      "grad_norm": 2.3473986013625963,
      "learning_rate": 9.541019155942663e-06,
      "loss": 0.6822,
      "step": 270
    },
    {
      "epoch": 0.22821576763485477,
      "grad_norm": 2.548163258213224,
      "learning_rate": 9.51021954615992e-06,
      "loss": 0.6789,
      "step": 275
    },
    {
      "epoch": 0.23236514522821577,
      "grad_norm": 2.5379693961647303,
      "learning_rate": 9.478472889943644e-06,
      "loss": 0.6625,
      "step": 280
    },
    {
      "epoch": 0.23651452282157676,
      "grad_norm": 2.327924483215091,
      "learning_rate": 9.445785853390074e-06,
      "loss": 0.6721,
      "step": 285
    },
    {
      "epoch": 0.24066390041493776,
      "grad_norm": 2.262881099755119,
      "learning_rate": 9.412165300054536e-06,
      "loss": 0.6566,
      "step": 290
    },
    {
      "epoch": 0.24481327800829875,
      "grad_norm": 2.265488203072667,
      "learning_rate": 9.377618289510251e-06,
      "loss": 0.6608,
      "step": 295
    },
    {
      "epoch": 0.24896265560165975,
      "grad_norm": 2.5878567494362175,
      "learning_rate": 9.34215207586598e-06,
      "loss": 0.6499,
      "step": 300
    },
    {
      "epoch": 0.25311203319502074,
      "grad_norm": 2.3658001383539733,
      "learning_rate": 9.305774106242825e-06,
      "loss": 0.6484,
      "step": 305
    },
    {
      "epoch": 0.2572614107883817,
      "grad_norm": 2.243961082145765,
      "learning_rate": 9.268492019210486e-06,
      "loss": 0.6564,
      "step": 310
    },
    {
      "epoch": 0.26141078838174275,
      "grad_norm": 2.380897449863462,
      "learning_rate": 9.23031364318335e-06,
      "loss": 0.6348,
      "step": 315
    },
    {
      "epoch": 0.26556016597510373,
      "grad_norm": 2.5103759793912817,
      "learning_rate": 9.191246994776676e-06,
      "loss": 0.6363,
      "step": 320
    },
    {
      "epoch": 0.2697095435684647,
      "grad_norm": 2.249912565836941,
      "learning_rate": 9.1513002771233e-06,
      "loss": 0.6419,
      "step": 325
    },
    {
      "epoch": 0.27385892116182575,
      "grad_norm": 2.163581249856587,
      "learning_rate": 9.110481878151147e-06,
      "loss": 0.6301,
      "step": 330
    },
    {
      "epoch": 0.27800829875518673,
      "grad_norm": 2.190865179892157,
      "learning_rate": 9.068800368821957e-06,
      "loss": 0.6325,
      "step": 335
    },
    {
      "epoch": 0.2821576763485477,
      "grad_norm": 2.265573915661597,
      "learning_rate": 9.026264501331571e-06,
      "loss": 0.6322,
      "step": 340
    },
    {
      "epoch": 0.2863070539419087,
      "grad_norm": 2.2910874776204286,
      "learning_rate": 8.982883207272164e-06,
      "loss": 0.623,
      "step": 345
    },
    {
      "epoch": 0.29045643153526973,
      "grad_norm": 2.384654294302928,
      "learning_rate": 8.938665595756807e-06,
      "loss": 0.6246,
      "step": 350
    },
    {
      "epoch": 0.2946058091286307,
      "grad_norm": 2.4124577641889537,
      "learning_rate": 8.893620951506755e-06,
      "loss": 0.6136,
      "step": 355
    },
    {
      "epoch": 0.2987551867219917,
      "grad_norm": 2.6221754387144562,
      "learning_rate": 8.84775873290186e-06,
      "loss": 0.6187,
      "step": 360
    },
    {
      "epoch": 0.3029045643153527,
      "grad_norm": 2.2955995327624277,
      "learning_rate": 8.801088569994523e-06,
      "loss": 0.6048,
      "step": 365
    },
    {
      "epoch": 0.3070539419087137,
      "grad_norm": 2.138665428419567,
      "learning_rate": 8.75362026248759e-06,
      "loss": 0.6031,
      "step": 370
    },
    {
      "epoch": 0.3112033195020747,
      "grad_norm": 2.1328698825170593,
      "learning_rate": 8.705363777676641e-06,
      "loss": 0.6063,
      "step": 375
    },
    {
      "epoch": 0.3153526970954357,
      "grad_norm": 2.1479776795024783,
      "learning_rate": 8.656329248357065e-06,
      "loss": 0.5942,
      "step": 380
    },
    {
      "epoch": 0.31950207468879666,
      "grad_norm": 2.134654863907731,
      "learning_rate": 8.60652697069641e-06,
      "loss": 0.5904,
      "step": 385
    },
    {
      "epoch": 0.3236514522821577,
      "grad_norm": 2.374467014031146,
      "learning_rate": 8.555967402072402e-06,
      "loss": 0.5995,
      "step": 390
    },
    {
      "epoch": 0.3278008298755187,
      "grad_norm": 2.40926239385975,
      "learning_rate": 8.50466115887714e-06,
      "loss": 0.5927,
      "step": 395
    },
    {
      "epoch": 0.33195020746887965,
      "grad_norm": 2.3502075362637767,
      "learning_rate": 8.452619014287882e-06,
      "loss": 0.5847,
      "step": 400
    },
    {
      "epoch": 0.3360995850622407,
      "grad_norm": 2.4126500802319097,
      "learning_rate": 8.399851896004914e-06,
      "loss": 0.5862,
      "step": 405
    },
    {
      "epoch": 0.34024896265560167,
      "grad_norm": 2.2950749664456227,
      "learning_rate": 8.346370883956975e-06,
      "loss": 0.5855,
      "step": 410
    },
    {
      "epoch": 0.34439834024896265,
      "grad_norm": 2.161036752273593,
      "learning_rate": 8.292187207974723e-06,
      "loss": 0.5746,
      "step": 415
    },
    {
      "epoch": 0.34854771784232363,
      "grad_norm": 2.4188236778063352,
      "learning_rate": 8.237312245432709e-06,
      "loss": 0.5879,
      "step": 420
    },
    {
      "epoch": 0.35269709543568467,
      "grad_norm": 2.2090916593497636,
      "learning_rate": 8.181757518860387e-06,
      "loss": 0.5773,
      "step": 425
    },
    {
      "epoch": 0.35684647302904565,
      "grad_norm": 2.3146906711334734,
      "learning_rate": 8.125534693522639e-06,
      "loss": 0.5735,
      "step": 430
    },
    {
      "epoch": 0.36099585062240663,
      "grad_norm": 2.239996996129289,
      "learning_rate": 8.068655574970316e-06,
      "loss": 0.554,
      "step": 435
    },
    {
      "epoch": 0.3651452282157676,
      "grad_norm": 2.363097757813145,
      "learning_rate": 8.011132106561347e-06,
      "loss": 0.5545,
      "step": 440
    },
    {
      "epoch": 0.36929460580912865,
      "grad_norm": 2.27029308132692,
      "learning_rate": 7.952976366952888e-06,
      "loss": 0.5683,
      "step": 445
    },
    {
      "epoch": 0.37344398340248963,
      "grad_norm": 2.1074572809534375,
      "learning_rate": 7.894200567565075e-06,
      "loss": 0.5648,
      "step": 450
    },
    {
      "epoch": 0.3775933609958506,
      "grad_norm": 2.273342227929578,
      "learning_rate": 7.834817050016899e-06,
      "loss": 0.5679,
      "step": 455
    },
    {
      "epoch": 0.3817427385892116,
      "grad_norm": 2.2797216207901383,
      "learning_rate": 7.774838283534724e-06,
      "loss": 0.5559,
      "step": 460
    },
    {
      "epoch": 0.38589211618257263,
      "grad_norm": 2.104532263164514,
      "learning_rate": 7.714276862334051e-06,
      "loss": 0.54,
      "step": 465
    },
    {
      "epoch": 0.3900414937759336,
      "grad_norm": 2.1184787555923212,
      "learning_rate": 7.653145502974982e-06,
      "loss": 0.555,
      "step": 470
    },
    {
      "epoch": 0.3941908713692946,
      "grad_norm": 2.1723348338228328,
      "learning_rate": 7.591457041692045e-06,
      "loss": 0.546,
      "step": 475
    },
    {
      "epoch": 0.3983402489626556,
      "grad_norm": 2.1840500746464295,
      "learning_rate": 7.529224431698858e-06,
      "loss": 0.547,
      "step": 480
    },
    {
      "epoch": 0.4024896265560166,
      "grad_norm": 2.318164164245411,
      "learning_rate": 7.466460740468246e-06,
      "loss": 0.542,
      "step": 485
    },
    {
      "epoch": 0.4066390041493776,
      "grad_norm": 2.3138151208917126,
      "learning_rate": 7.40317914698835e-06,
      "loss": 0.5445,
      "step": 490
    },
    {
      "epoch": 0.4107883817427386,
      "grad_norm": 2.1331277286671186,
      "learning_rate": 7.339392938995349e-06,
      "loss": 0.5319,
      "step": 495
    },
    {
      "epoch": 0.4149377593360996,
      "grad_norm": 2.2351009447186803,
      "learning_rate": 7.2751155101833095e-06,
      "loss": 0.5273,
      "step": 500
    },
    {
      "epoch": 0.4190871369294606,
      "grad_norm": 2.135991611803002,
      "learning_rate": 7.210360357391818e-06,
      "loss": 0.5376,
      "step": 505
    },
    {
      "epoch": 0.42323651452282157,
      "grad_norm": 2.5539346756615906,
      "learning_rate": 7.145141077771938e-06,
      "loss": 0.5273,
      "step": 510
    },
    {
      "epoch": 0.42738589211618255,
      "grad_norm": 2.047264916869935,
      "learning_rate": 7.0794713659311145e-06,
      "loss": 0.5277,
      "step": 515
    },
    {
      "epoch": 0.4315352697095436,
      "grad_norm": 2.2988994571475994,
      "learning_rate": 7.0133650110576e-06,
      "loss": 0.5124,
      "step": 520
    },
    {
      "epoch": 0.43568464730290457,
      "grad_norm": 2.185194372325859,
      "learning_rate": 6.946835894025037e-06,
      "loss": 0.526,
      "step": 525
    },
    {
      "epoch": 0.43983402489626555,
      "grad_norm": 2.1924305119120095,
      "learning_rate": 6.879897984477778e-06,
      "loss": 0.5297,
      "step": 530
    },
    {
      "epoch": 0.44398340248962653,
      "grad_norm": 2.2531761041789458,
      "learning_rate": 6.8125653378975675e-06,
      "loss": 0.5135,
      "step": 535
    },
    {
      "epoch": 0.44813278008298757,
      "grad_norm": 2.1140660080628217,
      "learning_rate": 6.7448520926522084e-06,
      "loss": 0.5209,
      "step": 540
    },
    {
      "epoch": 0.45228215767634855,
      "grad_norm": 2.2942813216608995,
      "learning_rate": 6.676772467026809e-06,
      "loss": 0.5153,
      "step": 545
    },
    {
      "epoch": 0.45643153526970953,
      "grad_norm": 2.1380478037125026,
      "learning_rate": 6.608340756238261e-06,
      "loss": 0.5205,
      "step": 550
    },
    {
      "epoch": 0.4605809128630705,
      "grad_norm": 2.2305987813784665,
      "learning_rate": 6.539571329433562e-06,
      "loss": 0.5042,
      "step": 555
    },
    {
      "epoch": 0.46473029045643155,
      "grad_norm": 2.0259829999672583,
      "learning_rate": 6.470478626672607e-06,
      "loss": 0.5063,
      "step": 560
    },
    {
      "epoch": 0.46887966804979253,
      "grad_norm": 2.1482954433368966,
      "learning_rate": 6.401077155896098e-06,
      "loss": 0.5037,
      "step": 565
    },
    {
      "epoch": 0.4730290456431535,
      "grad_norm": 2.0722196480572967,
      "learning_rate": 6.3313814898792e-06,
      "loss": 0.5119,
      "step": 570
    },
    {
      "epoch": 0.47717842323651455,
      "grad_norm": 2.0766008318122116,
      "learning_rate": 6.261406263171574e-06,
      "loss": 0.4936,
      "step": 575
    },
    {
      "epoch": 0.48132780082987553,
      "grad_norm": 2.170054524337145,
      "learning_rate": 6.191166169024449e-06,
      "loss": 0.5115,
      "step": 580
    },
    {
      "epoch": 0.4854771784232365,
      "grad_norm": 2.165392386687399,
      "learning_rate": 6.120675956305363e-06,
      "loss": 0.506,
      "step": 585
    },
    {
      "epoch": 0.4896265560165975,
      "grad_norm": 2.1917064615978865,
      "learning_rate": 6.049950426401224e-06,
      "loss": 0.4922,
      "step": 590
    },
    {
      "epoch": 0.49377593360995853,
      "grad_norm": 2.1320846413675754,
      "learning_rate": 5.979004430110356e-06,
      "loss": 0.4962,
      "step": 595
    },
    {
      "epoch": 0.4979253112033195,
      "grad_norm": 2.2214982569308943,
      "learning_rate": 5.907852864524141e-06,
      "loss": 0.4798,
      "step": 600
    },
    {
      "epoch": 0.5020746887966805,
      "grad_norm": 2.1478446403131017,
      "learning_rate": 5.836510669898984e-06,
      "loss": 0.488,
      "step": 605
    },
    {
      "epoch": 0.5062240663900415,
      "grad_norm": 2.0711442564532683,
      "learning_rate": 5.7649928265191625e-06,
      "loss": 0.4809,
      "step": 610
    },
    {
      "epoch": 0.5103734439834025,
      "grad_norm": 2.0362299353680875,
      "learning_rate": 5.693314351551317e-06,
      "loss": 0.4779,
      "step": 615
    },
    {
      "epoch": 0.5145228215767634,
      "grad_norm": 2.223869208837942,
      "learning_rate": 5.621490295891172e-06,
      "loss": 0.4816,
      "step": 620
    },
    {
      "epoch": 0.5186721991701245,
      "grad_norm": 2.184093648016579,
      "learning_rate": 5.5495357410031805e-06,
      "loss": 0.4767,
      "step": 625
    },
    {
      "epoch": 0.5228215767634855,
      "grad_norm": 2.0615628269463455,
      "learning_rate": 5.477465795753744e-06,
      "loss": 0.4721,
      "step": 630
    },
    {
      "epoch": 0.5269709543568465,
      "grad_norm": 2.139010346072537,
      "learning_rate": 5.405295593238701e-06,
      "loss": 0.4695,
      "step": 635
    },
    {
      "epoch": 0.5311203319502075,
      "grad_norm": 2.123946607015628,
      "learning_rate": 5.333040287605687e-06,
      "loss": 0.4723,
      "step": 640
    },
    {
      "epoch": 0.5352697095435685,
      "grad_norm": 2.3651276982269964,
      "learning_rate": 5.260715050872119e-06,
      "loss": 0.4624,
      "step": 645
    },
    {
      "epoch": 0.5394190871369294,
      "grad_norm": 2.287295610466921,
      "learning_rate": 5.1883350697394e-06,
      "loss": 0.4644,
      "step": 650
    },
    {
      "epoch": 0.5435684647302904,
      "grad_norm": 2.105320559946653,
      "learning_rate": 5.115915542404045e-06,
      "loss": 0.4584,
      "step": 655
    },
    {
      "epoch": 0.5477178423236515,
      "grad_norm": 2.170432310241974,
      "learning_rate": 5.0434716753663984e-06,
      "loss": 0.4612,
      "step": 660
    },
    {
      "epoch": 0.5518672199170125,
      "grad_norm": 2.194690777716423,
      "learning_rate": 4.971018680237602e-06,
      "loss": 0.4607,
      "step": 665
    },
    {
      "epoch": 0.5560165975103735,
      "grad_norm": 1.9776501173869443,
      "learning_rate": 4.8985717705455e-06,
      "loss": 0.4611,
      "step": 670
    },
    {
      "epoch": 0.5601659751037344,
      "grad_norm": 2.108677954477835,
      "learning_rate": 4.826146158540125e-06,
      "loss": 0.464,
      "step": 675
    },
    {
      "epoch": 0.5643153526970954,
      "grad_norm": 2.2306541731963265,
      "learning_rate": 4.753757051999468e-06,
      "loss": 0.4575,
      "step": 680
    },
    {
      "epoch": 0.5684647302904564,
      "grad_norm": 2.177699396676135,
      "learning_rate": 4.681419651036177e-06,
      "loss": 0.4515,
      "step": 685
    },
    {
      "epoch": 0.5726141078838174,
      "grad_norm": 2.1123134999189603,
      "learning_rate": 4.609149144905874e-06,
      "loss": 0.4609,
      "step": 690
    },
    {
      "epoch": 0.5767634854771784,
      "grad_norm": 2.2267780701787343,
      "learning_rate": 4.536960708817743e-06,
      "loss": 0.4517,
      "step": 695
    },
    {
      "epoch": 0.5809128630705395,
      "grad_norm": 2.0878112877178725,
      "learning_rate": 4.464869500748075e-06,
      "loss": 0.4532,
      "step": 700
    },
    {
      "epoch": 0.5850622406639004,
      "grad_norm": 2.064093943237562,
      "learning_rate": 4.392890658257421e-06,
      "loss": 0.4545,
      "step": 705
    },
    {
      "epoch": 0.5892116182572614,
      "grad_norm": 2.091722197853374,
      "learning_rate": 4.321039295312048e-06,
      "loss": 0.4346,
      "step": 710
    },
    {
      "epoch": 0.5933609958506224,
      "grad_norm": 1.9233248459765655,
      "learning_rate": 4.249330499110334e-06,
      "loss": 0.436,
      "step": 715
    },
    {
      "epoch": 0.5975103734439834,
      "grad_norm": 1.9972967400785588,
      "learning_rate": 4.177779326914793e-06,
      "loss": 0.435,
      "step": 720
    },
    {
      "epoch": 0.6016597510373444,
      "grad_norm": 2.0247163429595743,
      "learning_rate": 4.106400802890377e-06,
      "loss": 0.438,
      "step": 725
    },
    {
      "epoch": 0.6058091286307054,
      "grad_norm": 2.082262528377969,
      "learning_rate": 4.03520991494974e-06,
      "loss": 0.4419,
      "step": 730
    },
    {
      "epoch": 0.6099585062240664,
      "grad_norm": 2.1097886512141386,
      "learning_rate": 3.964221611606108e-06,
      "loss": 0.4355,
      "step": 735
    },
    {
      "epoch": 0.6141078838174274,
      "grad_norm": 2.055338754378088,
      "learning_rate": 3.893450798834412e-06,
      "loss": 0.4365,
      "step": 740
    },
    {
      "epoch": 0.6182572614107884,
      "grad_norm": 2.0758265457246154,
      "learning_rate": 3.822912336941375e-06,
      "loss": 0.4316,
      "step": 745
    },
    {
      "epoch": 0.6224066390041494,
      "grad_norm": 2.0060803377407788,
      "learning_rate": 3.7526210374451665e-06,
      "loss": 0.4336,
      "step": 750
    },
    {
      "epoch": 0.6265560165975104,
      "grad_norm": 2.1438017129720324,
      "learning_rate": 3.6825916599653177e-06,
      "loss": 0.432,
      "step": 755
    },
    {
      "epoch": 0.6307053941908713,
      "grad_norm": 2.066973580119617,
      "learning_rate": 3.6128389091235207e-06,
      "loss": 0.439,
      "step": 760
    },
    {
      "epoch": 0.6348547717842323,
      "grad_norm": 2.0587913853717947,
      "learning_rate": 3.543377431455991e-06,
      "loss": 0.4278,
      "step": 765
    },
    {
      "epoch": 0.6390041493775933,
      "grad_norm": 2.0295102232187894,
      "learning_rate": 3.4742218123380085e-06,
      "loss": 0.4141,
      "step": 770
    },
    {
      "epoch": 0.6431535269709544,
      "grad_norm": 2.1211936908053803,
      "learning_rate": 3.4053865729213267e-06,
      "loss": 0.4269,
      "step": 775
    },
    {
      "epoch": 0.6473029045643154,
      "grad_norm": 2.073558611545013,
      "learning_rate": 3.3368861670850316e-06,
      "loss": 0.4242,
      "step": 780
    },
    {
      "epoch": 0.6514522821576764,
      "grad_norm": 2.1028200278850906,
      "learning_rate": 3.268734978400564e-06,
      "loss": 0.4255,
      "step": 785
    },
    {
      "epoch": 0.6556016597510373,
      "grad_norm": 2.074338043700759,
      "learning_rate": 3.2009473171114748e-06,
      "loss": 0.4127,
      "step": 790
    },
    {
      "epoch": 0.6597510373443983,
      "grad_norm": 2.0169788933206965,
      "learning_rate": 3.1335374171285993e-06,
      "loss": 0.4273,
      "step": 795
    },
    {
      "epoch": 0.6639004149377593,
      "grad_norm": 2.132031529237029,
      "learning_rate": 3.0665194330412453e-06,
      "loss": 0.4199,
      "step": 800
    },
    {
      "epoch": 0.6680497925311203,
      "grad_norm": 1.974289618100589,
      "learning_rate": 2.999907437145042e-06,
      "loss": 0.4141,
      "step": 805
    },
    {
      "epoch": 0.6721991701244814,
      "grad_norm": 1.9950951128394907,
      "learning_rate": 2.9337154164870723e-06,
      "loss": 0.4141,
      "step": 810
    },
    {
      "epoch": 0.6763485477178424,
      "grad_norm": 2.0422579243315298,
      "learning_rate": 2.8679572699288993e-06,
      "loss": 0.4117,
      "step": 815
    },
    {
      "epoch": 0.6804979253112033,
      "grad_norm": 2.049127089823568,
      "learning_rate": 2.802646805228115e-06,
      "loss": 0.4055,
      "step": 820
    },
    {
      "epoch": 0.6846473029045643,
      "grad_norm": 2.0129580531205686,
      "learning_rate": 2.7377977361390118e-06,
      "loss": 0.4032,
      "step": 825
    },
    {
      "epoch": 0.6887966804979253,
      "grad_norm": 2.1533361617257496,
      "learning_rate": 2.673423679533003e-06,
      "loss": 0.4049,
      "step": 830
    },
    {
      "epoch": 0.6929460580912863,
      "grad_norm": 2.012532576052599,
      "learning_rate": 2.609538152539375e-06,
      "loss": 0.4112,
      "step": 835
    },
    {
      "epoch": 0.6970954356846473,
      "grad_norm": 2.0005545981205013,
      "learning_rate": 2.546154569706991e-06,
      "loss": 0.4062,
      "step": 840
    },
    {
      "epoch": 0.7012448132780082,
      "grad_norm": 1.9763702850969325,
      "learning_rate": 2.483286240187538e-06,
      "loss": 0.4076,
      "step": 845
    },
    {
      "epoch": 0.7053941908713693,
      "grad_norm": 1.9363754039901064,
      "learning_rate": 2.420946364940885e-06,
      "loss": 0.3983,
      "step": 850
    },
    {
      "epoch": 0.7095435684647303,
      "grad_norm": 1.9964976216880392,
      "learning_rate": 2.359148033963195e-06,
      "loss": 0.4012,
      "step": 855
    },
    {
      "epoch": 0.7136929460580913,
      "grad_norm": 1.9539357011094127,
      "learning_rate": 2.297904223538289e-06,
      "loss": 0.4014,
      "step": 860
    },
    {
      "epoch": 0.7178423236514523,
      "grad_norm": 2.003285264891127,
      "learning_rate": 2.237227793512935e-06,
      "loss": 0.4005,
      "step": 865
    },
    {
      "epoch": 0.7219917012448133,
      "grad_norm": 2.0469841226757826,
      "learning_rate": 2.1771314845965485e-06,
      "loss": 0.3919,
      "step": 870
    },
    {
      "epoch": 0.7261410788381742,
      "grad_norm": 2.0771139142501482,
      "learning_rate": 2.1176279156859396e-06,
      "loss": 0.3967,
      "step": 875
    },
    {
      "epoch": 0.7302904564315352,
      "grad_norm": 2.0143075674387734,
      "learning_rate": 2.0587295812156117e-06,
      "loss": 0.4042,
      "step": 880
    },
    {
      "epoch": 0.7344398340248963,
      "grad_norm": 2.033797194926859,
      "learning_rate": 2.000448848534209e-06,
      "loss": 0.3889,
      "step": 885
    },
    {
      "epoch": 0.7385892116182573,
      "grad_norm": 2.123163913068704,
      "learning_rate": 1.942797955307655e-06,
      "loss": 0.3927,
      "step": 890
    },
    {
      "epoch": 0.7427385892116183,
      "grad_norm": 2.1029028753084855,
      "learning_rate": 1.8857890069494983e-06,
      "loss": 0.3862,
      "step": 895
    },
    {
      "epoch": 0.7468879668049793,
      "grad_norm": 1.9596988626179463,
      "learning_rate": 1.8294339740790595e-06,
      "loss": 0.3926,
      "step": 900
    },
    {
      "epoch": 0.7510373443983402,
      "grad_norm": 2.045654702642933,
      "learning_rate": 1.7737446900078503e-06,
      "loss": 0.3902,
      "step": 905
    },
    {
      "epoch": 0.7551867219917012,
      "grad_norm": 1.8714455878320024,
      "learning_rate": 1.7187328482548543e-06,
      "loss": 0.3858,
      "step": 910
    },
    {
      "epoch": 0.7593360995850622,
      "grad_norm": 1.9719509461323035,
      "learning_rate": 1.6644100000911312e-06,
      "loss": 0.3858,
      "step": 915
    },
    {
      "epoch": 0.7634854771784232,
      "grad_norm": 2.0199806220076995,
      "learning_rate": 1.61078755211432e-06,
      "loss": 0.3901,
      "step": 920
    },
    {
      "epoch": 0.7676348547717843,
      "grad_norm": 1.8860408333074137,
      "learning_rate": 1.557876763853493e-06,
      "loss": 0.3789,
      "step": 925
    },
    {
      "epoch": 0.7717842323651453,
      "grad_norm": 2.0409354742092725,
      "learning_rate": 1.5056887454049129e-06,
      "loss": 0.3977,
      "step": 930
    },
    {
      "epoch": 0.7759336099585062,
      "grad_norm": 1.986307727470136,
      "learning_rate": 1.4542344550991538e-06,
      "loss": 0.3908,
      "step": 935
    },
    {
      "epoch": 0.7800829875518672,
      "grad_norm": 1.8832678493109687,
      "learning_rate": 1.4035246972000922e-06,
      "loss": 0.3935,
      "step": 940
    },
    {
      "epoch": 0.7842323651452282,
      "grad_norm": 2.020212496636967,
      "learning_rate": 1.353570119636255e-06,
      "loss": 0.379,
      "step": 945
    },
    {
      "epoch": 0.7883817427385892,
      "grad_norm": 2.0199477511715638,
      "learning_rate": 1.3043812117649846e-06,
      "loss": 0.3931,
      "step": 950
    },
    {
      "epoch": 0.7925311203319502,
      "grad_norm": 2.015693992608451,
      "learning_rate": 1.2559683021699132e-06,
      "loss": 0.3774,
      "step": 955
    },
    {
      "epoch": 0.7966804979253111,
      "grad_norm": 2.0495350044403113,
      "learning_rate": 1.2083415564921775e-06,
      "loss": 0.3845,
      "step": 960
    },
    {
      "epoch": 0.8008298755186722,
      "grad_norm": 2.1357993118322,
      "learning_rate": 1.1615109752958715e-06,
      "loss": 0.3781,
      "step": 965
    },
    {
      "epoch": 0.8049792531120332,
      "grad_norm": 2.0787734920639553,
      "learning_rate": 1.1154863919681358e-06,
      "loss": 0.3786,
      "step": 970
    },
    {
      "epoch": 0.8091286307053942,
      "grad_norm": 2.0147166599927817,
      "learning_rate": 1.0702774706543772e-06,
      "loss": 0.3801,
      "step": 975
    },
    {
      "epoch": 0.8132780082987552,
      "grad_norm": 2.1274229351405682,
      "learning_rate": 1.0258937042289986e-06,
      "loss": 0.3758,
      "step": 980
    },
    {
      "epoch": 0.8174273858921162,
      "grad_norm": 2.020236024053247,
      "learning_rate": 9.82344412302117e-07,
      "loss": 0.3745,
      "step": 985
    },
    {
      "epoch": 0.8215767634854771,
      "grad_norm": 2.0310388332900224,
      "learning_rate": 9.396387392626399e-07,
      "loss": 0.3716,
      "step": 990
    },
    {
      "epoch": 0.8257261410788381,
      "grad_norm": 2.1668749663108717,
      "learning_rate": 8.97785652358158e-07,
      "loss": 0.3697,
      "step": 995
    },
    {
      "epoch": 0.8298755186721992,
      "grad_norm": 2.0045742016058914,
      "learning_rate": 8.567939398120095e-07,
      "loss": 0.376,
      "step": 1000
    },
    {
      "epoch": 0.8340248962655602,
      "grad_norm": 1.9913802328205135,
      "learning_rate": 8.166722089779539e-07,
      "loss": 0.3662,
      "step": 1005
    },
    {
      "epoch": 0.8381742738589212,
      "grad_norm": 2.054785546661835,
      "learning_rate": 7.774288845328193e-07,
      "loss": 0.3797,
      "step": 1010
    },
    {
      "epoch": 0.8423236514522822,
      "grad_norm": 2.066321355843617,
      "learning_rate": 7.390722067075024e-07,
      "loss": 0.3836,
      "step": 1015
    },
    {
      "epoch": 0.8464730290456431,
      "grad_norm": 1.9972704924078868,
      "learning_rate": 7.016102295567046e-07,
      "loss": 0.3654,
      "step": 1020
    },
    {
      "epoch": 0.8506224066390041,
      "grad_norm": 1.9358010759861928,
      "learning_rate": 6.650508192677546e-07,
      "loss": 0.3649,
      "step": 1025
    },
    {
      "epoch": 0.8547717842323651,
      "grad_norm": 2.0175675008368645,
      "learning_rate": 6.294016525088914e-07,
      "loss": 0.3682,
      "step": 1030
    },
    {
      "epoch": 0.8589211618257261,
      "grad_norm": 2.046261015462815,
      "learning_rate": 5.946702148173211e-07,
      "loss": 0.3601,
      "step": 1035
    },
    {
      "epoch": 0.8630705394190872,
      "grad_norm": 2.08234471232173,
      "learning_rate": 5.608637990274335e-07,
      "loss": 0.3675,
      "step": 1040
    },
    {
      "epoch": 0.8672199170124482,
      "grad_norm": 2.061640652779725,
      "learning_rate": 5.279895037394567e-07,
      "loss": 0.3675,
      "step": 1045
    },
    {
      "epoch": 0.8713692946058091,
      "grad_norm": 2.050116976236056,
      "learning_rate": 4.960542318289118e-07,
      "loss": 0.3663,
      "step": 1050
    },
    {
      "epoch": 0.8755186721991701,
      "grad_norm": 2.072309354512263,
      "learning_rate": 4.6506468899716607e-07,
      "loss": 0.372,
      "step": 1055
    },
    {
      "epoch": 0.8796680497925311,
      "grad_norm": 2.152457552234192,
      "learning_rate": 4.3502738236337605e-07,
      "loss": 0.3593,
      "step": 1060
    },
    {
      "epoch": 0.8838174273858921,
      "grad_norm": 1.996868735465177,
      "learning_rate": 4.0594861909814266e-07,
      "loss": 0.3604,
      "step": 1065
    },
    {
      "epoch": 0.8879668049792531,
      "grad_norm": 1.9994915003373548,
      "learning_rate": 3.7783450509914e-07,
      "loss": 0.3656,
      "step": 1070
    },
    {
      "epoch": 0.8921161825726142,
      "grad_norm": 1.9740173742965879,
      "learning_rate": 3.5069094370901613e-07,
      "loss": 0.3633,
      "step": 1075
    },
    {
      "epoch": 0.8962655601659751,
      "grad_norm": 1.961688927842617,
      "learning_rate": 3.245236344758179e-07,
      "loss": 0.3634,
      "step": 1080
    },
    {
      "epoch": 0.9004149377593361,
      "grad_norm": 2.0299031717626463,
      "learning_rate": 2.9933807195621446e-07,
      "loss": 0.3643,
      "step": 1085
    },
    {
      "epoch": 0.9045643153526971,
      "grad_norm": 1.9933810295199255,
      "learning_rate": 2.751395445617594e-07,
      "loss": 0.3611,
      "step": 1090
    },
    {
      "epoch": 0.9087136929460581,
      "grad_norm": 1.9428008804611876,
      "learning_rate": 2.5193313344844427e-07,
      "loss": 0.3647,
      "step": 1095
    },
    {
      "epoch": 0.9128630705394191,
      "grad_norm": 2.070451744064332,
      "learning_rate": 2.2972371144976524e-07,
      "loss": 0.3578,
      "step": 1100
    },
    {
      "epoch": 0.91701244813278,
      "grad_norm": 2.004094672792972,
      "learning_rate": 2.0851594205353543e-07,
      "loss": 0.3682,
      "step": 1105
    },
    {
      "epoch": 0.921161825726141,
      "grad_norm": 1.99685429733088,
      "learning_rate": 1.8831427842266047e-07,
      "loss": 0.3629,
      "step": 1110
    },
    {
      "epoch": 0.9253112033195021,
      "grad_norm": 2.0362204051502495,
      "learning_rate": 1.6912296246006544e-07,
      "loss": 0.3619,
      "step": 1115
    },
    {
      "epoch": 0.9294605809128631,
      "grad_norm": 1.9954463784544125,
      "learning_rate": 1.5094602391799606e-07,
      "loss": 0.359,
      "step": 1120
    },
    {
      "epoch": 0.9336099585062241,
      "grad_norm": 2.101222001014506,
      "learning_rate": 1.3378727955185243e-07,
      "loss": 0.3643,
      "step": 1125
    },
    {
      "epoch": 0.9377593360995851,
      "grad_norm": 2.0097408515894477,
      "learning_rate": 1.1765033231876332e-07,
      "loss": 0.365,
      "step": 1130
    },
    {
      "epoch": 0.941908713692946,
      "grad_norm": 2.083152089600553,
      "learning_rate": 1.0253857062103579e-07,
      "loss": 0.356,
      "step": 1135
    },
    {
      "epoch": 0.946058091286307,
      "grad_norm": 2.1343299677958565,
      "learning_rate": 8.845516759467432e-08,
      "loss": 0.3637,
      "step": 1140
    },
    {
      "epoch": 0.950207468879668,
      "grad_norm": 2.0339392745318854,
      "learning_rate": 7.540308044308442e-08,
      "loss": 0.3654,
      "step": 1145
    },
    {
      "epoch": 0.9543568464730291,
      "grad_norm": 2.0235713792250674,
      "learning_rate": 6.338504981613037e-08,
      "loss": 0.3601,
      "step": 1150
    },
    {
      "epoch": 0.9585062240663901,
      "grad_norm": 2.109608243011711,
      "learning_rate": 5.2403599234658876e-08,
      "loss": 0.356,
      "step": 1155
    },
    {
      "epoch": 0.9626556016597511,
      "grad_norm": 2.0760547345374603,
      "learning_rate": 4.246103456061246e-08,
      "loss": 0.3595,
      "step": 1160
    },
    {
      "epoch": 0.966804979253112,
      "grad_norm": 2.052680320906187,
      "learning_rate": 3.355944351285278e-08,
      "loss": 0.3664,
      "step": 1165
    },
    {
      "epoch": 0.970954356846473,
      "grad_norm": 2.0133846012871484,
      "learning_rate": 2.5700695228783045e-08,
      "loss": 0.3602,
      "step": 1170
    },
    {
      "epoch": 0.975103734439834,
      "grad_norm": 2.0460051222086517,
      "learning_rate": 1.8886439871874662e-08,
      "loss": 0.3614,
      "step": 1175
    },
    {
      "epoch": 0.979253112033195,
      "grad_norm": 1.9713443728652713,
      "learning_rate": 1.31181082851628e-08,
      "loss": 0.3634,
      "step": 1180
    },
    {
      "epoch": 0.983402489626556,
      "grad_norm": 1.9598526108360579,
      "learning_rate": 8.396911690807253e-09,
      "loss": 0.3633,
      "step": 1185
    },
    {
      "epoch": 0.9875518672199171,
      "grad_norm": 2.018358583132368,
      "learning_rate": 4.723841435759769e-09,
      "loss": 0.3678,
      "step": 1190
    },
    {
      "epoch": 0.991701244813278,
      "grad_norm": 2.0932112430813317,
      "learning_rate": 2.0996687836016825e-09,
      "loss": 0.364,
      "step": 1195
    },
    {
      "epoch": 0.995850622406639,
      "grad_norm": 2.064858475016426,
      "learning_rate": 5.249447525995654e-10,
      "loss": 0.3573,
      "step": 1200
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.020766577175708,
      "learning_rate": 0.0,
      "loss": 0.3623,
      "step": 1205
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3198564946651459,
      "eval_runtime": 1.1889,
      "eval_samples_per_second": 2.523,
      "eval_steps_per_second": 0.841,
      "step": 1205
    },
    {
      "epoch": 1.0,
      "step": 1205,
      "total_flos": 252197795266560.0,
      "train_loss": 0.5424416576678327,
      "train_runtime": 23511.1257,
      "train_samples_per_second": 1.639,
      "train_steps_per_second": 0.051
    }
  ],
  "logging_steps": 5,
  "max_steps": 1205,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 252197795266560.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}