| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.02333926539662164, | |
| "eval_steps": 500, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0001166963269831082, | |
| "grad_norm": 0.5096844434738159, | |
| "learning_rate": 4e-05, | |
| "loss": 0.5061, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0002333926539662164, | |
| "grad_norm": 0.5573846697807312, | |
| "learning_rate": 8e-05, | |
| "loss": 0.536, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0003500889809493246, | |
| "grad_norm": 0.5168461203575134, | |
| "learning_rate": 0.00012, | |
| "loss": 0.5209, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0004667853079324328, | |
| "grad_norm": 0.4047943651676178, | |
| "learning_rate": 0.00016, | |
| "loss": 0.4604, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.000583481634915541, | |
| "grad_norm": 0.20825760066509247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.3089, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0007001779618986492, | |
| "grad_norm": 0.20194634795188904, | |
| "learning_rate": 0.00019997664642690334, | |
| "loss": 0.3358, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0008168742888817575, | |
| "grad_norm": 0.193731427192688, | |
| "learning_rate": 0.00019995329285380664, | |
| "loss": 0.3042, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0009335706158648656, | |
| "grad_norm": 0.21419042348861694, | |
| "learning_rate": 0.00019992993928070996, | |
| "loss": 0.341, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.001050266942847974, | |
| "grad_norm": 0.18434806168079376, | |
| "learning_rate": 0.00019990658570761326, | |
| "loss": 0.2665, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.001166963269831082, | |
| "grad_norm": 0.16945861279964447, | |
| "learning_rate": 0.0001998832321345166, | |
| "loss": 0.3113, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0012836595968141902, | |
| "grad_norm": 0.1379944235086441, | |
| "learning_rate": 0.0001998598785614199, | |
| "loss": 0.213, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0014003559237972985, | |
| "grad_norm": 0.12341434508562088, | |
| "learning_rate": 0.00019983652498832322, | |
| "loss": 0.1954, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0015170522507804067, | |
| "grad_norm": 0.12021443247795105, | |
| "learning_rate": 0.00019981317141522654, | |
| "loss": 0.2118, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.001633748577763515, | |
| "grad_norm": 0.16578464210033417, | |
| "learning_rate": 0.00019978981784212984, | |
| "loss": 0.2335, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0017504449047466232, | |
| "grad_norm": 0.13460949063301086, | |
| "learning_rate": 0.00019976646426903317, | |
| "loss": 0.1664, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0018671412317297312, | |
| "grad_norm": 0.13434286415576935, | |
| "learning_rate": 0.0001997431106959365, | |
| "loss": 0.2334, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0019838375587128395, | |
| "grad_norm": 0.15617124736309052, | |
| "learning_rate": 0.00019971975712283982, | |
| "loss": 0.267, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.002100533885695948, | |
| "grad_norm": 0.12410859763622284, | |
| "learning_rate": 0.00019969640354974312, | |
| "loss": 0.2141, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.002217230212679056, | |
| "grad_norm": 0.10960312187671661, | |
| "learning_rate": 0.00019967304997664645, | |
| "loss": 0.1625, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.002333926539662164, | |
| "grad_norm": 0.1421806961297989, | |
| "learning_rate": 0.00019964969640354975, | |
| "loss": 0.2529, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0024506228666452724, | |
| "grad_norm": 0.1390838772058487, | |
| "learning_rate": 0.00019962634283045308, | |
| "loss": 0.1938, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0025673191936283805, | |
| "grad_norm": 0.11065292358398438, | |
| "learning_rate": 0.00019960298925735637, | |
| "loss": 0.1892, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.002684015520611489, | |
| "grad_norm": 0.12330306321382523, | |
| "learning_rate": 0.0001995796356842597, | |
| "loss": 0.2334, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.002800711847594597, | |
| "grad_norm": 0.10883963108062744, | |
| "learning_rate": 0.00019955628211116303, | |
| "loss": 0.1953, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.002917408174577705, | |
| "grad_norm": 0.11904892325401306, | |
| "learning_rate": 0.00019953292853806633, | |
| "loss": 0.191, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0030341045015608134, | |
| "grad_norm": 0.09328188002109528, | |
| "learning_rate": 0.00019950957496496965, | |
| "loss": 0.1517, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0031508008285439214, | |
| "grad_norm": 0.10651596635580063, | |
| "learning_rate": 0.00019948622139187295, | |
| "loss": 0.1778, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.00326749715552703, | |
| "grad_norm": 0.09824363142251968, | |
| "learning_rate": 0.00019946286781877628, | |
| "loss": 0.1412, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.003384193482510138, | |
| "grad_norm": 0.0911468043923378, | |
| "learning_rate": 0.00019943951424567958, | |
| "loss": 0.1413, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.0035008898094932464, | |
| "grad_norm": 0.10707972198724747, | |
| "learning_rate": 0.0001994161606725829, | |
| "loss": 0.166, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0036175861364763544, | |
| "grad_norm": 0.1010749414563179, | |
| "learning_rate": 0.00019939280709948623, | |
| "loss": 0.1762, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.0037342824634594624, | |
| "grad_norm": 0.11201727390289307, | |
| "learning_rate": 0.00019936945352638953, | |
| "loss": 0.1691, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.003850978790442571, | |
| "grad_norm": 0.10435190051794052, | |
| "learning_rate": 0.00019934609995329286, | |
| "loss": 0.1615, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.003967675117425679, | |
| "grad_norm": 0.10385840386152267, | |
| "learning_rate": 0.0001993227463801962, | |
| "loss": 0.1395, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.004084371444408787, | |
| "grad_norm": 0.10580045729875565, | |
| "learning_rate": 0.00019929939280709951, | |
| "loss": 0.188, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.004201067771391896, | |
| "grad_norm": 0.10571294277906418, | |
| "learning_rate": 0.00019927603923400281, | |
| "loss": 0.1778, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.004317764098375003, | |
| "grad_norm": 0.11938229203224182, | |
| "learning_rate": 0.00019925268566090614, | |
| "loss": 0.2181, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.004434460425358112, | |
| "grad_norm": 0.09765143692493439, | |
| "learning_rate": 0.00019922933208780944, | |
| "loss": 0.13, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.00455115675234122, | |
| "grad_norm": 0.11270410567522049, | |
| "learning_rate": 0.00019920597851471277, | |
| "loss": 0.1988, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.004667853079324328, | |
| "grad_norm": 0.09199155867099762, | |
| "learning_rate": 0.0001991826249416161, | |
| "loss": 0.1616, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.004784549406307436, | |
| "grad_norm": 0.10620560497045517, | |
| "learning_rate": 0.0001991592713685194, | |
| "loss": 0.1728, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.004901245733290545, | |
| "grad_norm": 0.09054724127054214, | |
| "learning_rate": 0.00019913591779542272, | |
| "loss": 0.1369, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.005017942060273653, | |
| "grad_norm": 0.10327401012182236, | |
| "learning_rate": 0.00019911256422232602, | |
| "loss": 0.1708, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.005134638387256761, | |
| "grad_norm": 0.09473054111003876, | |
| "learning_rate": 0.00019908921064922935, | |
| "loss": 0.1589, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.005251334714239869, | |
| "grad_norm": 0.10536584258079529, | |
| "learning_rate": 0.00019906585707613265, | |
| "loss": 0.1962, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.005368031041222978, | |
| "grad_norm": 0.10594025254249573, | |
| "learning_rate": 0.00019904250350303597, | |
| "loss": 0.1893, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.005484727368206085, | |
| "grad_norm": 0.09798738360404968, | |
| "learning_rate": 0.00019901914992993927, | |
| "loss": 0.1759, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.005601423695189194, | |
| "grad_norm": 0.08874180912971497, | |
| "learning_rate": 0.0001989957963568426, | |
| "loss": 0.1558, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.005718120022172302, | |
| "grad_norm": 0.11178728193044662, | |
| "learning_rate": 0.00019897244278374593, | |
| "loss": 0.1745, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.00583481634915541, | |
| "grad_norm": 0.0964571163058281, | |
| "learning_rate": 0.00019894908921064923, | |
| "loss": 0.1751, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.005951512676138518, | |
| "grad_norm": 0.10600943863391876, | |
| "learning_rate": 0.00019892573563755255, | |
| "loss": 0.1656, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.006068209003121627, | |
| "grad_norm": 0.10203580558300018, | |
| "learning_rate": 0.00019890238206445585, | |
| "loss": 0.1621, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.006184905330104735, | |
| "grad_norm": 0.11010047048330307, | |
| "learning_rate": 0.0001988790284913592, | |
| "loss": 0.1789, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.006301601657087843, | |
| "grad_norm": 0.11551900953054428, | |
| "learning_rate": 0.0001988556749182625, | |
| "loss": 0.1794, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.006418297984070951, | |
| "grad_norm": 0.11391794681549072, | |
| "learning_rate": 0.00019883232134516583, | |
| "loss": 0.1765, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.00653499431105406, | |
| "grad_norm": 0.11572562158107758, | |
| "learning_rate": 0.00019880896777206913, | |
| "loss": 0.2161, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.006651690638037167, | |
| "grad_norm": 0.09810175001621246, | |
| "learning_rate": 0.00019878561419897246, | |
| "loss": 0.1534, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.006768386965020276, | |
| "grad_norm": 0.10156040638685226, | |
| "learning_rate": 0.00019876226062587579, | |
| "loss": 0.1514, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.006885083292003384, | |
| "grad_norm": 0.09523003548383713, | |
| "learning_rate": 0.00019873890705277909, | |
| "loss": 0.1345, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.007001779618986493, | |
| "grad_norm": 0.11223362386226654, | |
| "learning_rate": 0.0001987155534796824, | |
| "loss": 0.1995, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0071184759459696, | |
| "grad_norm": 0.09502169489860535, | |
| "learning_rate": 0.0001986921999065857, | |
| "loss": 0.1428, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.007235172272952709, | |
| "grad_norm": 0.11790277063846588, | |
| "learning_rate": 0.00019866884633348904, | |
| "loss": 0.1592, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.007351868599935817, | |
| "grad_norm": 0.0922728031873703, | |
| "learning_rate": 0.00019864549276039234, | |
| "loss": 0.1264, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.007468564926918925, | |
| "grad_norm": 0.09835848957300186, | |
| "learning_rate": 0.00019862213918729566, | |
| "loss": 0.131, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.007585261253902033, | |
| "grad_norm": 0.1120564341545105, | |
| "learning_rate": 0.000198598785614199, | |
| "loss": 0.1605, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.007701957580885142, | |
| "grad_norm": 0.10365966707468033, | |
| "learning_rate": 0.0001985754320411023, | |
| "loss": 0.1458, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.00781865390786825, | |
| "grad_norm": 0.12009326368570328, | |
| "learning_rate": 0.00019855207846800562, | |
| "loss": 0.1818, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.007935350234851358, | |
| "grad_norm": 0.10382229834794998, | |
| "learning_rate": 0.00019852872489490892, | |
| "loss": 0.1695, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.008052046561834466, | |
| "grad_norm": 0.12162943184375763, | |
| "learning_rate": 0.00019850537132181224, | |
| "loss": 0.1916, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.008168742888817575, | |
| "grad_norm": 0.1090371385216713, | |
| "learning_rate": 0.00019848201774871554, | |
| "loss": 0.1733, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.008285439215800683, | |
| "grad_norm": 0.1108129546046257, | |
| "learning_rate": 0.00019845866417561887, | |
| "loss": 0.1533, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.008402135542783792, | |
| "grad_norm": 0.10778584331274033, | |
| "learning_rate": 0.0001984353106025222, | |
| "loss": 0.1729, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.008518831869766898, | |
| "grad_norm": 0.14670732617378235, | |
| "learning_rate": 0.00019841195702942552, | |
| "loss": 0.1499, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.008635528196750007, | |
| "grad_norm": 0.10100234299898148, | |
| "learning_rate": 0.00019838860345632882, | |
| "loss": 0.1556, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.008752224523733115, | |
| "grad_norm": 0.11347773671150208, | |
| "learning_rate": 0.00019836524988323215, | |
| "loss": 0.1783, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.008868920850716224, | |
| "grad_norm": 0.09582630544900894, | |
| "learning_rate": 0.00019834189631013548, | |
| "loss": 0.1516, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.008985617177699332, | |
| "grad_norm": 0.09301317483186722, | |
| "learning_rate": 0.00019831854273703878, | |
| "loss": 0.1491, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.00910231350468244, | |
| "grad_norm": 0.12455611675977707, | |
| "learning_rate": 0.0001982951891639421, | |
| "loss": 0.1654, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.00921900983166555, | |
| "grad_norm": 0.11573786288499832, | |
| "learning_rate": 0.0001982718355908454, | |
| "loss": 0.2097, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.009335706158648656, | |
| "grad_norm": 0.09937581419944763, | |
| "learning_rate": 0.00019824848201774873, | |
| "loss": 0.1563, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.009452402485631764, | |
| "grad_norm": 0.11743341386318207, | |
| "learning_rate": 0.00019822512844465203, | |
| "loss": 0.1656, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.009569098812614873, | |
| "grad_norm": 0.10934270918369293, | |
| "learning_rate": 0.00019820177487155536, | |
| "loss": 0.1468, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.009685795139597981, | |
| "grad_norm": 0.11555736511945724, | |
| "learning_rate": 0.00019817842129845868, | |
| "loss": 0.1815, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.00980249146658109, | |
| "grad_norm": 0.11791291832923889, | |
| "learning_rate": 0.00019815506772536198, | |
| "loss": 0.1807, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.009919187793564198, | |
| "grad_norm": 0.1130499318242073, | |
| "learning_rate": 0.0001981317141522653, | |
| "loss": 0.1698, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.010035884120547307, | |
| "grad_norm": 0.10540090501308441, | |
| "learning_rate": 0.0001981083605791686, | |
| "loss": 0.1582, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.010152580447530413, | |
| "grad_norm": 0.09527558833360672, | |
| "learning_rate": 0.00019808500700607194, | |
| "loss": 0.1387, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.010269276774513522, | |
| "grad_norm": 0.11643368750810623, | |
| "learning_rate": 0.00019806165343297524, | |
| "loss": 0.1842, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.01038597310149663, | |
| "grad_norm": 0.11340148001909256, | |
| "learning_rate": 0.00019803829985987856, | |
| "loss": 0.1616, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.010502669428479739, | |
| "grad_norm": 0.12303619831800461, | |
| "learning_rate": 0.0001980149462867819, | |
| "loss": 0.1872, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.010619365755462847, | |
| "grad_norm": 0.09786645323038101, | |
| "learning_rate": 0.0001979915927136852, | |
| "loss": 0.1567, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.010736062082445956, | |
| "grad_norm": 0.10433386266231537, | |
| "learning_rate": 0.00019796823914058854, | |
| "loss": 0.1783, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.010852758409429062, | |
| "grad_norm": 0.31017664074897766, | |
| "learning_rate": 0.00019794488556749184, | |
| "loss": 0.1546, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.01096945473641217, | |
| "grad_norm": 0.10538630187511444, | |
| "learning_rate": 0.00019792153199439517, | |
| "loss": 0.1562, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.01108615106339528, | |
| "grad_norm": 0.10152962803840637, | |
| "learning_rate": 0.00019789817842129847, | |
| "loss": 0.1573, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.011202847390378388, | |
| "grad_norm": 0.1028379276394844, | |
| "learning_rate": 0.0001978748248482018, | |
| "loss": 0.151, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.011319543717361496, | |
| "grad_norm": 0.10292468219995499, | |
| "learning_rate": 0.0001978514712751051, | |
| "loss": 0.1642, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.011436240044344605, | |
| "grad_norm": 0.10416844487190247, | |
| "learning_rate": 0.00019782811770200842, | |
| "loss": 0.1701, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.011552936371327713, | |
| "grad_norm": 0.10852757841348648, | |
| "learning_rate": 0.00019780476412891172, | |
| "loss": 0.1474, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.01166963269831082, | |
| "grad_norm": 0.10577951371669769, | |
| "learning_rate": 0.00019778141055581505, | |
| "loss": 0.18, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.011786329025293928, | |
| "grad_norm": 0.08033058792352676, | |
| "learning_rate": 0.00019775805698271838, | |
| "loss": 0.1211, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.011903025352277037, | |
| "grad_norm": 0.1296456754207611, | |
| "learning_rate": 0.00019773470340962167, | |
| "loss": 0.1822, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.012019721679260145, | |
| "grad_norm": 0.114451102912426, | |
| "learning_rate": 0.000197711349836525, | |
| "loss": 0.1756, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.012136418006243254, | |
| "grad_norm": 0.10711831599473953, | |
| "learning_rate": 0.0001976879962634283, | |
| "loss": 0.1622, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.012253114333226362, | |
| "grad_norm": 0.11008073389530182, | |
| "learning_rate": 0.00019766464269033163, | |
| "loss": 0.1554, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.01236981066020947, | |
| "grad_norm": 0.1160978451371193, | |
| "learning_rate": 0.00019764128911723493, | |
| "loss": 0.1667, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.012486506987192577, | |
| "grad_norm": 0.1413351148366928, | |
| "learning_rate": 0.00019761793554413825, | |
| "loss": 0.191, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.012603203314175686, | |
| "grad_norm": 0.08713728189468384, | |
| "learning_rate": 0.00019759458197104158, | |
| "loss": 0.1353, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.012719899641158794, | |
| "grad_norm": 0.09029239416122437, | |
| "learning_rate": 0.00019757122839794488, | |
| "loss": 0.1078, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.012836595968141903, | |
| "grad_norm": 0.09928172081708908, | |
| "learning_rate": 0.0001975478748248482, | |
| "loss": 0.136, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.012953292295125011, | |
| "grad_norm": 0.11607584357261658, | |
| "learning_rate": 0.00019752452125175153, | |
| "loss": 0.1861, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.01306998862210812, | |
| "grad_norm": 0.12424415349960327, | |
| "learning_rate": 0.00019750116767865486, | |
| "loss": 0.1987, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.013186684949091228, | |
| "grad_norm": 0.12206408381462097, | |
| "learning_rate": 0.00019747781410555816, | |
| "loss": 0.2091, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.013303381276074335, | |
| "grad_norm": 0.09520223736763, | |
| "learning_rate": 0.0001974544605324615, | |
| "loss": 0.1318, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.013420077603057443, | |
| "grad_norm": 0.10052375495433807, | |
| "learning_rate": 0.0001974311069593648, | |
| "loss": 0.1675, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.013536773930040552, | |
| "grad_norm": 0.09986284375190735, | |
| "learning_rate": 0.00019740775338626811, | |
| "loss": 0.1496, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.01365347025702366, | |
| "grad_norm": 0.09899864345788956, | |
| "learning_rate": 0.00019738439981317144, | |
| "loss": 0.1427, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.013770166584006769, | |
| "grad_norm": 0.09922472387552261, | |
| "learning_rate": 0.00019736104624007474, | |
| "loss": 0.1412, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.013886862910989877, | |
| "grad_norm": 0.08671886473894119, | |
| "learning_rate": 0.00019733769266697807, | |
| "loss": 0.1305, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.014003559237972986, | |
| "grad_norm": 0.10284463316202164, | |
| "learning_rate": 0.00019731433909388137, | |
| "loss": 0.1455, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.014120255564956092, | |
| "grad_norm": 0.12423279136419296, | |
| "learning_rate": 0.0001972909855207847, | |
| "loss": 0.1984, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.0142369518919392, | |
| "grad_norm": 0.12210292369127274, | |
| "learning_rate": 0.000197267631947688, | |
| "loss": 0.1854, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.01435364821892231, | |
| "grad_norm": 0.10566538572311401, | |
| "learning_rate": 0.00019724427837459132, | |
| "loss": 0.1347, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.014470344545905418, | |
| "grad_norm": 0.09597185254096985, | |
| "learning_rate": 0.00019722092480149465, | |
| "loss": 0.133, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.014587040872888526, | |
| "grad_norm": 0.11847853660583496, | |
| "learning_rate": 0.00019719757122839795, | |
| "loss": 0.1788, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.014703737199871635, | |
| "grad_norm": 0.10845934599637985, | |
| "learning_rate": 0.00019717421765530127, | |
| "loss": 0.1509, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.014820433526854743, | |
| "grad_norm": 0.09963663667440414, | |
| "learning_rate": 0.00019715086408220457, | |
| "loss": 0.1677, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.01493712985383785, | |
| "grad_norm": 0.09440722316503525, | |
| "learning_rate": 0.0001971275105091079, | |
| "loss": 0.1396, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.015053826180820958, | |
| "grad_norm": 0.11402937024831772, | |
| "learning_rate": 0.0001971041569360112, | |
| "loss": 0.18, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.015170522507804067, | |
| "grad_norm": 0.1282823383808136, | |
| "learning_rate": 0.00019708080336291455, | |
| "loss": 0.1817, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.015287218834787175, | |
| "grad_norm": 0.09704853594303131, | |
| "learning_rate": 0.00019705744978981785, | |
| "loss": 0.1502, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.015403915161770284, | |
| "grad_norm": 0.09895353019237518, | |
| "learning_rate": 0.00019703409621672118, | |
| "loss": 0.1477, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.015520611488753392, | |
| "grad_norm": 0.10989242792129517, | |
| "learning_rate": 0.00019701074264362448, | |
| "loss": 0.1483, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.0156373078157365, | |
| "grad_norm": 0.11348774284124374, | |
| "learning_rate": 0.0001969873890705278, | |
| "loss": 0.1787, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.01575400414271961, | |
| "grad_norm": 0.10849590599536896, | |
| "learning_rate": 0.00019696403549743113, | |
| "loss": 0.1564, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.015870700469702716, | |
| "grad_norm": 0.10929839313030243, | |
| "learning_rate": 0.00019694068192433443, | |
| "loss": 0.1481, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.015987396796685826, | |
| "grad_norm": 0.09660619497299194, | |
| "learning_rate": 0.00019691732835123776, | |
| "loss": 0.1433, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.016104093123668933, | |
| "grad_norm": 0.11423259973526001, | |
| "learning_rate": 0.00019689397477814106, | |
| "loss": 0.1342, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.01622078945065204, | |
| "grad_norm": 0.10947205871343613, | |
| "learning_rate": 0.00019687062120504439, | |
| "loss": 0.1825, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.01633748577763515, | |
| "grad_norm": 0.11746672540903091, | |
| "learning_rate": 0.00019684726763194768, | |
| "loss": 0.1766, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.016454182104618256, | |
| "grad_norm": 0.1152237206697464, | |
| "learning_rate": 0.000196823914058851, | |
| "loss": 0.161, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.016570878431601366, | |
| "grad_norm": 0.10848015546798706, | |
| "learning_rate": 0.00019680056048575434, | |
| "loss": 0.1541, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.016687574758584473, | |
| "grad_norm": 0.12663570046424866, | |
| "learning_rate": 0.00019677720691265764, | |
| "loss": 0.1891, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.016804271085567583, | |
| "grad_norm": 0.1088947057723999, | |
| "learning_rate": 0.00019675385333956096, | |
| "loss": 0.1505, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.01692096741255069, | |
| "grad_norm": 0.10418037325143814, | |
| "learning_rate": 0.00019673049976646426, | |
| "loss": 0.151, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.017037663739533797, | |
| "grad_norm": 0.08672403544187546, | |
| "learning_rate": 0.0001967071461933676, | |
| "loss": 0.1213, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.017154360066516907, | |
| "grad_norm": 0.10863472521305084, | |
| "learning_rate": 0.0001966837926202709, | |
| "loss": 0.1552, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.017271056393500014, | |
| "grad_norm": 0.10580800473690033, | |
| "learning_rate": 0.00019666043904717422, | |
| "loss": 0.1589, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.017387752720483124, | |
| "grad_norm": 0.09545203298330307, | |
| "learning_rate": 0.00019663708547407754, | |
| "loss": 0.139, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.01750444904746623, | |
| "grad_norm": 0.14016349613666534, | |
| "learning_rate": 0.00019661373190098087, | |
| "loss": 0.2126, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.01762114537444934, | |
| "grad_norm": 0.11033914983272552, | |
| "learning_rate": 0.00019659037832788417, | |
| "loss": 0.1644, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.017737841701432448, | |
| "grad_norm": 0.10455331206321716, | |
| "learning_rate": 0.0001965670247547875, | |
| "loss": 0.1499, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.017854538028415554, | |
| "grad_norm": 0.11884409189224243, | |
| "learning_rate": 0.00019654367118169082, | |
| "loss": 0.1731, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.017971234355398664, | |
| "grad_norm": 0.11076351255178452, | |
| "learning_rate": 0.00019652031760859412, | |
| "loss": 0.1782, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.01808793068238177, | |
| "grad_norm": 0.11540203541517258, | |
| "learning_rate": 0.00019649696403549745, | |
| "loss": 0.1731, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.01820462700936488, | |
| "grad_norm": 0.09334211051464081, | |
| "learning_rate": 0.00019647361046240075, | |
| "loss": 0.1576, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.018321323336347988, | |
| "grad_norm": 0.11943213641643524, | |
| "learning_rate": 0.00019645025688930408, | |
| "loss": 0.1715, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.0184380196633311, | |
| "grad_norm": 0.08858149498701096, | |
| "learning_rate": 0.00019642690331620738, | |
| "loss": 0.1282, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.018554715990314205, | |
| "grad_norm": 0.10284683853387833, | |
| "learning_rate": 0.0001964035497431107, | |
| "loss": 0.1727, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.018671412317297312, | |
| "grad_norm": 0.10812927782535553, | |
| "learning_rate": 0.00019638019617001403, | |
| "loss": 0.1741, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.018788108644280422, | |
| "grad_norm": 0.08129740506410599, | |
| "learning_rate": 0.00019635684259691733, | |
| "loss": 0.107, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.01890480497126353, | |
| "grad_norm": 0.11642193049192429, | |
| "learning_rate": 0.00019633348902382066, | |
| "loss": 0.1768, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.01902150129824664, | |
| "grad_norm": 0.10036379098892212, | |
| "learning_rate": 0.00019631013545072396, | |
| "loss": 0.1458, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.019138197625229746, | |
| "grad_norm": 0.0957493856549263, | |
| "learning_rate": 0.00019628678187762728, | |
| "loss": 0.143, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.019254893952212856, | |
| "grad_norm": 0.12155473232269287, | |
| "learning_rate": 0.00019626342830453058, | |
| "loss": 0.1807, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.019371590279195963, | |
| "grad_norm": 0.11061038821935654, | |
| "learning_rate": 0.0001962400747314339, | |
| "loss": 0.1699, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.01948828660617907, | |
| "grad_norm": 0.10963009297847748, | |
| "learning_rate": 0.00019621672115833724, | |
| "loss": 0.1638, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.01960498293316218, | |
| "grad_norm": 0.11229882389307022, | |
| "learning_rate": 0.00019619336758524056, | |
| "loss": 0.1788, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.019721679260145286, | |
| "grad_norm": 0.10003431886434555, | |
| "learning_rate": 0.0001961700140121439, | |
| "loss": 0.1547, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.019838375587128396, | |
| "grad_norm": 0.09698428958654404, | |
| "learning_rate": 0.0001961466604390472, | |
| "loss": 0.1193, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.019955071914111503, | |
| "grad_norm": 0.10166200250387192, | |
| "learning_rate": 0.00019612330686595052, | |
| "loss": 0.1511, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.020071768241094613, | |
| "grad_norm": 0.10210378468036652, | |
| "learning_rate": 0.00019609995329285382, | |
| "loss": 0.1563, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.02018846456807772, | |
| "grad_norm": 0.09979470074176788, | |
| "learning_rate": 0.00019607659971975714, | |
| "loss": 0.1317, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.020305160895060827, | |
| "grad_norm": 0.12395334988832474, | |
| "learning_rate": 0.00019605324614666044, | |
| "loss": 0.1791, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.020421857222043937, | |
| "grad_norm": 0.11577446013689041, | |
| "learning_rate": 0.00019602989257356377, | |
| "loss": 0.1681, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.020538553549027044, | |
| "grad_norm": 0.10168549418449402, | |
| "learning_rate": 0.00019600653900046707, | |
| "loss": 0.1404, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.020655249876010154, | |
| "grad_norm": 0.1242406889796257, | |
| "learning_rate": 0.0001959831854273704, | |
| "loss": 0.1673, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.02077194620299326, | |
| "grad_norm": 0.09891916811466217, | |
| "learning_rate": 0.00019595983185427372, | |
| "loss": 0.1298, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.020888642529976367, | |
| "grad_norm": 0.15590757131576538, | |
| "learning_rate": 0.00019593647828117702, | |
| "loss": 0.1705, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.021005338856959477, | |
| "grad_norm": 0.08277418464422226, | |
| "learning_rate": 0.00019591312470808035, | |
| "loss": 0.1104, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.021122035183942584, | |
| "grad_norm": 0.10521771758794785, | |
| "learning_rate": 0.00019588977113498365, | |
| "loss": 0.1442, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.021238731510925694, | |
| "grad_norm": 0.10389945656061172, | |
| "learning_rate": 0.00019586641756188698, | |
| "loss": 0.1448, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.0213554278379088, | |
| "grad_norm": 0.11086277663707733, | |
| "learning_rate": 0.00019584306398879027, | |
| "loss": 0.1555, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.02147212416489191, | |
| "grad_norm": 0.1056290939450264, | |
| "learning_rate": 0.0001958197104156936, | |
| "loss": 0.161, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.021588820491875018, | |
| "grad_norm": 0.11331475526094437, | |
| "learning_rate": 0.00019579635684259693, | |
| "loss": 0.1236, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.021705516818858125, | |
| "grad_norm": 0.09584867209196091, | |
| "learning_rate": 0.00019577300326950023, | |
| "loss": 0.1282, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.021822213145841235, | |
| "grad_norm": 0.11787907034158707, | |
| "learning_rate": 0.00019574964969640358, | |
| "loss": 0.1843, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.02193890947282434, | |
| "grad_norm": 0.10822898149490356, | |
| "learning_rate": 0.00019572629612330688, | |
| "loss": 0.1638, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.022055605799807452, | |
| "grad_norm": 0.25267189741134644, | |
| "learning_rate": 0.0001957029425502102, | |
| "loss": 0.1718, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.02217230212679056, | |
| "grad_norm": 0.1412399560213089, | |
| "learning_rate": 0.0001956795889771135, | |
| "loss": 0.1803, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.02228899845377367, | |
| "grad_norm": 0.11052538454532623, | |
| "learning_rate": 0.00019565623540401683, | |
| "loss": 0.1638, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.022405694780756776, | |
| "grad_norm": 0.11876215785741806, | |
| "learning_rate": 0.00019563288183092013, | |
| "loss": 0.2179, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.022522391107739882, | |
| "grad_norm": 0.10292577743530273, | |
| "learning_rate": 0.00019560952825782346, | |
| "loss": 0.1568, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.022639087434722992, | |
| "grad_norm": 0.10467737168073654, | |
| "learning_rate": 0.0001955861746847268, | |
| "loss": 0.156, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.0227557837617061, | |
| "grad_norm": 0.116755910217762, | |
| "learning_rate": 0.0001955628211116301, | |
| "loss": 0.1665, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.02287248008868921, | |
| "grad_norm": 0.09144476056098938, | |
| "learning_rate": 0.00019553946753853341, | |
| "loss": 0.1356, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.022989176415672316, | |
| "grad_norm": 0.09073708951473236, | |
| "learning_rate": 0.00019551611396543671, | |
| "loss": 0.14, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.023105872742655426, | |
| "grad_norm": 0.10096915066242218, | |
| "learning_rate": 0.00019549276039234004, | |
| "loss": 0.1595, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.023222569069638533, | |
| "grad_norm": 0.10241077095270157, | |
| "learning_rate": 0.00019546940681924334, | |
| "loss": 0.1479, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.02333926539662164, | |
| "grad_norm": 0.10773292183876038, | |
| "learning_rate": 0.00019544605324614667, | |
| "loss": 0.1548, | |
| "step": 200 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 8569, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.4879936776325523e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |