| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 9140, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02735229759299781, |
| "grad_norm": 0.8252887725830078, |
| "learning_rate": 0.000245, |
| "loss": 4.4979, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.05470459518599562, |
| "grad_norm": 0.14624212682247162, |
| "learning_rate": 0.000495, |
| "loss": 0.0801, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08205689277899343, |
| "grad_norm": 0.20989255607128143, |
| "learning_rate": 0.0004972898230088496, |
| "loss": 0.045, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10940919037199125, |
| "grad_norm": 0.43385007977485657, |
| "learning_rate": 0.0004945243362831859, |
| "loss": 0.038, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13676148796498905, |
| "grad_norm": 0.1885886788368225, |
| "learning_rate": 0.0004917588495575221, |
| "loss": 0.0351, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.16411378555798686, |
| "grad_norm": 0.2390187680721283, |
| "learning_rate": 0.0004889933628318584, |
| "loss": 0.0379, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.19146608315098468, |
| "grad_norm": 0.14649873971939087, |
| "learning_rate": 0.0004862278761061947, |
| "loss": 0.0345, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2188183807439825, |
| "grad_norm": 0.22285760939121246, |
| "learning_rate": 0.000483462389380531, |
| "loss": 0.0308, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2461706783369803, |
| "grad_norm": 0.08991721272468567, |
| "learning_rate": 0.00048069690265486727, |
| "loss": 0.0335, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2735229759299781, |
| "grad_norm": 0.09125715494155884, |
| "learning_rate": 0.00047793141592920353, |
| "loss": 0.03, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.30087527352297594, |
| "grad_norm": 0.06857843697071075, |
| "learning_rate": 0.00047516592920353985, |
| "loss": 0.0304, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3282275711159737, |
| "grad_norm": 0.11107359081506729, |
| "learning_rate": 0.0004724004424778761, |
| "loss": 0.0318, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.35557986870897157, |
| "grad_norm": 0.1745302528142929, |
| "learning_rate": 0.00046963495575221237, |
| "loss": 0.031, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.38293216630196936, |
| "grad_norm": 0.06578712165355682, |
| "learning_rate": 0.0004668694690265487, |
| "loss": 0.0303, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4102844638949672, |
| "grad_norm": 0.08079079538583755, |
| "learning_rate": 0.00046410398230088495, |
| "loss": 0.028, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.437636761487965, |
| "grad_norm": 0.09777480363845825, |
| "learning_rate": 0.0004613384955752212, |
| "loss": 0.0309, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.4649890590809628, |
| "grad_norm": 0.055102407932281494, |
| "learning_rate": 0.0004585730088495575, |
| "loss": 0.0295, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.4923413566739606, |
| "grad_norm": 0.08960958570241928, |
| "learning_rate": 0.00045580752212389384, |
| "loss": 0.0308, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5196936542669585, |
| "grad_norm": 0.09604144841432571, |
| "learning_rate": 0.0004530420353982301, |
| "loss": 0.0275, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.5470459518599562, |
| "grad_norm": 0.15996237099170685, |
| "learning_rate": 0.0004502765486725664, |
| "loss": 0.028, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.574398249452954, |
| "grad_norm": 0.07793421298265457, |
| "learning_rate": 0.0004475110619469027, |
| "loss": 0.0291, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6017505470459519, |
| "grad_norm": 0.21286524832248688, |
| "learning_rate": 0.00044474557522123894, |
| "loss": 0.0267, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6291028446389497, |
| "grad_norm": 0.15267890691757202, |
| "learning_rate": 0.00044198008849557525, |
| "loss": 0.0302, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.6564551422319475, |
| "grad_norm": 0.10255500674247742, |
| "learning_rate": 0.0004392146017699115, |
| "loss": 0.0279, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.6838074398249453, |
| "grad_norm": 0.05677573382854462, |
| "learning_rate": 0.0004364491150442478, |
| "loss": 0.0241, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7111597374179431, |
| "grad_norm": 0.17168180644512177, |
| "learning_rate": 0.0004336836283185841, |
| "loss": 0.0267, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7385120350109409, |
| "grad_norm": 0.06085604801774025, |
| "learning_rate": 0.00043091814159292035, |
| "loss": 0.0274, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.7658643326039387, |
| "grad_norm": 0.11685376614332199, |
| "learning_rate": 0.0004281526548672566, |
| "loss": 0.0252, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.7932166301969366, |
| "grad_norm": 0.0925152450799942, |
| "learning_rate": 0.00042538716814159293, |
| "loss": 0.0264, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8205689277899344, |
| "grad_norm": 0.409482479095459, |
| "learning_rate": 0.0004226216814159292, |
| "loss": 0.0253, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8479212253829321, |
| "grad_norm": 0.07285141944885254, |
| "learning_rate": 0.00041985619469026546, |
| "loss": 0.0275, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.87527352297593, |
| "grad_norm": 0.33443543314933777, |
| "learning_rate": 0.0004170907079646018, |
| "loss": 0.024, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9026258205689278, |
| "grad_norm": 0.09892763197422028, |
| "learning_rate": 0.0004143252212389381, |
| "loss": 0.0256, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.9299781181619255, |
| "grad_norm": 0.05468318983912468, |
| "learning_rate": 0.00041155973451327435, |
| "loss": 0.024, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.9573304157549234, |
| "grad_norm": 0.0622013621032238, |
| "learning_rate": 0.00040879424778761066, |
| "loss": 0.0228, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.9846827133479212, |
| "grad_norm": 0.10611853003501892, |
| "learning_rate": 0.0004060287610619469, |
| "loss": 0.0227, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.012035010940919, |
| "grad_norm": 0.09531939029693604, |
| "learning_rate": 0.0004032632743362832, |
| "loss": 0.0244, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.039387308533917, |
| "grad_norm": 0.04730033501982689, |
| "learning_rate": 0.0004004977876106195, |
| "loss": 0.0248, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.0667396061269148, |
| "grad_norm": 0.06383775174617767, |
| "learning_rate": 0.00039773230088495576, |
| "loss": 0.0224, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.0940919037199124, |
| "grad_norm": 0.08950739353895187, |
| "learning_rate": 0.000394966814159292, |
| "loss": 0.0241, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.1214442013129102, |
| "grad_norm": 0.06031138449907303, |
| "learning_rate": 0.00039220132743362834, |
| "loss": 0.0229, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.148796498905908, |
| "grad_norm": 0.09096775949001312, |
| "learning_rate": 0.0003894358407079646, |
| "loss": 0.0265, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.176148796498906, |
| "grad_norm": 0.042759090662002563, |
| "learning_rate": 0.00038667035398230086, |
| "loss": 0.0224, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.2035010940919038, |
| "grad_norm": 0.1569543182849884, |
| "learning_rate": 0.0003839048672566372, |
| "loss": 0.0234, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.2308533916849016, |
| "grad_norm": 0.10266666859388351, |
| "learning_rate": 0.00038113938053097344, |
| "loss": 0.0221, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.2582056892778994, |
| "grad_norm": 0.11569976806640625, |
| "learning_rate": 0.00037837389380530976, |
| "loss": 0.0234, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.2855579868708973, |
| "grad_norm": 0.1363728791475296, |
| "learning_rate": 0.0003756084070796461, |
| "loss": 0.0245, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.312910284463895, |
| "grad_norm": 0.20765434205532074, |
| "learning_rate": 0.00037284292035398234, |
| "loss": 0.0224, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.3402625820568927, |
| "grad_norm": 0.085990771651268, |
| "learning_rate": 0.0003700774336283186, |
| "loss": 0.0242, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.3676148796498906, |
| "grad_norm": 0.11980952322483063, |
| "learning_rate": 0.00036731194690265486, |
| "loss": 0.0236, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.3949671772428884, |
| "grad_norm": 0.047215647995471954, |
| "learning_rate": 0.0003645464601769912, |
| "loss": 0.024, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.4223194748358863, |
| "grad_norm": 0.13020800054073334, |
| "learning_rate": 0.00036178097345132744, |
| "loss": 0.0216, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.449671772428884, |
| "grad_norm": 0.14372025430202484, |
| "learning_rate": 0.0003590154867256637, |
| "loss": 0.0209, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.4770240700218817, |
| "grad_norm": 0.05821918696165085, |
| "learning_rate": 0.00035625, |
| "loss": 0.0232, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.5043763676148796, |
| "grad_norm": 0.0778370276093483, |
| "learning_rate": 0.0003534845132743363, |
| "loss": 0.0183, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.5317286652078774, |
| "grad_norm": 0.11819358170032501, |
| "learning_rate": 0.00035071902654867254, |
| "loss": 0.0221, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.5590809628008753, |
| "grad_norm": 0.0761452466249466, |
| "learning_rate": 0.00034795353982300885, |
| "loss": 0.0203, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.5864332603938731, |
| "grad_norm": 0.0948249101638794, |
| "learning_rate": 0.0003451880530973451, |
| "loss": 0.0227, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.613785557986871, |
| "grad_norm": 0.05540154129266739, |
| "learning_rate": 0.0003424225663716814, |
| "loss": 0.0224, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.6411378555798688, |
| "grad_norm": 0.11511294543743134, |
| "learning_rate": 0.00033965707964601774, |
| "loss": 0.0217, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.6684901531728666, |
| "grad_norm": 0.05139593780040741, |
| "learning_rate": 0.000336891592920354, |
| "loss": 0.0209, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.6958424507658645, |
| "grad_norm": 0.09306413680315018, |
| "learning_rate": 0.00033412610619469027, |
| "loss": 0.0226, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.723194748358862, |
| "grad_norm": 0.08103025704622269, |
| "learning_rate": 0.0003313606194690266, |
| "loss": 0.0209, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.75054704595186, |
| "grad_norm": 0.03991864249110222, |
| "learning_rate": 0.00032859513274336285, |
| "loss": 0.0192, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.7778993435448578, |
| "grad_norm": 0.029916733503341675, |
| "learning_rate": 0.0003258296460176991, |
| "loss": 0.0195, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.8052516411378556, |
| "grad_norm": 0.06091579794883728, |
| "learning_rate": 0.0003230641592920354, |
| "loss": 0.0198, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.8326039387308533, |
| "grad_norm": 0.06632323563098907, |
| "learning_rate": 0.0003202986725663717, |
| "loss": 0.022, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.859956236323851, |
| "grad_norm": 0.04406097158789635, |
| "learning_rate": 0.00031753318584070795, |
| "loss": 0.019, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.887308533916849, |
| "grad_norm": 0.030064748600125313, |
| "learning_rate": 0.00031476769911504426, |
| "loss": 0.0219, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.9146608315098468, |
| "grad_norm": 0.08452901244163513, |
| "learning_rate": 0.0003120022123893805, |
| "loss": 0.0203, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.9420131291028446, |
| "grad_norm": 0.046714432537555695, |
| "learning_rate": 0.0003092367256637168, |
| "loss": 0.0185, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.9693654266958425, |
| "grad_norm": 0.06433264911174774, |
| "learning_rate": 0.0003064712389380531, |
| "loss": 0.0196, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.9967177242888403, |
| "grad_norm": 0.03903990983963013, |
| "learning_rate": 0.00030370575221238936, |
| "loss": 0.0197, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.024070021881838, |
| "grad_norm": 0.07869091629981995, |
| "learning_rate": 0.0003009402654867257, |
| "loss": 0.0185, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.051422319474836, |
| "grad_norm": 0.08784345537424088, |
| "learning_rate": 0.000298174778761062, |
| "loss": 0.0173, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.078774617067834, |
| "grad_norm": 0.0678941160440445, |
| "learning_rate": 0.00029540929203539825, |
| "loss": 0.0172, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.1061269146608317, |
| "grad_norm": 0.04075619950890541, |
| "learning_rate": 0.0002926438053097345, |
| "loss": 0.0187, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.1334792122538295, |
| "grad_norm": 0.04440777003765106, |
| "learning_rate": 0.00028987831858407083, |
| "loss": 0.0178, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.160831509846827, |
| "grad_norm": 0.11714845895767212, |
| "learning_rate": 0.0002871128318584071, |
| "loss": 0.0173, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.1881838074398248, |
| "grad_norm": 0.05424795299768448, |
| "learning_rate": 0.00028434734513274335, |
| "loss": 0.0191, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.2155361050328226, |
| "grad_norm": 0.05837958678603172, |
| "learning_rate": 0.00028158185840707967, |
| "loss": 0.0195, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.2428884026258205, |
| "grad_norm": 0.06023433431982994, |
| "learning_rate": 0.00027881637168141593, |
| "loss": 0.0175, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.2702407002188183, |
| "grad_norm": 0.054130081087350845, |
| "learning_rate": 0.0002760508849557522, |
| "loss": 0.0193, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.297592997811816, |
| "grad_norm": 0.14617919921875, |
| "learning_rate": 0.0002732853982300885, |
| "loss": 0.019, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.324945295404814, |
| "grad_norm": 0.04461858794093132, |
| "learning_rate": 0.00027051991150442477, |
| "loss": 0.0172, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.352297592997812, |
| "grad_norm": 0.0726858526468277, |
| "learning_rate": 0.00026775442477876103, |
| "loss": 0.0165, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.3796498905908097, |
| "grad_norm": 0.08061967045068741, |
| "learning_rate": 0.00026498893805309735, |
| "loss": 0.0176, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.4070021881838075, |
| "grad_norm": 0.10574040561914444, |
| "learning_rate": 0.00026222345132743366, |
| "loss": 0.0173, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.4343544857768054, |
| "grad_norm": 0.05545186251401901, |
| "learning_rate": 0.0002594579646017699, |
| "loss": 0.0173, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.461706783369803, |
| "grad_norm": 0.24571385979652405, |
| "learning_rate": 0.00025669247787610624, |
| "loss": 0.0176, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.489059080962801, |
| "grad_norm": 0.09013593196868896, |
| "learning_rate": 0.0002539269911504425, |
| "loss": 0.0178, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.516411378555799, |
| "grad_norm": 0.03443370759487152, |
| "learning_rate": 0.00025116150442477876, |
| "loss": 0.0161, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.5437636761487967, |
| "grad_norm": 0.0702081173658371, |
| "learning_rate": 0.000248396017699115, |
| "loss": 0.0184, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.5711159737417946, |
| "grad_norm": 0.2026512324810028, |
| "learning_rate": 0.00024563053097345134, |
| "loss": 0.0168, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.598468271334792, |
| "grad_norm": 0.09723120927810669, |
| "learning_rate": 0.0002428650442477876, |
| "loss": 0.0173, |
| "step": 4750 |
| }, |
| { |
| "epoch": 2.62582056892779, |
| "grad_norm": 0.05518170818686485, |
| "learning_rate": 0.0002400995575221239, |
| "loss": 0.0174, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.6531728665207877, |
| "grad_norm": 0.09659027308225632, |
| "learning_rate": 0.00023733407079646018, |
| "loss": 0.0163, |
| "step": 4850 |
| }, |
| { |
| "epoch": 2.6805251641137855, |
| "grad_norm": 0.05287766829133034, |
| "learning_rate": 0.00023456858407079644, |
| "loss": 0.017, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.7078774617067833, |
| "grad_norm": 0.06743517518043518, |
| "learning_rate": 0.00023180309734513276, |
| "loss": 0.0163, |
| "step": 4950 |
| }, |
| { |
| "epoch": 2.735229759299781, |
| "grad_norm": 0.07032987475395203, |
| "learning_rate": 0.00022903761061946905, |
| "loss": 0.0176, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.762582056892779, |
| "grad_norm": 0.08675131946802139, |
| "learning_rate": 0.0002262721238938053, |
| "loss": 0.0153, |
| "step": 5050 |
| }, |
| { |
| "epoch": 2.789934354485777, |
| "grad_norm": 0.0664379745721817, |
| "learning_rate": 0.0002235066371681416, |
| "loss": 0.0173, |
| "step": 5100 |
| }, |
| { |
| "epoch": 2.8172866520787747, |
| "grad_norm": 0.08365903049707413, |
| "learning_rate": 0.00022074115044247789, |
| "loss": 0.0159, |
| "step": 5150 |
| }, |
| { |
| "epoch": 2.8446389496717726, |
| "grad_norm": 0.050819192081689835, |
| "learning_rate": 0.00021797566371681415, |
| "loss": 0.0148, |
| "step": 5200 |
| }, |
| { |
| "epoch": 2.8719912472647704, |
| "grad_norm": 0.10989898443222046, |
| "learning_rate": 0.00021521017699115044, |
| "loss": 0.0163, |
| "step": 5250 |
| }, |
| { |
| "epoch": 2.899343544857768, |
| "grad_norm": 0.1679241806268692, |
| "learning_rate": 0.00021244469026548675, |
| "loss": 0.017, |
| "step": 5300 |
| }, |
| { |
| "epoch": 2.9266958424507656, |
| "grad_norm": 0.077048659324646, |
| "learning_rate": 0.000209679203539823, |
| "loss": 0.0162, |
| "step": 5350 |
| }, |
| { |
| "epoch": 2.9540481400437635, |
| "grad_norm": 0.0816880315542221, |
| "learning_rate": 0.0002069137168141593, |
| "loss": 0.0159, |
| "step": 5400 |
| }, |
| { |
| "epoch": 2.9814004376367613, |
| "grad_norm": 0.07774699479341507, |
| "learning_rate": 0.0002041482300884956, |
| "loss": 0.0154, |
| "step": 5450 |
| }, |
| { |
| "epoch": 3.008752735229759, |
| "grad_norm": 0.07249217480421066, |
| "learning_rate": 0.00020138274336283185, |
| "loss": 0.0169, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.036105032822757, |
| "grad_norm": 0.054936815053224564, |
| "learning_rate": 0.00019861725663716814, |
| "loss": 0.0145, |
| "step": 5550 |
| }, |
| { |
| "epoch": 3.063457330415755, |
| "grad_norm": 0.08779731392860413, |
| "learning_rate": 0.00019585176991150443, |
| "loss": 0.0152, |
| "step": 5600 |
| }, |
| { |
| "epoch": 3.0908096280087527, |
| "grad_norm": 0.041388992220163345, |
| "learning_rate": 0.00019308628318584072, |
| "loss": 0.0143, |
| "step": 5650 |
| }, |
| { |
| "epoch": 3.1181619256017505, |
| "grad_norm": 0.07258164137601852, |
| "learning_rate": 0.000190320796460177, |
| "loss": 0.0146, |
| "step": 5700 |
| }, |
| { |
| "epoch": 3.1455142231947484, |
| "grad_norm": 0.08956324309110641, |
| "learning_rate": 0.0001875553097345133, |
| "loss": 0.0135, |
| "step": 5750 |
| }, |
| { |
| "epoch": 3.1728665207877462, |
| "grad_norm": 0.0332624725997448, |
| "learning_rate": 0.00018478982300884956, |
| "loss": 0.0143, |
| "step": 5800 |
| }, |
| { |
| "epoch": 3.200218818380744, |
| "grad_norm": 0.10055282711982727, |
| "learning_rate": 0.00018202433628318585, |
| "loss": 0.0143, |
| "step": 5850 |
| }, |
| { |
| "epoch": 3.227571115973742, |
| "grad_norm": 0.11651066690683365, |
| "learning_rate": 0.00017925884955752213, |
| "loss": 0.0155, |
| "step": 5900 |
| }, |
| { |
| "epoch": 3.2549234135667398, |
| "grad_norm": 0.08547132462263107, |
| "learning_rate": 0.0001764933628318584, |
| "loss": 0.0141, |
| "step": 5950 |
| }, |
| { |
| "epoch": 3.2822757111597376, |
| "grad_norm": 0.07074438035488129, |
| "learning_rate": 0.0001737278761061947, |
| "loss": 0.0142, |
| "step": 6000 |
| }, |
| { |
| "epoch": 3.3096280087527354, |
| "grad_norm": 0.07982511818408966, |
| "learning_rate": 0.00017096238938053097, |
| "loss": 0.0168, |
| "step": 6050 |
| }, |
| { |
| "epoch": 3.3369803063457333, |
| "grad_norm": 0.04909352585673332, |
| "learning_rate": 0.00016819690265486726, |
| "loss": 0.014, |
| "step": 6100 |
| }, |
| { |
| "epoch": 3.3643326039387307, |
| "grad_norm": 0.07797563821077347, |
| "learning_rate": 0.00016543141592920355, |
| "loss": 0.0157, |
| "step": 6150 |
| }, |
| { |
| "epoch": 3.3916849015317285, |
| "grad_norm": 0.08203662931919098, |
| "learning_rate": 0.0001626659292035398, |
| "loss": 0.0142, |
| "step": 6200 |
| }, |
| { |
| "epoch": 3.4190371991247264, |
| "grad_norm": 0.08141785860061646, |
| "learning_rate": 0.0001599004424778761, |
| "loss": 0.0144, |
| "step": 6250 |
| }, |
| { |
| "epoch": 3.446389496717724, |
| "grad_norm": 0.05146276205778122, |
| "learning_rate": 0.0001571349557522124, |
| "loss": 0.0142, |
| "step": 6300 |
| }, |
| { |
| "epoch": 3.473741794310722, |
| "grad_norm": 0.04246249422430992, |
| "learning_rate": 0.00015436946902654868, |
| "loss": 0.0144, |
| "step": 6350 |
| }, |
| { |
| "epoch": 3.50109409190372, |
| "grad_norm": 0.07449432462453842, |
| "learning_rate": 0.00015160398230088497, |
| "loss": 0.0166, |
| "step": 6400 |
| }, |
| { |
| "epoch": 3.5284463894967177, |
| "grad_norm": 0.04963746666908264, |
| "learning_rate": 0.00014883849557522125, |
| "loss": 0.014, |
| "step": 6450 |
| }, |
| { |
| "epoch": 3.5557986870897156, |
| "grad_norm": 0.10930000245571136, |
| "learning_rate": 0.00014607300884955752, |
| "loss": 0.0142, |
| "step": 6500 |
| }, |
| { |
| "epoch": 3.5831509846827134, |
| "grad_norm": 0.09715767204761505, |
| "learning_rate": 0.0001433075221238938, |
| "loss": 0.0147, |
| "step": 6550 |
| }, |
| { |
| "epoch": 3.6105032822757113, |
| "grad_norm": 0.1151093989610672, |
| "learning_rate": 0.0001405420353982301, |
| "loss": 0.0126, |
| "step": 6600 |
| }, |
| { |
| "epoch": 3.637855579868709, |
| "grad_norm": 0.06474081426858902, |
| "learning_rate": 0.00013777654867256636, |
| "loss": 0.0132, |
| "step": 6650 |
| }, |
| { |
| "epoch": 3.6652078774617065, |
| "grad_norm": 0.10157348960638046, |
| "learning_rate": 0.00013501106194690267, |
| "loss": 0.0129, |
| "step": 6700 |
| }, |
| { |
| "epoch": 3.6925601750547044, |
| "grad_norm": 0.08445007354021072, |
| "learning_rate": 0.00013224557522123896, |
| "loss": 0.0136, |
| "step": 6750 |
| }, |
| { |
| "epoch": 3.719912472647702, |
| "grad_norm": 0.1165938451886177, |
| "learning_rate": 0.00012948008849557522, |
| "loss": 0.013, |
| "step": 6800 |
| }, |
| { |
| "epoch": 3.7472647702407, |
| "grad_norm": 0.0693984255194664, |
| "learning_rate": 0.0001267146017699115, |
| "loss": 0.0131, |
| "step": 6850 |
| }, |
| { |
| "epoch": 3.774617067833698, |
| "grad_norm": 0.07007980346679688, |
| "learning_rate": 0.0001239491150442478, |
| "loss": 0.0136, |
| "step": 6900 |
| }, |
| { |
| "epoch": 3.8019693654266957, |
| "grad_norm": 0.05304344743490219, |
| "learning_rate": 0.00012118362831858407, |
| "loss": 0.0123, |
| "step": 6950 |
| }, |
| { |
| "epoch": 3.8293216630196936, |
| "grad_norm": 0.07882869988679886, |
| "learning_rate": 0.00011841814159292036, |
| "loss": 0.0138, |
| "step": 7000 |
| }, |
| { |
| "epoch": 3.8566739606126914, |
| "grad_norm": 0.07988675683736801, |
| "learning_rate": 0.00011565265486725664, |
| "loss": 0.0142, |
| "step": 7050 |
| }, |
| { |
| "epoch": 3.8840262582056893, |
| "grad_norm": 0.06684820353984833, |
| "learning_rate": 0.00011288716814159291, |
| "loss": 0.013, |
| "step": 7100 |
| }, |
| { |
| "epoch": 3.911378555798687, |
| "grad_norm": 0.04811659827828407, |
| "learning_rate": 0.00011012168141592921, |
| "loss": 0.0145, |
| "step": 7150 |
| }, |
| { |
| "epoch": 3.938730853391685, |
| "grad_norm": 0.09577899426221848, |
| "learning_rate": 0.00010735619469026549, |
| "loss": 0.0149, |
| "step": 7200 |
| }, |
| { |
| "epoch": 3.966083150984683, |
| "grad_norm": 0.08057638257741928, |
| "learning_rate": 0.00010459070796460176, |
| "loss": 0.0121, |
| "step": 7250 |
| }, |
| { |
| "epoch": 3.9934354485776806, |
| "grad_norm": 0.07438407838344574, |
| "learning_rate": 0.00010182522123893805, |
| "loss": 0.0121, |
| "step": 7300 |
| }, |
| { |
| "epoch": 4.0207877461706785, |
| "grad_norm": 0.039558082818984985, |
| "learning_rate": 9.905973451327434e-05, |
| "loss": 0.0127, |
| "step": 7350 |
| }, |
| { |
| "epoch": 4.048140043763676, |
| "grad_norm": 0.09110087901353836, |
| "learning_rate": 9.629424778761062e-05, |
| "loss": 0.0112, |
| "step": 7400 |
| }, |
| { |
| "epoch": 4.075492341356674, |
| "grad_norm": 0.11649812757968903, |
| "learning_rate": 9.35287610619469e-05, |
| "loss": 0.012, |
| "step": 7450 |
| }, |
| { |
| "epoch": 4.102844638949672, |
| "grad_norm": 0.06791022419929504, |
| "learning_rate": 9.07632743362832e-05, |
| "loss": 0.0123, |
| "step": 7500 |
| }, |
| { |
| "epoch": 4.13019693654267, |
| "grad_norm": 0.08256133645772934, |
| "learning_rate": 8.799778761061947e-05, |
| "loss": 0.0115, |
| "step": 7550 |
| }, |
| { |
| "epoch": 4.157549234135668, |
| "grad_norm": 0.0456516407430172, |
| "learning_rate": 8.523230088495576e-05, |
| "loss": 0.0106, |
| "step": 7600 |
| }, |
| { |
| "epoch": 4.1849015317286655, |
| "grad_norm": 0.0911739319562912, |
| "learning_rate": 8.246681415929203e-05, |
| "loss": 0.0132, |
| "step": 7650 |
| }, |
| { |
| "epoch": 4.212253829321663, |
| "grad_norm": 0.04813405126333237, |
| "learning_rate": 7.970132743362832e-05, |
| "loss": 0.0122, |
| "step": 7700 |
| }, |
| { |
| "epoch": 4.239606126914661, |
| "grad_norm": 0.08700749278068542, |
| "learning_rate": 7.69358407079646e-05, |
| "loss": 0.0115, |
| "step": 7750 |
| }, |
| { |
| "epoch": 4.266958424507659, |
| "grad_norm": 0.08354539424180984, |
| "learning_rate": 7.417035398230089e-05, |
| "loss": 0.0129, |
| "step": 7800 |
| }, |
| { |
| "epoch": 4.294310722100656, |
| "grad_norm": 0.07438132911920547, |
| "learning_rate": 7.140486725663717e-05, |
| "loss": 0.0125, |
| "step": 7850 |
| }, |
| { |
| "epoch": 4.321663019693654, |
| "grad_norm": 0.04467419162392616, |
| "learning_rate": 6.863938053097345e-05, |
| "loss": 0.011, |
| "step": 7900 |
| }, |
| { |
| "epoch": 4.349015317286652, |
| "grad_norm": 0.01871863380074501, |
| "learning_rate": 6.587389380530974e-05, |
| "loss": 0.0112, |
| "step": 7950 |
| }, |
| { |
| "epoch": 4.3763676148796495, |
| "grad_norm": 0.07461551576852798, |
| "learning_rate": 6.310840707964601e-05, |
| "loss": 0.013, |
| "step": 8000 |
| }, |
| { |
| "epoch": 4.403719912472647, |
| "grad_norm": 0.04248015210032463, |
| "learning_rate": 6.03429203539823e-05, |
| "loss": 0.013, |
| "step": 8050 |
| }, |
| { |
| "epoch": 4.431072210065645, |
| "grad_norm": 0.09184252470731735, |
| "learning_rate": 5.7577433628318583e-05, |
| "loss": 0.0114, |
| "step": 8100 |
| }, |
| { |
| "epoch": 4.458424507658643, |
| "grad_norm": 0.0578514039516449, |
| "learning_rate": 5.481194690265487e-05, |
| "loss": 0.0125, |
| "step": 8150 |
| }, |
| { |
| "epoch": 4.485776805251641, |
| "grad_norm": 0.10820753872394562, |
| "learning_rate": 5.2046460176991154e-05, |
| "loss": 0.0135, |
| "step": 8200 |
| }, |
| { |
| "epoch": 4.513129102844639, |
| "grad_norm": 0.10485094785690308, |
| "learning_rate": 4.9280973451327436e-05, |
| "loss": 0.0106, |
| "step": 8250 |
| }, |
| { |
| "epoch": 4.540481400437637, |
| "grad_norm": 0.08615507930517197, |
| "learning_rate": 4.651548672566372e-05, |
| "loss": 0.0124, |
| "step": 8300 |
| }, |
| { |
| "epoch": 4.567833698030634, |
| "grad_norm": 0.08930730074644089, |
| "learning_rate": 4.375e-05, |
| "loss": 0.0121, |
| "step": 8350 |
| }, |
| { |
| "epoch": 4.595185995623632, |
| "grad_norm": 0.05729290097951889, |
| "learning_rate": 4.098451327433628e-05, |
| "loss": 0.012, |
| "step": 8400 |
| }, |
| { |
| "epoch": 4.62253829321663, |
| "grad_norm": 0.06665951758623123, |
| "learning_rate": 3.821902654867257e-05, |
| "loss": 0.0121, |
| "step": 8450 |
| }, |
| { |
| "epoch": 4.649890590809628, |
| "grad_norm": 0.0894913300871849, |
| "learning_rate": 3.5453539823008845e-05, |
| "loss": 0.0132, |
| "step": 8500 |
| }, |
| { |
| "epoch": 4.677242888402626, |
| "grad_norm": 0.05070062354207039, |
| "learning_rate": 3.2688053097345134e-05, |
| "loss": 0.0127, |
| "step": 8550 |
| }, |
| { |
| "epoch": 4.704595185995624, |
| "grad_norm": 0.0852472186088562, |
| "learning_rate": 2.9922566371681416e-05, |
| "loss": 0.013, |
| "step": 8600 |
| }, |
| { |
| "epoch": 4.7319474835886215, |
| "grad_norm": 0.04788070544600487, |
| "learning_rate": 2.7157079646017698e-05, |
| "loss": 0.0116, |
| "step": 8650 |
| }, |
| { |
| "epoch": 4.759299781181619, |
| "grad_norm": 0.05870731547474861, |
| "learning_rate": 2.4391592920353983e-05, |
| "loss": 0.0119, |
| "step": 8700 |
| }, |
| { |
| "epoch": 4.786652078774617, |
| "grad_norm": 0.055160026997327805, |
| "learning_rate": 2.1626106194690268e-05, |
| "loss": 0.013, |
| "step": 8750 |
| }, |
| { |
| "epoch": 4.814004376367615, |
| "grad_norm": 0.07681864500045776, |
| "learning_rate": 1.886061946902655e-05, |
| "loss": 0.0115, |
| "step": 8800 |
| }, |
| { |
| "epoch": 4.841356673960613, |
| "grad_norm": 0.04158329963684082, |
| "learning_rate": 1.6095132743362832e-05, |
| "loss": 0.0109, |
| "step": 8850 |
| }, |
| { |
| "epoch": 4.868708971553611, |
| "grad_norm": 0.07591590285301208, |
| "learning_rate": 1.3329646017699115e-05, |
| "loss": 0.0115, |
| "step": 8900 |
| }, |
| { |
| "epoch": 4.8960612691466086, |
| "grad_norm": 0.04335255175828934, |
| "learning_rate": 1.0564159292035397e-05, |
| "loss": 0.0126, |
| "step": 8950 |
| }, |
| { |
| "epoch": 4.923413566739606, |
| "grad_norm": 0.02843708172440529, |
| "learning_rate": 7.798672566371682e-06, |
| "loss": 0.0109, |
| "step": 9000 |
| }, |
| { |
| "epoch": 4.950765864332604, |
| "grad_norm": 0.051726795732975006, |
| "learning_rate": 5.033185840707965e-06, |
| "loss": 0.0112, |
| "step": 9050 |
| }, |
| { |
| "epoch": 4.978118161925602, |
| "grad_norm": 0.03216787800192833, |
| "learning_rate": 2.267699115044248e-06, |
| "loss": 0.0122, |
| "step": 9100 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 9140, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4947927084564480.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|