| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 17121, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.008761170492377781, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.985690088195783e-05, |
| "loss": 0.5628, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.017522340984755563, |
| "grad_norm": 1.453125, |
| "learning_rate": 4.971088137375153e-05, |
| "loss": 0.3821, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.026283511477133346, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.9564861865545236e-05, |
| "loss": 0.3687, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.035044681969511125, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.941884235733894e-05, |
| "loss": 0.3548, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04380585246188891, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.927282284913264e-05, |
| "loss": 0.3448, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.05256702295426669, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.912680334092635e-05, |
| "loss": 0.3381, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.061328193446644474, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.898078383272006e-05, |
| "loss": 0.3325, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.07008936393902225, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.883476432451376e-05, |
| "loss": 0.324, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.07885053443140004, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.8688744816307465e-05, |
| "loss": 0.3223, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.08761170492377782, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.854272530810117e-05, |
| "loss": 0.3197, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.09637287541615559, |
| "grad_norm": 0.94140625, |
| "learning_rate": 4.839670579989487e-05, |
| "loss": 0.3086, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.10513404590853338, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.825068629168857e-05, |
| "loss": 0.3122, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.11389521640091116, |
| "grad_norm": 0.90234375, |
| "learning_rate": 4.8104666783482275e-05, |
| "loss": 0.3035, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.12265638689328895, |
| "grad_norm": 0.91796875, |
| "learning_rate": 4.795864727527598e-05, |
| "loss": 0.3073, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.13141755738566674, |
| "grad_norm": 0.97265625, |
| "learning_rate": 4.781262776706969e-05, |
| "loss": 0.3051, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.1401787278780445, |
| "grad_norm": 1.0, |
| "learning_rate": 4.766660825886339e-05, |
| "loss": 0.2992, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1489398983704223, |
| "grad_norm": 0.87890625, |
| "learning_rate": 4.752058875065709e-05, |
| "loss": 0.3007, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.15770106886280008, |
| "grad_norm": 0.8828125, |
| "learning_rate": 4.7374569242450795e-05, |
| "loss": 0.2952, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.16646223935517784, |
| "grad_norm": 0.875, |
| "learning_rate": 4.72285497342445e-05, |
| "loss": 0.2981, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.17522340984755563, |
| "grad_norm": 0.828125, |
| "learning_rate": 4.70825302260382e-05, |
| "loss": 0.2911, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.18398458033993342, |
| "grad_norm": 0.83203125, |
| "learning_rate": 4.69365107178319e-05, |
| "loss": 0.2848, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.19274575083231119, |
| "grad_norm": 0.80078125, |
| "learning_rate": 4.679049120962561e-05, |
| "loss": 0.2932, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.20150692132468898, |
| "grad_norm": 0.828125, |
| "learning_rate": 4.6644471701419314e-05, |
| "loss": 0.2908, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.21026809181706677, |
| "grad_norm": 0.9140625, |
| "learning_rate": 4.649845219321302e-05, |
| "loss": 0.2836, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.21902926230944456, |
| "grad_norm": 0.80859375, |
| "learning_rate": 4.635243268500672e-05, |
| "loss": 0.2907, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.22779043280182232, |
| "grad_norm": 0.87109375, |
| "learning_rate": 4.620641317680042e-05, |
| "loss": 0.2859, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.2365516032942001, |
| "grad_norm": 0.85546875, |
| "learning_rate": 4.6060393668594125e-05, |
| "loss": 0.2826, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.2453127737865779, |
| "grad_norm": 0.859375, |
| "learning_rate": 4.591437416038783e-05, |
| "loss": 0.2862, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.2540739442789557, |
| "grad_norm": 0.8203125, |
| "learning_rate": 4.576835465218153e-05, |
| "loss": 0.2865, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.2628351147713335, |
| "grad_norm": 0.8046875, |
| "learning_rate": 4.562233514397524e-05, |
| "loss": 0.2792, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.2715962852637112, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.547631563576894e-05, |
| "loss": 0.281, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.280357455756089, |
| "grad_norm": 0.91015625, |
| "learning_rate": 4.5330296127562645e-05, |
| "loss": 0.28, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.2891186262484668, |
| "grad_norm": 0.89453125, |
| "learning_rate": 4.518427661935635e-05, |
| "loss": 0.2789, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.2978797967408446, |
| "grad_norm": 0.8984375, |
| "learning_rate": 4.503825711115005e-05, |
| "loss": 0.2792, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.3066409672332224, |
| "grad_norm": 0.765625, |
| "learning_rate": 4.489223760294375e-05, |
| "loss": 0.2809, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.31540213772560016, |
| "grad_norm": 0.859375, |
| "learning_rate": 4.4746218094737455e-05, |
| "loss": 0.2729, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.3241633082179779, |
| "grad_norm": 0.82421875, |
| "learning_rate": 4.4600198586531164e-05, |
| "loss": 0.2763, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.3329244787103557, |
| "grad_norm": 0.83984375, |
| "learning_rate": 4.445417907832487e-05, |
| "loss": 0.2789, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.3416856492027335, |
| "grad_norm": 0.8671875, |
| "learning_rate": 4.430815957011857e-05, |
| "loss": 0.2775, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.35044681969511127, |
| "grad_norm": 0.87890625, |
| "learning_rate": 4.416214006191227e-05, |
| "loss": 0.2727, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.35920799018748906, |
| "grad_norm": 0.87109375, |
| "learning_rate": 4.4016120553705975e-05, |
| "loss": 0.2748, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.36796916067986685, |
| "grad_norm": 0.81640625, |
| "learning_rate": 4.387010104549968e-05, |
| "loss": 0.2747, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.37673033117224464, |
| "grad_norm": 0.875, |
| "learning_rate": 4.372408153729338e-05, |
| "loss": 0.2711, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.38549150166462237, |
| "grad_norm": 0.72265625, |
| "learning_rate": 4.357806202908708e-05, |
| "loss": 0.2694, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.39425267215700016, |
| "grad_norm": 0.80078125, |
| "learning_rate": 4.343204252088079e-05, |
| "loss": 0.27, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.40301384264937795, |
| "grad_norm": 0.76171875, |
| "learning_rate": 4.32860230126745e-05, |
| "loss": 0.2662, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.41177501314175574, |
| "grad_norm": 0.81640625, |
| "learning_rate": 4.3140003504468204e-05, |
| "loss": 0.2764, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.42053618363413353, |
| "grad_norm": 0.84765625, |
| "learning_rate": 4.2993983996261906e-05, |
| "loss": 0.2709, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.4292973541265113, |
| "grad_norm": 0.921875, |
| "learning_rate": 4.284796448805561e-05, |
| "loss": 0.2694, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.4380585246188891, |
| "grad_norm": 0.77734375, |
| "learning_rate": 4.270194497984931e-05, |
| "loss": 0.2673, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.44681969511126685, |
| "grad_norm": 0.78515625, |
| "learning_rate": 4.2555925471643014e-05, |
| "loss": 0.2669, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.45558086560364464, |
| "grad_norm": 0.6796875, |
| "learning_rate": 4.240990596343672e-05, |
| "loss": 0.2686, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.4643420360960224, |
| "grad_norm": 0.765625, |
| "learning_rate": 4.226388645523042e-05, |
| "loss": 0.2663, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.4731032065884002, |
| "grad_norm": 0.7734375, |
| "learning_rate": 4.211786694702413e-05, |
| "loss": 0.2638, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.481864377080778, |
| "grad_norm": 0.765625, |
| "learning_rate": 4.197184743881783e-05, |
| "loss": 0.2644, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.4906255475731558, |
| "grad_norm": 0.921875, |
| "learning_rate": 4.1825827930611534e-05, |
| "loss": 0.2643, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.49938671806553353, |
| "grad_norm": 0.7421875, |
| "learning_rate": 4.1679808422405236e-05, |
| "loss": 0.2675, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.5081478885579114, |
| "grad_norm": 0.9140625, |
| "learning_rate": 4.153378891419894e-05, |
| "loss": 0.269, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.5169090590502892, |
| "grad_norm": 0.84765625, |
| "learning_rate": 4.138776940599264e-05, |
| "loss": 0.2607, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.525670229542667, |
| "grad_norm": 0.83203125, |
| "learning_rate": 4.1241749897786344e-05, |
| "loss": 0.2598, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5344314000350446, |
| "grad_norm": 0.75390625, |
| "learning_rate": 4.1095730389580054e-05, |
| "loss": 0.2649, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.5431925705274224, |
| "grad_norm": 0.71484375, |
| "learning_rate": 4.0949710881373756e-05, |
| "loss": 0.2636, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.5519537410198002, |
| "grad_norm": 0.73046875, |
| "learning_rate": 4.080369137316746e-05, |
| "loss": 0.2603, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.560714911512178, |
| "grad_norm": 0.8203125, |
| "learning_rate": 4.065767186496116e-05, |
| "loss": 0.2566, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.5694760820045558, |
| "grad_norm": 0.79296875, |
| "learning_rate": 4.0511652356754864e-05, |
| "loss": 0.2577, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.5782372524969336, |
| "grad_norm": 0.828125, |
| "learning_rate": 4.0365632848548566e-05, |
| "loss": 0.2596, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.5869984229893114, |
| "grad_norm": 0.8125, |
| "learning_rate": 4.021961334034227e-05, |
| "loss": 0.2622, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.5957595934816892, |
| "grad_norm": 0.7734375, |
| "learning_rate": 4.007359383213597e-05, |
| "loss": 0.2584, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.604520763974067, |
| "grad_norm": 0.79296875, |
| "learning_rate": 3.992757432392968e-05, |
| "loss": 0.2572, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.6132819344664447, |
| "grad_norm": 0.84765625, |
| "learning_rate": 3.9781554815723384e-05, |
| "loss": 0.26, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.6220431049588225, |
| "grad_norm": 0.80859375, |
| "learning_rate": 3.9635535307517086e-05, |
| "loss": 0.26, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.6308042754512003, |
| "grad_norm": 0.82421875, |
| "learning_rate": 3.948951579931079e-05, |
| "loss": 0.2586, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.6395654459435781, |
| "grad_norm": 0.8984375, |
| "learning_rate": 3.934349629110449e-05, |
| "loss": 0.2652, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.6483266164359558, |
| "grad_norm": 0.77734375, |
| "learning_rate": 3.9197476782898194e-05, |
| "loss": 0.2576, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.6570877869283336, |
| "grad_norm": 0.7890625, |
| "learning_rate": 3.9051457274691897e-05, |
| "loss": 0.2595, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.6658489574207114, |
| "grad_norm": 0.7890625, |
| "learning_rate": 3.8905437766485606e-05, |
| "loss": 0.256, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.6746101279130892, |
| "grad_norm": 0.84375, |
| "learning_rate": 3.875941825827931e-05, |
| "loss": 0.2571, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.683371298405467, |
| "grad_norm": 0.8671875, |
| "learning_rate": 3.861339875007301e-05, |
| "loss": 0.2546, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.6921324688978447, |
| "grad_norm": 0.875, |
| "learning_rate": 3.8467379241866714e-05, |
| "loss": 0.2578, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.7008936393902225, |
| "grad_norm": 0.81640625, |
| "learning_rate": 3.8321359733660416e-05, |
| "loss": 0.2551, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.7096548098826003, |
| "grad_norm": 0.80859375, |
| "learning_rate": 3.817534022545412e-05, |
| "loss": 0.2586, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.7184159803749781, |
| "grad_norm": 0.8046875, |
| "learning_rate": 3.802932071724782e-05, |
| "loss": 0.2568, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.7271771508673559, |
| "grad_norm": 0.8515625, |
| "learning_rate": 3.7883301209041524e-05, |
| "loss": 0.2632, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.7359383213597337, |
| "grad_norm": 0.75390625, |
| "learning_rate": 3.7737281700835233e-05, |
| "loss": 0.2518, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.7446994918521115, |
| "grad_norm": 0.8203125, |
| "learning_rate": 3.7591262192628936e-05, |
| "loss": 0.2539, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.7534606623444893, |
| "grad_norm": 0.765625, |
| "learning_rate": 3.7445242684422645e-05, |
| "loss": 0.2507, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.762221832836867, |
| "grad_norm": 0.83984375, |
| "learning_rate": 3.729922317621635e-05, |
| "loss": 0.2517, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.7709830033292447, |
| "grad_norm": 0.74609375, |
| "learning_rate": 3.715320366801005e-05, |
| "loss": 0.2523, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.7797441738216225, |
| "grad_norm": 0.70703125, |
| "learning_rate": 3.700718415980375e-05, |
| "loss": 0.2492, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.7885053443140003, |
| "grad_norm": 0.8359375, |
| "learning_rate": 3.6861164651597456e-05, |
| "loss": 0.2571, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.7972665148063781, |
| "grad_norm": 0.7890625, |
| "learning_rate": 3.671514514339116e-05, |
| "loss": 0.2534, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.8060276852987559, |
| "grad_norm": 0.79296875, |
| "learning_rate": 3.656912563518487e-05, |
| "loss": 0.2573, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.8147888557911337, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.642310612697857e-05, |
| "loss": 0.2552, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.8235500262835115, |
| "grad_norm": 0.796875, |
| "learning_rate": 3.627708661877227e-05, |
| "loss": 0.252, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.8323111967758893, |
| "grad_norm": 0.75, |
| "learning_rate": 3.6131067110565975e-05, |
| "loss": 0.2545, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.8410723672682671, |
| "grad_norm": 0.77734375, |
| "learning_rate": 3.598504760235968e-05, |
| "loss": 0.2536, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.8498335377606449, |
| "grad_norm": 0.83203125, |
| "learning_rate": 3.583902809415338e-05, |
| "loss": 0.2486, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.8585947082530226, |
| "grad_norm": 0.79296875, |
| "learning_rate": 3.569300858594708e-05, |
| "loss": 0.2491, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.8673558787454004, |
| "grad_norm": 0.796875, |
| "learning_rate": 3.5546989077740786e-05, |
| "loss": 0.2493, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.8761170492377782, |
| "grad_norm": 0.8359375, |
| "learning_rate": 3.5400969569534495e-05, |
| "loss": 0.2496, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.8848782197301559, |
| "grad_norm": 0.73828125, |
| "learning_rate": 3.52549500613282e-05, |
| "loss": 0.2488, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.8936393902225337, |
| "grad_norm": 0.8046875, |
| "learning_rate": 3.51089305531219e-05, |
| "loss": 0.2497, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.9024005607149115, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.49629110449156e-05, |
| "loss": 0.253, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.9111617312072893, |
| "grad_norm": 0.7421875, |
| "learning_rate": 3.4816891536709306e-05, |
| "loss": 0.2515, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.9199229016996671, |
| "grad_norm": 0.7890625, |
| "learning_rate": 3.467087202850301e-05, |
| "loss": 0.2479, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.9286840721920449, |
| "grad_norm": 0.796875, |
| "learning_rate": 3.452485252029671e-05, |
| "loss": 0.2486, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.9374452426844226, |
| "grad_norm": 0.82421875, |
| "learning_rate": 3.437883301209042e-05, |
| "loss": 0.2518, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.9462064131768004, |
| "grad_norm": 0.85546875, |
| "learning_rate": 3.423281350388412e-05, |
| "loss": 0.2499, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.9549675836691782, |
| "grad_norm": 0.76171875, |
| "learning_rate": 3.4086793995677825e-05, |
| "loss": 0.2508, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.963728754161556, |
| "grad_norm": 0.83203125, |
| "learning_rate": 3.394077448747153e-05, |
| "loss": 0.2482, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.9724899246539338, |
| "grad_norm": 0.78515625, |
| "learning_rate": 3.379475497926523e-05, |
| "loss": 0.2491, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.9812510951463116, |
| "grad_norm": 0.84765625, |
| "learning_rate": 3.364873547105893e-05, |
| "loss": 0.2471, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.9900122656386894, |
| "grad_norm": 0.703125, |
| "learning_rate": 3.3502715962852636e-05, |
| "loss": 0.2539, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.9987734361310671, |
| "grad_norm": 0.77734375, |
| "learning_rate": 3.335669645464634e-05, |
| "loss": 0.2488, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.007534606623445, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.321067694644005e-05, |
| "loss": 0.2335, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.0162957771158228, |
| "grad_norm": 0.8046875, |
| "learning_rate": 3.306465743823375e-05, |
| "loss": 0.2302, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.0250569476082005, |
| "grad_norm": 0.84765625, |
| "learning_rate": 3.291863793002745e-05, |
| "loss": 0.2287, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.0338181181005783, |
| "grad_norm": 0.84765625, |
| "learning_rate": 3.2772618421821155e-05, |
| "loss": 0.2278, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.0425792885929561, |
| "grad_norm": 0.87109375, |
| "learning_rate": 3.262659891361486e-05, |
| "loss": 0.2322, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.051340459085334, |
| "grad_norm": 0.87109375, |
| "learning_rate": 3.248057940540856e-05, |
| "loss": 0.2305, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.0601016295777115, |
| "grad_norm": 0.95703125, |
| "learning_rate": 3.233455989720226e-05, |
| "loss": 0.2316, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.0688628000700893, |
| "grad_norm": 0.75, |
| "learning_rate": 3.218854038899597e-05, |
| "loss": 0.2366, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.077623970562467, |
| "grad_norm": 1.015625, |
| "learning_rate": 3.2042520880789675e-05, |
| "loss": 0.2331, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.0863851410548448, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.189650137258338e-05, |
| "loss": 0.2283, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.0951463115472226, |
| "grad_norm": 0.8828125, |
| "learning_rate": 3.175048186437708e-05, |
| "loss": 0.2301, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.1039074820396004, |
| "grad_norm": 0.87109375, |
| "learning_rate": 3.160446235617079e-05, |
| "loss": 0.2311, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.1126686525319782, |
| "grad_norm": 0.83203125, |
| "learning_rate": 3.145844284796449e-05, |
| "loss": 0.2278, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.121429823024356, |
| "grad_norm": 0.8359375, |
| "learning_rate": 3.1312423339758195e-05, |
| "loss": 0.231, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.1301909935167338, |
| "grad_norm": 0.97265625, |
| "learning_rate": 3.11664038315519e-05, |
| "loss": 0.2302, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.1389521640091116, |
| "grad_norm": 0.94921875, |
| "learning_rate": 3.10203843233456e-05, |
| "loss": 0.2311, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.1477133345014894, |
| "grad_norm": 0.85546875, |
| "learning_rate": 3.087436481513931e-05, |
| "loss": 0.234, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.1564745049938672, |
| "grad_norm": 0.8984375, |
| "learning_rate": 3.072834530693301e-05, |
| "loss": 0.231, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.165235675486245, |
| "grad_norm": 0.86328125, |
| "learning_rate": 3.0582325798726714e-05, |
| "loss": 0.2294, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.1739968459786227, |
| "grad_norm": 0.796875, |
| "learning_rate": 3.0436306290520417e-05, |
| "loss": 0.2316, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.1827580164710005, |
| "grad_norm": 0.8984375, |
| "learning_rate": 3.029028678231412e-05, |
| "loss": 0.2335, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.1915191869633783, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.0144267274107822e-05, |
| "loss": 0.2282, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.2002803574557561, |
| "grad_norm": 0.85546875, |
| "learning_rate": 2.9998247765901528e-05, |
| "loss": 0.2327, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.209041527948134, |
| "grad_norm": 0.8671875, |
| "learning_rate": 2.985222825769523e-05, |
| "loss": 0.2295, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.2178026984405117, |
| "grad_norm": 0.85546875, |
| "learning_rate": 2.9706208749488933e-05, |
| "loss": 0.2306, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.2265638689328895, |
| "grad_norm": 0.875, |
| "learning_rate": 2.9560189241282636e-05, |
| "loss": 0.234, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.2353250394252673, |
| "grad_norm": 0.85546875, |
| "learning_rate": 2.9414169733076342e-05, |
| "loss": 0.2302, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.244086209917645, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.9268150224870045e-05, |
| "loss": 0.2273, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.2528473804100229, |
| "grad_norm": 0.83203125, |
| "learning_rate": 2.9122130716663747e-05, |
| "loss": 0.2255, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.2616085509024004, |
| "grad_norm": 0.8828125, |
| "learning_rate": 2.8976111208457453e-05, |
| "loss": 0.2305, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.2703697213947782, |
| "grad_norm": 0.8125, |
| "learning_rate": 2.8830091700251156e-05, |
| "loss": 0.2293, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.279130891887156, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.868407219204486e-05, |
| "loss": 0.2258, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.2878920623795338, |
| "grad_norm": 0.859375, |
| "learning_rate": 2.853805268383856e-05, |
| "loss": 0.227, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.2966532328719116, |
| "grad_norm": 0.91015625, |
| "learning_rate": 2.8392033175632267e-05, |
| "loss": 0.2292, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.3054144033642894, |
| "grad_norm": 0.8359375, |
| "learning_rate": 2.824601366742597e-05, |
| "loss": 0.228, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.3141755738566672, |
| "grad_norm": 0.859375, |
| "learning_rate": 2.8099994159219672e-05, |
| "loss": 0.2299, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.322936744349045, |
| "grad_norm": 0.9453125, |
| "learning_rate": 2.7953974651013375e-05, |
| "loss": 0.225, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.3316979148414227, |
| "grad_norm": 0.82421875, |
| "learning_rate": 2.780795514280708e-05, |
| "loss": 0.2274, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.3404590853338005, |
| "grad_norm": 0.921875, |
| "learning_rate": 2.7661935634600783e-05, |
| "loss": 0.2295, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.3492202558261783, |
| "grad_norm": 0.8359375, |
| "learning_rate": 2.7515916126394486e-05, |
| "loss": 0.2259, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.3579814263185561, |
| "grad_norm": 0.90234375, |
| "learning_rate": 2.736989661818819e-05, |
| "loss": 0.2321, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.366742596810934, |
| "grad_norm": 0.94140625, |
| "learning_rate": 2.7223877109981894e-05, |
| "loss": 0.2279, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.3755037673033117, |
| "grad_norm": 0.8984375, |
| "learning_rate": 2.7077857601775597e-05, |
| "loss": 0.2291, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.3842649377956895, |
| "grad_norm": 0.96484375, |
| "learning_rate": 2.69318380935693e-05, |
| "loss": 0.2273, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.3930261082880673, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.6785818585363006e-05, |
| "loss": 0.2296, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.401787278780445, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.6639799077156708e-05, |
| "loss": 0.2262, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.4105484492728229, |
| "grad_norm": 0.875, |
| "learning_rate": 2.649377956895041e-05, |
| "loss": 0.2258, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.4193096197652006, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.6347760060744113e-05, |
| "loss": 0.2237, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.4280707902575784, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.620174055253782e-05, |
| "loss": 0.2306, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.4368319607499562, |
| "grad_norm": 0.95703125, |
| "learning_rate": 2.6055721044331522e-05, |
| "loss": 0.2282, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.445593131242334, |
| "grad_norm": 0.82421875, |
| "learning_rate": 2.5909701536125224e-05, |
| "loss": 0.2245, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.4543543017347118, |
| "grad_norm": 0.890625, |
| "learning_rate": 2.5763682027918934e-05, |
| "loss": 0.2267, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.4631154722270896, |
| "grad_norm": 0.88671875, |
| "learning_rate": 2.5617662519712636e-05, |
| "loss": 0.2248, |
| "step": 8350 |
| }, |
| { |
| "epoch": 1.4718766427194674, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.5471643011506342e-05, |
| "loss": 0.2299, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.4806378132118452, |
| "grad_norm": 0.90234375, |
| "learning_rate": 2.5325623503300045e-05, |
| "loss": 0.2222, |
| "step": 8450 |
| }, |
| { |
| "epoch": 1.489398983704223, |
| "grad_norm": 0.94921875, |
| "learning_rate": 2.5179603995093748e-05, |
| "loss": 0.228, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.4981601541966008, |
| "grad_norm": 0.83984375, |
| "learning_rate": 2.503358448688745e-05, |
| "loss": 0.2302, |
| "step": 8550 |
| }, |
| { |
| "epoch": 1.5069213246889785, |
| "grad_norm": 0.94140625, |
| "learning_rate": 2.4887564978681153e-05, |
| "loss": 0.2281, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.5156824951813563, |
| "grad_norm": 0.88671875, |
| "learning_rate": 2.4741545470474855e-05, |
| "loss": 0.225, |
| "step": 8650 |
| }, |
| { |
| "epoch": 1.5244436656737341, |
| "grad_norm": 0.921875, |
| "learning_rate": 2.4595525962268558e-05, |
| "loss": 0.2277, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.533204836166112, |
| "grad_norm": 0.796875, |
| "learning_rate": 2.4449506454062264e-05, |
| "loss": 0.2216, |
| "step": 8750 |
| }, |
| { |
| "epoch": 1.5419660066584897, |
| "grad_norm": 0.86328125, |
| "learning_rate": 2.430348694585597e-05, |
| "loss": 0.2238, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.5507271771508675, |
| "grad_norm": 0.91015625, |
| "learning_rate": 2.4157467437649672e-05, |
| "loss": 0.229, |
| "step": 8850 |
| }, |
| { |
| "epoch": 1.5594883476432453, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.4011447929443375e-05, |
| "loss": 0.2273, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.568249518135623, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.386542842123708e-05, |
| "loss": 0.2261, |
| "step": 8950 |
| }, |
| { |
| "epoch": 1.5770106886280009, |
| "grad_norm": 0.921875, |
| "learning_rate": 2.3719408913030784e-05, |
| "loss": 0.2244, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.5857718591203784, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.3573389404824486e-05, |
| "loss": 0.2243, |
| "step": 9050 |
| }, |
| { |
| "epoch": 1.5945330296127562, |
| "grad_norm": 0.93359375, |
| "learning_rate": 2.342736989661819e-05, |
| "loss": 0.2277, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.603294200105134, |
| "grad_norm": 0.83984375, |
| "learning_rate": 2.3281350388411895e-05, |
| "loss": 0.2238, |
| "step": 9150 |
| }, |
| { |
| "epoch": 1.6120553705975118, |
| "grad_norm": 0.80078125, |
| "learning_rate": 2.3135330880205597e-05, |
| "loss": 0.2251, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.6208165410898896, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.29893113719993e-05, |
| "loss": 0.2257, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.6295777115822674, |
| "grad_norm": 0.890625, |
| "learning_rate": 2.2843291863793003e-05, |
| "loss": 0.2256, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.6383388820746452, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.269727235558671e-05, |
| "loss": 0.2239, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.647100052567023, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.255125284738041e-05, |
| "loss": 0.228, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.6558612230594008, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.2405233339174114e-05, |
| "loss": 0.2274, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.6646223935517785, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.2259213830967816e-05, |
| "loss": 0.2214, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.6733835640441563, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.2113194322761522e-05, |
| "loss": 0.2236, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.6821447345365341, |
| "grad_norm": 0.90625, |
| "learning_rate": 2.1967174814555225e-05, |
| "loss": 0.2222, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.690905905028912, |
| "grad_norm": 0.85546875, |
| "learning_rate": 2.1821155306348927e-05, |
| "loss": 0.2227, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.6996670755212895, |
| "grad_norm": 0.98828125, |
| "learning_rate": 2.1675135798142633e-05, |
| "loss": 0.2277, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.7084282460136673, |
| "grad_norm": 0.9296875, |
| "learning_rate": 2.1529116289936336e-05, |
| "loss": 0.2257, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.717189416506045, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.1383096781730042e-05, |
| "loss": 0.221, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.7259505869984229, |
| "grad_norm": 0.83203125, |
| "learning_rate": 2.1237077273523745e-05, |
| "loss": 0.2231, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.7347117574908006, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.1091057765317447e-05, |
| "loss": 0.22, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.7434729279831784, |
| "grad_norm": 0.875, |
| "learning_rate": 2.0945038257111153e-05, |
| "loss": 0.2241, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.7522340984755562, |
| "grad_norm": 0.82421875, |
| "learning_rate": 2.0799018748904856e-05, |
| "loss": 0.2215, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.760995268967934, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.065299924069856e-05, |
| "loss": 0.2267, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.7697564394603118, |
| "grad_norm": 0.9453125, |
| "learning_rate": 2.050697973249226e-05, |
| "loss": 0.222, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.7785176099526896, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.0360960224285967e-05, |
| "loss": 0.2239, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.7872787804450674, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.021494071607967e-05, |
| "loss": 0.22, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.7960399509374452, |
| "grad_norm": 0.94140625, |
| "learning_rate": 2.0068921207873372e-05, |
| "loss": 0.2293, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.804801121429823, |
| "grad_norm": 0.96484375, |
| "learning_rate": 1.9922901699667078e-05, |
| "loss": 0.2271, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.8135622919222008, |
| "grad_norm": 0.875, |
| "learning_rate": 1.977688219146078e-05, |
| "loss": 0.2252, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.8223234624145785, |
| "grad_norm": 0.91796875, |
| "learning_rate": 1.9630862683254483e-05, |
| "loss": 0.2265, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.8310846329069563, |
| "grad_norm": 0.9453125, |
| "learning_rate": 1.9484843175048186e-05, |
| "loss": 0.2239, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.8398458033993341, |
| "grad_norm": 0.94921875, |
| "learning_rate": 1.9338823666841892e-05, |
| "loss": 0.222, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.848606973891712, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.9192804158635594e-05, |
| "loss": 0.2231, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.8573681443840897, |
| "grad_norm": 0.98046875, |
| "learning_rate": 1.9046784650429297e-05, |
| "loss": 0.2266, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.8661293148764675, |
| "grad_norm": 0.91015625, |
| "learning_rate": 1.8900765142223e-05, |
| "loss": 0.2245, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.8748904853688453, |
| "grad_norm": 0.98828125, |
| "learning_rate": 1.8754745634016706e-05, |
| "loss": 0.2243, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.883651655861223, |
| "grad_norm": 0.80859375, |
| "learning_rate": 1.8608726125810408e-05, |
| "loss": 0.2194, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.8924128263536009, |
| "grad_norm": 0.87109375, |
| "learning_rate": 1.8462706617604114e-05, |
| "loss": 0.2253, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.9011739968459787, |
| "grad_norm": 0.875, |
| "learning_rate": 1.8316687109397817e-05, |
| "loss": 0.2231, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.9099351673383564, |
| "grad_norm": 0.91015625, |
| "learning_rate": 1.8170667601191523e-05, |
| "loss": 0.2215, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.9186963378307342, |
| "grad_norm": 0.99609375, |
| "learning_rate": 1.8024648092985225e-05, |
| "loss": 0.2246, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.927457508323112, |
| "grad_norm": 0.890625, |
| "learning_rate": 1.7878628584778928e-05, |
| "loss": 0.2215, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.9362186788154898, |
| "grad_norm": 0.953125, |
| "learning_rate": 1.773260907657263e-05, |
| "loss": 0.2251, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.9449798493078676, |
| "grad_norm": 0.91015625, |
| "learning_rate": 1.7586589568366336e-05, |
| "loss": 0.219, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.9537410198002454, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.744057006016004e-05, |
| "loss": 0.2194, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.9625021902926232, |
| "grad_norm": 0.84765625, |
| "learning_rate": 1.729455055195374e-05, |
| "loss": 0.219, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.971263360785001, |
| "grad_norm": 0.94140625, |
| "learning_rate": 1.7148531043747444e-05, |
| "loss": 0.2183, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.9800245312773788, |
| "grad_norm": 0.8671875, |
| "learning_rate": 1.700251153554115e-05, |
| "loss": 0.221, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.9887857017697566, |
| "grad_norm": 0.921875, |
| "learning_rate": 1.6856492027334853e-05, |
| "loss": 0.2218, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.9975468722621343, |
| "grad_norm": 0.93359375, |
| "learning_rate": 1.6710472519128555e-05, |
| "loss": 0.221, |
| "step": 11400 |
| }, |
| { |
| "epoch": 2.006308042754512, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.6564453010922258e-05, |
| "loss": 0.2109, |
| "step": 11450 |
| }, |
| { |
| "epoch": 2.01506921324689, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.6418433502715964e-05, |
| "loss": 0.2091, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.0238303837392677, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.6272413994509666e-05, |
| "loss": 0.2039, |
| "step": 11550 |
| }, |
| { |
| "epoch": 2.0325915542316455, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.612639448630337e-05, |
| "loss": 0.2078, |
| "step": 11600 |
| }, |
| { |
| "epoch": 2.0413527247240233, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.5980374978097075e-05, |
| "loss": 0.2071, |
| "step": 11650 |
| }, |
| { |
| "epoch": 2.050113895216401, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.5834355469890778e-05, |
| "loss": 0.2056, |
| "step": 11700 |
| }, |
| { |
| "epoch": 2.058875065708779, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.568833596168448e-05, |
| "loss": 0.2097, |
| "step": 11750 |
| }, |
| { |
| "epoch": 2.0676362362011567, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.5542316453478186e-05, |
| "loss": 0.2083, |
| "step": 11800 |
| }, |
| { |
| "epoch": 2.0763974066935345, |
| "grad_norm": 0.984375, |
| "learning_rate": 1.539629694527189e-05, |
| "loss": 0.2047, |
| "step": 11850 |
| }, |
| { |
| "epoch": 2.0851585771859122, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.5250277437065593e-05, |
| "loss": 0.2044, |
| "step": 11900 |
| }, |
| { |
| "epoch": 2.09391974767829, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.5104257928859297e-05, |
| "loss": 0.207, |
| "step": 11950 |
| }, |
| { |
| "epoch": 2.102680918170668, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.4958238420653e-05, |
| "loss": 0.2058, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.111442088663045, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.4812218912446704e-05, |
| "loss": 0.2035, |
| "step": 12050 |
| }, |
| { |
| "epoch": 2.120203259155423, |
| "grad_norm": 1.125, |
| "learning_rate": 1.4666199404240409e-05, |
| "loss": 0.21, |
| "step": 12100 |
| }, |
| { |
| "epoch": 2.1289644296478007, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.4520179896034111e-05, |
| "loss": 0.2079, |
| "step": 12150 |
| }, |
| { |
| "epoch": 2.1377256001401785, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.4374160387827815e-05, |
| "loss": 0.2057, |
| "step": 12200 |
| }, |
| { |
| "epoch": 2.1464867706325563, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.4228140879621518e-05, |
| "loss": 0.2058, |
| "step": 12250 |
| }, |
| { |
| "epoch": 2.155247941124934, |
| "grad_norm": 0.9375, |
| "learning_rate": 1.4082121371415222e-05, |
| "loss": 0.2087, |
| "step": 12300 |
| }, |
| { |
| "epoch": 2.164009111617312, |
| "grad_norm": 1.125, |
| "learning_rate": 1.3936101863208925e-05, |
| "loss": 0.2049, |
| "step": 12350 |
| }, |
| { |
| "epoch": 2.1727702821096897, |
| "grad_norm": 1.15625, |
| "learning_rate": 1.3790082355002629e-05, |
| "loss": 0.2073, |
| "step": 12400 |
| }, |
| { |
| "epoch": 2.1815314526020675, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.3644062846796332e-05, |
| "loss": 0.2048, |
| "step": 12450 |
| }, |
| { |
| "epoch": 2.1902926230944453, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.3498043338590036e-05, |
| "loss": 0.2062, |
| "step": 12500 |
| }, |
| { |
| "epoch": 2.199053793586823, |
| "grad_norm": 1.15625, |
| "learning_rate": 1.3352023830383739e-05, |
| "loss": 0.2104, |
| "step": 12550 |
| }, |
| { |
| "epoch": 2.207814964079201, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.3206004322177443e-05, |
| "loss": 0.2075, |
| "step": 12600 |
| }, |
| { |
| "epoch": 2.2165761345715786, |
| "grad_norm": 0.9609375, |
| "learning_rate": 1.3059984813971145e-05, |
| "loss": 0.2071, |
| "step": 12650 |
| }, |
| { |
| "epoch": 2.2253373050639564, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.291396530576485e-05, |
| "loss": 0.2063, |
| "step": 12700 |
| }, |
| { |
| "epoch": 2.2340984755563342, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.2767945797558552e-05, |
| "loss": 0.2033, |
| "step": 12750 |
| }, |
| { |
| "epoch": 2.242859646048712, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.262192628935226e-05, |
| "loss": 0.2057, |
| "step": 12800 |
| }, |
| { |
| "epoch": 2.25162081654109, |
| "grad_norm": 1.125, |
| "learning_rate": 1.2475906781145961e-05, |
| "loss": 0.2063, |
| "step": 12850 |
| }, |
| { |
| "epoch": 2.2603819870334676, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.2329887272939665e-05, |
| "loss": 0.2011, |
| "step": 12900 |
| }, |
| { |
| "epoch": 2.2691431575258454, |
| "grad_norm": 1.1328125, |
| "learning_rate": 1.2183867764733368e-05, |
| "loss": 0.2086, |
| "step": 12950 |
| }, |
| { |
| "epoch": 2.277904328018223, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.2037848256527072e-05, |
| "loss": 0.2063, |
| "step": 13000 |
| }, |
| { |
| "epoch": 2.286665498510601, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.1891828748320776e-05, |
| "loss": 0.2082, |
| "step": 13050 |
| }, |
| { |
| "epoch": 2.2954266690029788, |
| "grad_norm": 0.984375, |
| "learning_rate": 1.174580924011448e-05, |
| "loss": 0.2124, |
| "step": 13100 |
| }, |
| { |
| "epoch": 2.3041878394953565, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.1599789731908183e-05, |
| "loss": 0.2078, |
| "step": 13150 |
| }, |
| { |
| "epoch": 2.3129490099877343, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.1453770223701888e-05, |
| "loss": 0.2098, |
| "step": 13200 |
| }, |
| { |
| "epoch": 2.321710180480112, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.130775071549559e-05, |
| "loss": 0.2057, |
| "step": 13250 |
| }, |
| { |
| "epoch": 2.33047135097249, |
| "grad_norm": 1.1875, |
| "learning_rate": 1.1161731207289294e-05, |
| "loss": 0.2048, |
| "step": 13300 |
| }, |
| { |
| "epoch": 2.3392325214648677, |
| "grad_norm": 1.3203125, |
| "learning_rate": 1.1015711699082999e-05, |
| "loss": 0.2049, |
| "step": 13350 |
| }, |
| { |
| "epoch": 2.3479936919572455, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.0869692190876701e-05, |
| "loss": 0.206, |
| "step": 13400 |
| }, |
| { |
| "epoch": 2.3567548624496233, |
| "grad_norm": 1.125, |
| "learning_rate": 1.0723672682670406e-05, |
| "loss": 0.2044, |
| "step": 13450 |
| }, |
| { |
| "epoch": 2.365516032942001, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.0577653174464108e-05, |
| "loss": 0.2062, |
| "step": 13500 |
| }, |
| { |
| "epoch": 2.374277203434379, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.0431633666257812e-05, |
| "loss": 0.206, |
| "step": 13550 |
| }, |
| { |
| "epoch": 2.3830383739267567, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.0285614158051517e-05, |
| "loss": 0.2044, |
| "step": 13600 |
| }, |
| { |
| "epoch": 2.3917995444191344, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.0139594649845221e-05, |
| "loss": 0.207, |
| "step": 13650 |
| }, |
| { |
| "epoch": 2.4005607149115122, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.993575141638924e-06, |
| "loss": 0.2032, |
| "step": 13700 |
| }, |
| { |
| "epoch": 2.40932188540389, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.847555633432628e-06, |
| "loss": 0.2037, |
| "step": 13750 |
| }, |
| { |
| "epoch": 2.418083055896268, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.70153612522633e-06, |
| "loss": 0.2076, |
| "step": 13800 |
| }, |
| { |
| "epoch": 2.4268442263886456, |
| "grad_norm": 0.984375, |
| "learning_rate": 9.555516617020035e-06, |
| "loss": 0.2049, |
| "step": 13850 |
| }, |
| { |
| "epoch": 2.4356053968810234, |
| "grad_norm": 1.125, |
| "learning_rate": 9.409497108813737e-06, |
| "loss": 0.2046, |
| "step": 13900 |
| }, |
| { |
| "epoch": 2.444366567373401, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.263477600607442e-06, |
| "loss": 0.2044, |
| "step": 13950 |
| }, |
| { |
| "epoch": 2.453127737865779, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.117458092401144e-06, |
| "loss": 0.2061, |
| "step": 14000 |
| }, |
| { |
| "epoch": 2.4618889083581568, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.97143858419485e-06, |
| "loss": 0.206, |
| "step": 14050 |
| }, |
| { |
| "epoch": 2.4706500788505346, |
| "grad_norm": 1.0390625, |
| "learning_rate": 8.825419075988553e-06, |
| "loss": 0.2012, |
| "step": 14100 |
| }, |
| { |
| "epoch": 2.4794112493429123, |
| "grad_norm": 1.0, |
| "learning_rate": 8.679399567782257e-06, |
| "loss": 0.2008, |
| "step": 14150 |
| }, |
| { |
| "epoch": 2.48817241983529, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.53338005957596e-06, |
| "loss": 0.2046, |
| "step": 14200 |
| }, |
| { |
| "epoch": 2.496933590327668, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.387360551369664e-06, |
| "loss": 0.2067, |
| "step": 14250 |
| }, |
| { |
| "epoch": 2.5056947608200457, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.241341043163366e-06, |
| "loss": 0.2041, |
| "step": 14300 |
| }, |
| { |
| "epoch": 2.5144559313124235, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.09532153495707e-06, |
| "loss": 0.2066, |
| "step": 14350 |
| }, |
| { |
| "epoch": 2.523217101804801, |
| "grad_norm": 0.984375, |
| "learning_rate": 7.949302026750773e-06, |
| "loss": 0.2036, |
| "step": 14400 |
| }, |
| { |
| "epoch": 2.531978272297179, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.803282518544478e-06, |
| "loss": 0.2002, |
| "step": 14450 |
| }, |
| { |
| "epoch": 2.5407394427895564, |
| "grad_norm": 1.203125, |
| "learning_rate": 7.65726301033818e-06, |
| "loss": 0.2057, |
| "step": 14500 |
| }, |
| { |
| "epoch": 2.5495006132819347, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.511243502131886e-06, |
| "loss": 0.2069, |
| "step": 14550 |
| }, |
| { |
| "epoch": 2.558261783774312, |
| "grad_norm": 1.28125, |
| "learning_rate": 7.36522399392559e-06, |
| "loss": 0.1996, |
| "step": 14600 |
| }, |
| { |
| "epoch": 2.5670229542666902, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.219204485719293e-06, |
| "loss": 0.2085, |
| "step": 14650 |
| }, |
| { |
| "epoch": 2.5757841247590676, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.0731849775129965e-06, |
| "loss": 0.2068, |
| "step": 14700 |
| }, |
| { |
| "epoch": 2.584545295251446, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.9271654693067e-06, |
| "loss": 0.2054, |
| "step": 14750 |
| }, |
| { |
| "epoch": 2.593306465743823, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.781145961100403e-06, |
| "loss": 0.2051, |
| "step": 14800 |
| }, |
| { |
| "epoch": 2.6020676362362014, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.635126452894107e-06, |
| "loss": 0.2063, |
| "step": 14850 |
| }, |
| { |
| "epoch": 2.6108288067285788, |
| "grad_norm": 1.0859375, |
| "learning_rate": 6.48910694468781e-06, |
| "loss": 0.208, |
| "step": 14900 |
| }, |
| { |
| "epoch": 2.619589977220957, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.343087436481514e-06, |
| "loss": 0.2069, |
| "step": 14950 |
| }, |
| { |
| "epoch": 2.6283511477133343, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.197067928275218e-06, |
| "loss": 0.2033, |
| "step": 15000 |
| }, |
| { |
| "epoch": 2.637112318205712, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.0510484200689214e-06, |
| "loss": 0.2058, |
| "step": 15050 |
| }, |
| { |
| "epoch": 2.64587348869809, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.905028911862625e-06, |
| "loss": 0.2044, |
| "step": 15100 |
| }, |
| { |
| "epoch": 2.6546346591904677, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.759009403656328e-06, |
| "loss": 0.2018, |
| "step": 15150 |
| }, |
| { |
| "epoch": 2.6633958296828455, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.612989895450033e-06, |
| "loss": 0.2032, |
| "step": 15200 |
| }, |
| { |
| "epoch": 2.6721570001752233, |
| "grad_norm": 1.015625, |
| "learning_rate": 5.466970387243736e-06, |
| "loss": 0.206, |
| "step": 15250 |
| }, |
| { |
| "epoch": 2.680918170667601, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.3209508790374395e-06, |
| "loss": 0.2051, |
| "step": 15300 |
| }, |
| { |
| "epoch": 2.689679341159979, |
| "grad_norm": 1.0546875, |
| "learning_rate": 5.174931370831143e-06, |
| "loss": 0.2048, |
| "step": 15350 |
| }, |
| { |
| "epoch": 2.6984405116523567, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.028911862624846e-06, |
| "loss": 0.2074, |
| "step": 15400 |
| }, |
| { |
| "epoch": 2.7072016821447344, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.882892354418551e-06, |
| "loss": 0.2043, |
| "step": 15450 |
| }, |
| { |
| "epoch": 2.7159628526371122, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.736872846212254e-06, |
| "loss": 0.2064, |
| "step": 15500 |
| }, |
| { |
| "epoch": 2.72472402312949, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.5908533380059575e-06, |
| "loss": 0.2044, |
| "step": 15550 |
| }, |
| { |
| "epoch": 2.733485193621868, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.444833829799661e-06, |
| "loss": 0.2078, |
| "step": 15600 |
| }, |
| { |
| "epoch": 2.7422463641142456, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.298814321593364e-06, |
| "loss": 0.2018, |
| "step": 15650 |
| }, |
| { |
| "epoch": 2.7510075346066234, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.152794813387069e-06, |
| "loss": 0.2072, |
| "step": 15700 |
| }, |
| { |
| "epoch": 2.759768705099001, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.006775305180772e-06, |
| "loss": 0.2054, |
| "step": 15750 |
| }, |
| { |
| "epoch": 2.768529875591379, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.8607557969744755e-06, |
| "loss": 0.2067, |
| "step": 15800 |
| }, |
| { |
| "epoch": 2.7772910460837568, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.7147362887681794e-06, |
| "loss": 0.2029, |
| "step": 15850 |
| }, |
| { |
| "epoch": 2.7860522165761346, |
| "grad_norm": 1.125, |
| "learning_rate": 3.568716780561883e-06, |
| "loss": 0.2006, |
| "step": 15900 |
| }, |
| { |
| "epoch": 2.7948133870685123, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.422697272355587e-06, |
| "loss": 0.2032, |
| "step": 15950 |
| }, |
| { |
| "epoch": 2.80357455756089, |
| "grad_norm": 1.015625, |
| "learning_rate": 3.2766777641492905e-06, |
| "loss": 0.2056, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.812335728053268, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.130658255942994e-06, |
| "loss": 0.2069, |
| "step": 16050 |
| }, |
| { |
| "epoch": 2.8210968985456457, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.984638747736698e-06, |
| "loss": 0.2057, |
| "step": 16100 |
| }, |
| { |
| "epoch": 2.8298580690380235, |
| "grad_norm": 1.125, |
| "learning_rate": 2.8386192395304013e-06, |
| "loss": 0.2069, |
| "step": 16150 |
| }, |
| { |
| "epoch": 2.8386192395304013, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.6925997313241047e-06, |
| "loss": 0.2047, |
| "step": 16200 |
| }, |
| { |
| "epoch": 2.847380410022779, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.5465802231178086e-06, |
| "loss": 0.2093, |
| "step": 16250 |
| }, |
| { |
| "epoch": 2.856141580515157, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.4005607149115124e-06, |
| "loss": 0.2047, |
| "step": 16300 |
| }, |
| { |
| "epoch": 2.8649027510075347, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.2545412067052163e-06, |
| "loss": 0.2042, |
| "step": 16350 |
| }, |
| { |
| "epoch": 2.8736639214999125, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.1085216984989197e-06, |
| "loss": 0.2051, |
| "step": 16400 |
| }, |
| { |
| "epoch": 2.8824250919922902, |
| "grad_norm": 1.2734375, |
| "learning_rate": 1.962502190292623e-06, |
| "loss": 0.2026, |
| "step": 16450 |
| }, |
| { |
| "epoch": 2.891186262484668, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.8164826820863268e-06, |
| "loss": 0.2134, |
| "step": 16500 |
| }, |
| { |
| "epoch": 2.899947432977046, |
| "grad_norm": 1.265625, |
| "learning_rate": 1.6704631738800305e-06, |
| "loss": 0.2035, |
| "step": 16550 |
| }, |
| { |
| "epoch": 2.9087086034694236, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.524443665673734e-06, |
| "loss": 0.2024, |
| "step": 16600 |
| }, |
| { |
| "epoch": 2.9174697739618014, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.3784241574674377e-06, |
| "loss": 0.209, |
| "step": 16650 |
| }, |
| { |
| "epoch": 2.926230944454179, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.2324046492611414e-06, |
| "loss": 0.2035, |
| "step": 16700 |
| }, |
| { |
| "epoch": 2.934992114946557, |
| "grad_norm": 1.15625, |
| "learning_rate": 1.0863851410548448e-06, |
| "loss": 0.2061, |
| "step": 16750 |
| }, |
| { |
| "epoch": 2.9437532854389348, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.403656328485486e-07, |
| "loss": 0.2049, |
| "step": 16800 |
| }, |
| { |
| "epoch": 2.9525144559313126, |
| "grad_norm": 0.98046875, |
| "learning_rate": 7.943461246422522e-07, |
| "loss": 0.2058, |
| "step": 16850 |
| }, |
| { |
| "epoch": 2.9612756264236904, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.483266164359559e-07, |
| "loss": 0.2049, |
| "step": 16900 |
| }, |
| { |
| "epoch": 2.970036796916068, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.023071082296594e-07, |
| "loss": 0.2014, |
| "step": 16950 |
| }, |
| { |
| "epoch": 2.978797967408446, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.562876000233631e-07, |
| "loss": 0.2054, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.9875591379008233, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.1026809181706677e-07, |
| "loss": 0.2021, |
| "step": 17050 |
| }, |
| { |
| "epoch": 2.9963203083932015, |
| "grad_norm": 1.03125, |
| "learning_rate": 6.42485836107704e-08, |
| "loss": 0.2041, |
| "step": 17100 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 17121, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.174577165177979e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|