{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.011672527248055795,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.9181318120139488e-05,
"grad_norm": 0.5635480880737305,
"learning_rate": 5.000000000000001e-07,
"loss": 0.8507,
"step": 1
},
{
"epoch": 5.8362636240278976e-05,
"grad_norm": 0.33079156279563904,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.1251,
"step": 2
},
{
"epoch": 8.754395436041846e-05,
"grad_norm": 0.3454552888870239,
"learning_rate": 1.5e-06,
"loss": 0.6399,
"step": 3
},
{
"epoch": 0.00011672527248055795,
"grad_norm": 0.4293176829814911,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.5054,
"step": 4
},
{
"epoch": 0.00014590659060069743,
"grad_norm": 0.37919726967811584,
"learning_rate": 2.5e-06,
"loss": 0.735,
"step": 5
},
{
"epoch": 0.00017508790872083693,
"grad_norm": 0.6950544714927673,
"learning_rate": 3e-06,
"loss": 1.4197,
"step": 6
},
{
"epoch": 0.0002042692268409764,
"grad_norm": 0.38271600008010864,
"learning_rate": 3.5e-06,
"loss": 0.7222,
"step": 7
},
{
"epoch": 0.0002334505449611159,
"grad_norm": 0.3510509133338928,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6049,
"step": 8
},
{
"epoch": 0.0002626318630812554,
"grad_norm": 0.29938340187072754,
"learning_rate": 4.5e-06,
"loss": 0.555,
"step": 9
},
{
"epoch": 0.00029181318120139485,
"grad_norm": 0.38278627395629883,
"learning_rate": 5e-06,
"loss": 1.7384,
"step": 10
},
{
"epoch": 0.0003209944993215344,
"grad_norm": 0.3768065273761749,
"learning_rate": 5.500000000000001e-06,
"loss": 0.4364,
"step": 11
},
{
"epoch": 0.00035017581744167385,
"grad_norm": 0.3671921491622925,
"learning_rate": 6e-06,
"loss": 1.203,
"step": 12
},
{
"epoch": 0.00037935713556181333,
"grad_norm": 0.3327710032463074,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.5083,
"step": 13
},
{
"epoch": 0.0004085384536819528,
"grad_norm": 0.35065436363220215,
"learning_rate": 7e-06,
"loss": 0.5446,
"step": 14
},
{
"epoch": 0.0004377197718020923,
"grad_norm": 0.40824198722839355,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4587,
"step": 15
},
{
"epoch": 0.0004669010899222318,
"grad_norm": 0.34073805809020996,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5617,
"step": 16
},
{
"epoch": 0.0004960824080423713,
"grad_norm": 0.3621309697628021,
"learning_rate": 8.5e-06,
"loss": 1.1623,
"step": 17
},
{
"epoch": 0.0005252637261625108,
"grad_norm": 0.31340083479881287,
"learning_rate": 9e-06,
"loss": 0.5276,
"step": 18
},
{
"epoch": 0.0005544450442826502,
"grad_norm": 0.36106982827186584,
"learning_rate": 9.5e-06,
"loss": 0.5017,
"step": 19
},
{
"epoch": 0.0005836263624027897,
"grad_norm": 0.31271892786026,
"learning_rate": 1e-05,
"loss": 0.3743,
"step": 20
},
{
"epoch": 0.0006128076805229292,
"grad_norm": 0.38480448722839355,
"learning_rate": 9.999997874331895e-06,
"loss": 1.1777,
"step": 21
},
{
"epoch": 0.0006419889986430688,
"grad_norm": 0.5181815028190613,
"learning_rate": 9.999991497329387e-06,
"loss": 0.8806,
"step": 22
},
{
"epoch": 0.0006711703167632082,
"grad_norm": 0.31474944949150085,
"learning_rate": 9.9999808689979e-06,
"loss": 0.6258,
"step": 23
},
{
"epoch": 0.0007003516348833477,
"grad_norm": 0.4090331494808197,
"learning_rate": 9.999965989346468e-06,
"loss": 0.5284,
"step": 24
},
{
"epoch": 0.0007295329530034872,
"grad_norm": 0.38257884979248047,
"learning_rate": 9.999946858387744e-06,
"loss": 0.8024,
"step": 25
},
{
"epoch": 0.0007587142711236267,
"grad_norm": 0.3842026889324188,
"learning_rate": 9.999923476137992e-06,
"loss": 1.1511,
"step": 26
},
{
"epoch": 0.0007878955892437661,
"grad_norm": 0.3617384433746338,
"learning_rate": 9.999895842617097e-06,
"loss": 0.6226,
"step": 27
},
{
"epoch": 0.0008170769073639056,
"grad_norm": 0.36202019453048706,
"learning_rate": 9.999863957848556e-06,
"loss": 1.1775,
"step": 28
},
{
"epoch": 0.0008462582254840451,
"grad_norm": 0.40996360778808594,
"learning_rate": 9.999827821859475e-06,
"loss": 0.9613,
"step": 29
},
{
"epoch": 0.0008754395436041846,
"grad_norm": 0.38747870922088623,
"learning_rate": 9.999787434680581e-06,
"loss": 0.7427,
"step": 30
},
{
"epoch": 0.0009046208617243241,
"grad_norm": 0.3686698377132416,
"learning_rate": 9.999742796346215e-06,
"loss": 1.8661,
"step": 31
},
{
"epoch": 0.0009338021798444636,
"grad_norm": 0.4100247323513031,
"learning_rate": 9.99969390689433e-06,
"loss": 0.7473,
"step": 32
},
{
"epoch": 0.0009629834979646031,
"grad_norm": 0.3075491189956665,
"learning_rate": 9.999640766366496e-06,
"loss": 0.352,
"step": 33
},
{
"epoch": 0.0009921648160847426,
"grad_norm": 0.3251459002494812,
"learning_rate": 9.999583374807895e-06,
"loss": 1.0352,
"step": 34
},
{
"epoch": 0.001021346134204882,
"grad_norm": 0.31156864762306213,
"learning_rate": 9.999521732267327e-06,
"loss": 0.4442,
"step": 35
},
{
"epoch": 0.0010505274523250215,
"grad_norm": 0.7785539627075195,
"learning_rate": 9.999455838797207e-06,
"loss": 1.0465,
"step": 36
},
{
"epoch": 0.001079708770445161,
"grad_norm": 0.3474777936935425,
"learning_rate": 9.999385694453557e-06,
"loss": 0.5922,
"step": 37
},
{
"epoch": 0.0011088900885653005,
"grad_norm": 0.3285025656223297,
"learning_rate": 9.99931129929602e-06,
"loss": 0.6125,
"step": 38
},
{
"epoch": 0.00113807140668544,
"grad_norm": 0.31099236011505127,
"learning_rate": 9.999232653387854e-06,
"loss": 0.8929,
"step": 39
},
{
"epoch": 0.0011672527248055794,
"grad_norm": 0.9824883937835693,
"learning_rate": 9.999149756795927e-06,
"loss": 1.0796,
"step": 40
},
{
"epoch": 0.0011964340429257189,
"grad_norm": 0.37850892543792725,
"learning_rate": 9.999062609590723e-06,
"loss": 1.1134,
"step": 41
},
{
"epoch": 0.0012256153610458584,
"grad_norm": 0.35737380385398865,
"learning_rate": 9.998971211846343e-06,
"loss": 1.9588,
"step": 42
},
{
"epoch": 0.0012547966791659978,
"grad_norm": 0.5358197689056396,
"learning_rate": 9.998875563640495e-06,
"loss": 0.5388,
"step": 43
},
{
"epoch": 0.0012839779972861375,
"grad_norm": 0.3402862846851349,
"learning_rate": 9.99877566505451e-06,
"loss": 0.5397,
"step": 44
},
{
"epoch": 0.001313159315406277,
"grad_norm": 0.36641523241996765,
"learning_rate": 9.998671516173327e-06,
"loss": 0.5417,
"step": 45
},
{
"epoch": 0.0013423406335264165,
"grad_norm": 0.3002683222293854,
"learning_rate": 9.9985631170855e-06,
"loss": 0.5729,
"step": 46
},
{
"epoch": 0.001371521951646556,
"grad_norm": 0.5055456161499023,
"learning_rate": 9.998450467883196e-06,
"loss": 0.5473,
"step": 47
},
{
"epoch": 0.0014007032697666954,
"grad_norm": 0.4431580603122711,
"learning_rate": 9.998333568662199e-06,
"loss": 1.6627,
"step": 48
},
{
"epoch": 0.0014298845878868349,
"grad_norm": 0.31471818685531616,
"learning_rate": 9.998212419521905e-06,
"loss": 0.359,
"step": 49
},
{
"epoch": 0.0014590659060069744,
"grad_norm": 0.4176453649997711,
"learning_rate": 9.998087020565319e-06,
"loss": 0.9098,
"step": 50
},
{
"epoch": 0.0014882472241271138,
"grad_norm": 0.3488178253173828,
"learning_rate": 9.997957371899069e-06,
"loss": 0.5606,
"step": 51
},
{
"epoch": 0.0015174285422472533,
"grad_norm": 0.3195094168186188,
"learning_rate": 9.997823473633388e-06,
"loss": 0.5484,
"step": 52
},
{
"epoch": 0.0015466098603673928,
"grad_norm": 0.3707164525985718,
"learning_rate": 9.997685325882125e-06,
"loss": 1.0832,
"step": 53
},
{
"epoch": 0.0015757911784875323,
"grad_norm": 0.5281696319580078,
"learning_rate": 9.997542928762745e-06,
"loss": 0.4327,
"step": 54
},
{
"epoch": 0.0016049724966076717,
"grad_norm": 0.38910332322120667,
"learning_rate": 9.997396282396322e-06,
"loss": 0.563,
"step": 55
},
{
"epoch": 0.0016341538147278112,
"grad_norm": 0.5761558413505554,
"learning_rate": 9.997245386907541e-06,
"loss": 1.533,
"step": 56
},
{
"epoch": 0.0016633351328479507,
"grad_norm": 0.3254339098930359,
"learning_rate": 9.997090242424711e-06,
"loss": 0.4337,
"step": 57
},
{
"epoch": 0.0016925164509680902,
"grad_norm": 0.5125793814659119,
"learning_rate": 9.996930849079741e-06,
"loss": 0.8955,
"step": 58
},
{
"epoch": 0.0017216977690882296,
"grad_norm": 0.4197414517402649,
"learning_rate": 9.99676720700816e-06,
"loss": 2.1629,
"step": 59
},
{
"epoch": 0.001750879087208369,
"grad_norm": 0.4802038371562958,
"learning_rate": 9.996599316349105e-06,
"loss": 0.43,
"step": 60
},
{
"epoch": 0.0017800604053285088,
"grad_norm": 0.30398809909820557,
"learning_rate": 9.99642717724533e-06,
"loss": 1.0333,
"step": 61
},
{
"epoch": 0.0018092417234486483,
"grad_norm": 0.29787677526474,
"learning_rate": 9.996250789843203e-06,
"loss": 0.4196,
"step": 62
},
{
"epoch": 0.0018384230415687877,
"grad_norm": 0.5412101745605469,
"learning_rate": 9.996070154292691e-06,
"loss": 0.9048,
"step": 63
},
{
"epoch": 0.0018676043596889272,
"grad_norm": 0.6284111738204956,
"learning_rate": 9.995885270747393e-06,
"loss": 0.4706,
"step": 64
},
{
"epoch": 0.0018967856778090667,
"grad_norm": 0.43187353014945984,
"learning_rate": 9.9956961393645e-06,
"loss": 1.2573,
"step": 65
},
{
"epoch": 0.0019259669959292062,
"grad_norm": 0.3548312485218048,
"learning_rate": 9.995502760304829e-06,
"loss": 0.6652,
"step": 66
},
{
"epoch": 0.0019551483140493454,
"grad_norm": 0.3329358398914337,
"learning_rate": 9.995305133732805e-06,
"loss": 0.4759,
"step": 67
},
{
"epoch": 0.001984329632169485,
"grad_norm": 0.3695685565471649,
"learning_rate": 9.99510325981646e-06,
"loss": 1.257,
"step": 68
},
{
"epoch": 0.0020135109502896244,
"grad_norm": 0.3101709485054016,
"learning_rate": 9.994897138727446e-06,
"loss": 0.5615,
"step": 69
},
{
"epoch": 0.002042692268409764,
"grad_norm": 0.5336629152297974,
"learning_rate": 9.994686770641015e-06,
"loss": 0.8056,
"step": 70
},
{
"epoch": 0.0020718735865299038,
"grad_norm": 0.31102678179740906,
"learning_rate": 9.994472155736039e-06,
"loss": 1.0285,
"step": 71
},
{
"epoch": 0.002101054904650043,
"grad_norm": 0.31015750765800476,
"learning_rate": 9.994253294194998e-06,
"loss": 0.6796,
"step": 72
},
{
"epoch": 0.0021302362227701827,
"grad_norm": 1.129408597946167,
"learning_rate": 9.994030186203983e-06,
"loss": 0.9488,
"step": 73
},
{
"epoch": 0.002159417540890322,
"grad_norm": 0.3322198987007141,
"learning_rate": 9.993802831952692e-06,
"loss": 1.6218,
"step": 74
},
{
"epoch": 0.0021885988590104617,
"grad_norm": 0.36219432950019836,
"learning_rate": 9.993571231634444e-06,
"loss": 0.4795,
"step": 75
},
{
"epoch": 0.002217780177130601,
"grad_norm": 0.3242811858654022,
"learning_rate": 9.993335385446155e-06,
"loss": 0.4375,
"step": 76
},
{
"epoch": 0.0022469614952507406,
"grad_norm": 0.5745208859443665,
"learning_rate": 9.993095293588359e-06,
"loss": 1.3936,
"step": 77
},
{
"epoch": 0.00227614281337088,
"grad_norm": 0.3800385594367981,
"learning_rate": 9.992850956265198e-06,
"loss": 0.5863,
"step": 78
},
{
"epoch": 0.0023053241314910196,
"grad_norm": 0.38720518350601196,
"learning_rate": 9.992602373684426e-06,
"loss": 0.6313,
"step": 79
},
{
"epoch": 0.002334505449611159,
"grad_norm": 0.5431109070777893,
"learning_rate": 9.992349546057403e-06,
"loss": 0.7509,
"step": 80
},
{
"epoch": 0.0023636867677312985,
"grad_norm": 0.36960089206695557,
"learning_rate": 9.9920924735991e-06,
"loss": 0.9468,
"step": 81
},
{
"epoch": 0.0023928680858514378,
"grad_norm": 0.3455142080783844,
"learning_rate": 9.991831156528095e-06,
"loss": 0.6766,
"step": 82
},
{
"epoch": 0.0024220494039715774,
"grad_norm": 0.288099080324173,
"learning_rate": 9.991565595066582e-06,
"loss": 0.3018,
"step": 83
},
{
"epoch": 0.0024512307220917167,
"grad_norm": 0.3686278164386749,
"learning_rate": 9.991295789440357e-06,
"loss": 0.6696,
"step": 84
},
{
"epoch": 0.0024804120402118564,
"grad_norm": 0.28621384501457214,
"learning_rate": 9.991021739878828e-06,
"loss": 0.4126,
"step": 85
},
{
"epoch": 0.0025095933583319957,
"grad_norm": 0.33376601338386536,
"learning_rate": 9.990743446615008e-06,
"loss": 0.6864,
"step": 86
},
{
"epoch": 0.0025387746764521353,
"grad_norm": 0.36860471963882446,
"learning_rate": 9.990460909885522e-06,
"loss": 0.5158,
"step": 87
},
{
"epoch": 0.002567955994572275,
"grad_norm": 0.4445946216583252,
"learning_rate": 9.9901741299306e-06,
"loss": 1.2633,
"step": 88
},
{
"epoch": 0.0025971373126924143,
"grad_norm": 0.29571008682250977,
"learning_rate": 9.989883106994086e-06,
"loss": 0.9292,
"step": 89
},
{
"epoch": 0.002626318630812554,
"grad_norm": 0.3927260935306549,
"learning_rate": 9.989587841323423e-06,
"loss": 1.0559,
"step": 90
},
{
"epoch": 0.0026554999489326932,
"grad_norm": 0.36762961745262146,
"learning_rate": 9.98928833316967e-06,
"loss": 0.913,
"step": 91
},
{
"epoch": 0.002684681267052833,
"grad_norm": 0.3317353427410126,
"learning_rate": 9.988984582787482e-06,
"loss": 0.6172,
"step": 92
},
{
"epoch": 0.002713862585172972,
"grad_norm": 0.465626984834671,
"learning_rate": 9.988676590435133e-06,
"loss": 0.7472,
"step": 93
},
{
"epoch": 0.002743043903293112,
"grad_norm": 1.3269602060317993,
"learning_rate": 9.9883643563745e-06,
"loss": 0.6623,
"step": 94
},
{
"epoch": 0.002772225221413251,
"grad_norm": 0.4574294984340668,
"learning_rate": 9.988047880871063e-06,
"loss": 1.3339,
"step": 95
},
{
"epoch": 0.002801406539533391,
"grad_norm": 0.34987109899520874,
"learning_rate": 9.98772716419391e-06,
"loss": 0.6319,
"step": 96
},
{
"epoch": 0.00283058785765353,
"grad_norm": 0.402045875787735,
"learning_rate": 9.98740220661574e-06,
"loss": 0.6483,
"step": 97
},
{
"epoch": 0.0028597691757736698,
"grad_norm": 0.29255297780036926,
"learning_rate": 9.987073008412847e-06,
"loss": 0.5413,
"step": 98
},
{
"epoch": 0.002888950493893809,
"grad_norm": 0.3598422110080719,
"learning_rate": 9.986739569865143e-06,
"loss": 0.595,
"step": 99
},
{
"epoch": 0.0029181318120139487,
"grad_norm": 0.38826480507850647,
"learning_rate": 9.986401891256139e-06,
"loss": 0.6546,
"step": 100
},
{
"epoch": 0.002947313130134088,
"grad_norm": 0.42309150099754333,
"learning_rate": 9.98605997287295e-06,
"loss": 1.0045,
"step": 101
},
{
"epoch": 0.0029764944482542277,
"grad_norm": 0.4501197338104248,
"learning_rate": 9.9857138150063e-06,
"loss": 0.8104,
"step": 102
},
{
"epoch": 0.003005675766374367,
"grad_norm": 0.3553255796432495,
"learning_rate": 9.985363417950515e-06,
"loss": 0.6416,
"step": 103
},
{
"epoch": 0.0030348570844945066,
"grad_norm": 0.30859172344207764,
"learning_rate": 9.985008782003524e-06,
"loss": 0.5201,
"step": 104
},
{
"epoch": 0.0030640384026146463,
"grad_norm": 0.3903139531612396,
"learning_rate": 9.984649907466868e-06,
"loss": 0.8591,
"step": 105
},
{
"epoch": 0.0030932197207347856,
"grad_norm": 0.3806699514389038,
"learning_rate": 9.98428679464568e-06,
"loss": 0.6575,
"step": 106
},
{
"epoch": 0.0031224010388549253,
"grad_norm": 0.7828800082206726,
"learning_rate": 9.983919443848706e-06,
"loss": 0.63,
"step": 107
},
{
"epoch": 0.0031515823569750645,
"grad_norm": 0.32453709840774536,
"learning_rate": 9.98354785538829e-06,
"loss": 0.489,
"step": 108
},
{
"epoch": 0.003180763675095204,
"grad_norm": 0.4170776307582855,
"learning_rate": 9.983172029580387e-06,
"loss": 1.1076,
"step": 109
},
{
"epoch": 0.0032099449932153435,
"grad_norm": 0.3523752689361572,
"learning_rate": 9.982791966744545e-06,
"loss": 0.6405,
"step": 110
},
{
"epoch": 0.003239126311335483,
"grad_norm": 0.32706937193870544,
"learning_rate": 9.98240766720392e-06,
"loss": 1.0937,
"step": 111
},
{
"epoch": 0.0032683076294556224,
"grad_norm": 0.3406533896923065,
"learning_rate": 9.982019131285268e-06,
"loss": 1.3389,
"step": 112
},
{
"epoch": 0.003297488947575762,
"grad_norm": 0.376828134059906,
"learning_rate": 9.98162635931895e-06,
"loss": 0.636,
"step": 113
},
{
"epoch": 0.0033266702656959014,
"grad_norm": 0.35637038946151733,
"learning_rate": 9.981229351638926e-06,
"loss": 0.4319,
"step": 114
},
{
"epoch": 0.003355851583816041,
"grad_norm": 0.5207456350326538,
"learning_rate": 9.980828108582759e-06,
"loss": 0.6011,
"step": 115
},
{
"epoch": 0.0033850329019361803,
"grad_norm": 0.464778333902359,
"learning_rate": 9.980422630491614e-06,
"loss": 0.7913,
"step": 116
},
{
"epoch": 0.00341421422005632,
"grad_norm": 0.3464951515197754,
"learning_rate": 9.980012917710254e-06,
"loss": 0.5774,
"step": 117
},
{
"epoch": 0.0034433955381764593,
"grad_norm": 0.36604249477386475,
"learning_rate": 9.979598970587046e-06,
"loss": 0.7515,
"step": 118
},
{
"epoch": 0.003472576856296599,
"grad_norm": 0.35948342084884644,
"learning_rate": 9.979180789473955e-06,
"loss": 0.4906,
"step": 119
},
{
"epoch": 0.003501758174416738,
"grad_norm": 0.3790506422519684,
"learning_rate": 9.978758374726544e-06,
"loss": 0.7257,
"step": 120
},
{
"epoch": 0.003530939492536878,
"grad_norm": 0.36446383595466614,
"learning_rate": 9.978331726703984e-06,
"loss": 0.6115,
"step": 121
},
{
"epoch": 0.0035601208106570176,
"grad_norm": 0.4974438548088074,
"learning_rate": 9.977900845769037e-06,
"loss": 0.6018,
"step": 122
},
{
"epoch": 0.003589302128777157,
"grad_norm": 0.4783862233161926,
"learning_rate": 9.977465732288065e-06,
"loss": 0.7256,
"step": 123
},
{
"epoch": 0.0036184834468972965,
"grad_norm": 0.35444504022598267,
"learning_rate": 9.977026386631032e-06,
"loss": 0.5428,
"step": 124
},
{
"epoch": 0.003647664765017436,
"grad_norm": 0.44525983929634094,
"learning_rate": 9.9765828091715e-06,
"loss": 0.7336,
"step": 125
},
{
"epoch": 0.0036768460831375755,
"grad_norm": 0.32913491129875183,
"learning_rate": 9.97613500028663e-06,
"loss": 0.6213,
"step": 126
},
{
"epoch": 0.0037060274012577147,
"grad_norm": 0.3486779034137726,
"learning_rate": 9.975682960357176e-06,
"loss": 1.1807,
"step": 127
},
{
"epoch": 0.0037352087193778544,
"grad_norm": 0.2928440570831299,
"learning_rate": 9.975226689767494e-06,
"loss": 0.5046,
"step": 128
},
{
"epoch": 0.0037643900374979937,
"grad_norm": 0.3638307750225067,
"learning_rate": 9.974766188905535e-06,
"loss": 1.2703,
"step": 129
},
{
"epoch": 0.0037935713556181334,
"grad_norm": 0.478950560092926,
"learning_rate": 9.97430145816285e-06,
"loss": 1.1034,
"step": 130
},
{
"epoch": 0.0038227526737382726,
"grad_norm": 0.5774679183959961,
"learning_rate": 9.973832497934583e-06,
"loss": 0.5785,
"step": 131
},
{
"epoch": 0.0038519339918584123,
"grad_norm": 0.3301682770252228,
"learning_rate": 9.973359308619476e-06,
"loss": 0.6012,
"step": 132
},
{
"epoch": 0.0038811153099785516,
"grad_norm": 0.4451266825199127,
"learning_rate": 9.972881890619865e-06,
"loss": 0.6879,
"step": 133
},
{
"epoch": 0.003910296628098691,
"grad_norm": 0.6361525654792786,
"learning_rate": 9.972400244341685e-06,
"loss": 0.8636,
"step": 134
},
{
"epoch": 0.003939477946218831,
"grad_norm": 0.3009544909000397,
"learning_rate": 9.971914370194462e-06,
"loss": 0.5197,
"step": 135
},
{
"epoch": 0.00396865926433897,
"grad_norm": 0.36018285155296326,
"learning_rate": 9.97142426859132e-06,
"loss": 0.6695,
"step": 136
},
{
"epoch": 0.0039978405824591095,
"grad_norm": 0.4810916781425476,
"learning_rate": 9.970929939948978e-06,
"loss": 0.9842,
"step": 137
},
{
"epoch": 0.004027021900579249,
"grad_norm": 0.3791263699531555,
"learning_rate": 9.970431384687741e-06,
"loss": 0.6019,
"step": 138
},
{
"epoch": 0.004056203218699389,
"grad_norm": 0.35318222641944885,
"learning_rate": 9.969928603231523e-06,
"loss": 1.1406,
"step": 139
},
{
"epoch": 0.004085384536819528,
"grad_norm": 0.42550453543663025,
"learning_rate": 9.969421596007817e-06,
"loss": 0.7477,
"step": 140
},
{
"epoch": 0.004114565854939667,
"grad_norm": 0.4141107201576233,
"learning_rate": 9.968910363447715e-06,
"loss": 1.1222,
"step": 141
},
{
"epoch": 0.0041437471730598075,
"grad_norm": 0.2917640507221222,
"learning_rate": 9.968394905985905e-06,
"loss": 0.3913,
"step": 142
},
{
"epoch": 0.004172928491179947,
"grad_norm": 0.38363099098205566,
"learning_rate": 9.967875224060658e-06,
"loss": 1.0972,
"step": 143
},
{
"epoch": 0.004202109809300086,
"grad_norm": 0.5850480794906616,
"learning_rate": 9.967351318113847e-06,
"loss": 0.7765,
"step": 144
},
{
"epoch": 0.004231291127420225,
"grad_norm": 0.31228914856910706,
"learning_rate": 9.96682318859093e-06,
"loss": 0.4522,
"step": 145
},
{
"epoch": 0.004260472445540365,
"grad_norm": 0.45077505707740784,
"learning_rate": 9.96629083594096e-06,
"loss": 0.7652,
"step": 146
},
{
"epoch": 0.004289653763660505,
"grad_norm": 0.32041504979133606,
"learning_rate": 9.965754260616576e-06,
"loss": 0.5541,
"step": 147
},
{
"epoch": 0.004318835081780644,
"grad_norm": 0.34780099987983704,
"learning_rate": 9.965213463074013e-06,
"loss": 0.8193,
"step": 148
},
{
"epoch": 0.004348016399900783,
"grad_norm": 0.36673223972320557,
"learning_rate": 9.964668443773094e-06,
"loss": 1.1096,
"step": 149
},
{
"epoch": 0.004377197718020923,
"grad_norm": 0.4030401110649109,
"learning_rate": 9.964119203177228e-06,
"loss": 0.7801,
"step": 150
},
{
"epoch": 0.0044063790361410626,
"grad_norm": 0.5267347693443298,
"learning_rate": 9.963565741753418e-06,
"loss": 0.6537,
"step": 151
},
{
"epoch": 0.004435560354261202,
"grad_norm": 0.6500905156135559,
"learning_rate": 9.963008059972255e-06,
"loss": 0.6598,
"step": 152
},
{
"epoch": 0.004464741672381341,
"grad_norm": 0.44937339425086975,
"learning_rate": 9.962446158307914e-06,
"loss": 0.9024,
"step": 153
},
{
"epoch": 0.004493922990501481,
"grad_norm": 0.47618094086647034,
"learning_rate": 9.961880037238168e-06,
"loss": 0.6525,
"step": 154
},
{
"epoch": 0.0045231043086216205,
"grad_norm": 0.811037003993988,
"learning_rate": 9.961309697244366e-06,
"loss": 1.2114,
"step": 155
},
{
"epoch": 0.00455228562674176,
"grad_norm": 0.30810266733169556,
"learning_rate": 9.960735138811451e-06,
"loss": 0.6901,
"step": 156
},
{
"epoch": 0.0045814669448619,
"grad_norm": 0.4555955231189728,
"learning_rate": 9.960156362427949e-06,
"loss": 0.822,
"step": 157
},
{
"epoch": 0.004610648262982039,
"grad_norm": 0.3224095106124878,
"learning_rate": 9.959573368585979e-06,
"loss": 0.5978,
"step": 158
},
{
"epoch": 0.004639829581102178,
"grad_norm": 0.37293335795402527,
"learning_rate": 9.95898615778124e-06,
"loss": 0.6142,
"step": 159
},
{
"epoch": 0.004669010899222318,
"grad_norm": 0.44633975625038147,
"learning_rate": 9.958394730513014e-06,
"loss": 0.5834,
"step": 160
},
{
"epoch": 0.004698192217342458,
"grad_norm": 0.3741106688976288,
"learning_rate": 9.957799087284177e-06,
"loss": 0.7021,
"step": 161
},
{
"epoch": 0.004727373535462597,
"grad_norm": 0.35581302642822266,
"learning_rate": 9.957199228601183e-06,
"loss": 0.7703,
"step": 162
},
{
"epoch": 0.004756554853582736,
"grad_norm": 0.3660070300102234,
"learning_rate": 9.956595154974073e-06,
"loss": 1.0133,
"step": 163
},
{
"epoch": 0.0047857361717028755,
"grad_norm": 0.35619139671325684,
"learning_rate": 9.955986866916472e-06,
"loss": 0.5734,
"step": 164
},
{
"epoch": 0.004814917489823016,
"grad_norm": 0.3273680806159973,
"learning_rate": 9.955374364945585e-06,
"loss": 1.1949,
"step": 165
},
{
"epoch": 0.004844098807943155,
"grad_norm": 0.3657272458076477,
"learning_rate": 9.954757649582202e-06,
"loss": 0.4649,
"step": 166
},
{
"epoch": 0.004873280126063294,
"grad_norm": 0.3416500985622406,
"learning_rate": 9.9541367213507e-06,
"loss": 0.817,
"step": 167
},
{
"epoch": 0.004902461444183433,
"grad_norm": 0.3144441545009613,
"learning_rate": 9.95351158077903e-06,
"loss": 0.6406,
"step": 168
},
{
"epoch": 0.0049316427623035735,
"grad_norm": 0.4005574584007263,
"learning_rate": 9.952882228398731e-06,
"loss": 0.7143,
"step": 169
},
{
"epoch": 0.004960824080423713,
"grad_norm": 0.45650023221969604,
"learning_rate": 9.952248664744919e-06,
"loss": 0.565,
"step": 170
},
{
"epoch": 0.004990005398543852,
"grad_norm": 0.3257487416267395,
"learning_rate": 9.951610890356291e-06,
"loss": 0.5586,
"step": 171
},
{
"epoch": 0.005019186716663991,
"grad_norm": 0.3424636721611023,
"learning_rate": 9.95096890577513e-06,
"loss": 1.0405,
"step": 172
},
{
"epoch": 0.005048368034784131,
"grad_norm": 0.45603424310684204,
"learning_rate": 9.950322711547292e-06,
"loss": 0.7305,
"step": 173
},
{
"epoch": 0.005077549352904271,
"grad_norm": 0.4074293375015259,
"learning_rate": 9.949672308222214e-06,
"loss": 0.5625,
"step": 174
},
{
"epoch": 0.00510673067102441,
"grad_norm": 0.40585842728614807,
"learning_rate": 9.949017696352914e-06,
"loss": 0.8139,
"step": 175
},
{
"epoch": 0.00513591198914455,
"grad_norm": 0.3102453351020813,
"learning_rate": 9.948358876495985e-06,
"loss": 0.7125,
"step": 176
},
{
"epoch": 0.005165093307264689,
"grad_norm": 0.3370908796787262,
"learning_rate": 9.947695849211603e-06,
"loss": 0.5844,
"step": 177
},
{
"epoch": 0.005194274625384829,
"grad_norm": 0.3051895499229431,
"learning_rate": 9.947028615063515e-06,
"loss": 0.5174,
"step": 178
},
{
"epoch": 0.005223455943504968,
"grad_norm": 0.3698843717575073,
"learning_rate": 9.946357174619052e-06,
"loss": 0.4539,
"step": 179
},
{
"epoch": 0.005252637261625108,
"grad_norm": 0.29240456223487854,
"learning_rate": 9.945681528449116e-06,
"loss": 0.5571,
"step": 180
},
{
"epoch": 0.005281818579745247,
"grad_norm": 0.3231453001499176,
"learning_rate": 9.945001677128185e-06,
"loss": 1.1014,
"step": 181
},
{
"epoch": 0.0053109998978653865,
"grad_norm": 0.3621669113636017,
"learning_rate": 9.944317621234318e-06,
"loss": 0.6259,
"step": 182
},
{
"epoch": 0.005340181215985526,
"grad_norm": 0.3420839011669159,
"learning_rate": 9.943629361349143e-06,
"loss": 0.6329,
"step": 183
},
{
"epoch": 0.005369362534105666,
"grad_norm": 0.45090600848197937,
"learning_rate": 9.942936898057866e-06,
"loss": 1.0192,
"step": 184
},
{
"epoch": 0.005398543852225805,
"grad_norm": 0.32308369874954224,
"learning_rate": 9.942240231949263e-06,
"loss": 0.5465,
"step": 185
},
{
"epoch": 0.005427725170345944,
"grad_norm": 0.34121567010879517,
"learning_rate": 9.94153936361569e-06,
"loss": 0.6829,
"step": 186
},
{
"epoch": 0.005456906488466084,
"grad_norm": 0.3925750255584717,
"learning_rate": 9.940834293653071e-06,
"loss": 1.0301,
"step": 187
},
{
"epoch": 0.005486087806586224,
"grad_norm": 0.40466272830963135,
"learning_rate": 9.940125022660903e-06,
"loss": 0.8774,
"step": 188
},
{
"epoch": 0.005515269124706363,
"grad_norm": 0.3522239625453949,
"learning_rate": 9.939411551242258e-06,
"loss": 0.7484,
"step": 189
},
{
"epoch": 0.005544450442826502,
"grad_norm": 0.28628867864608765,
"learning_rate": 9.938693880003775e-06,
"loss": 0.7138,
"step": 190
},
{
"epoch": 0.0055736317609466415,
"grad_norm": 0.3318323791027069,
"learning_rate": 9.937972009555667e-06,
"loss": 0.514,
"step": 191
},
{
"epoch": 0.005602813079066782,
"grad_norm": 0.34792056679725647,
"learning_rate": 9.937245940511719e-06,
"loss": 0.5826,
"step": 192
},
{
"epoch": 0.005631994397186921,
"grad_norm": 0.3346732556819916,
"learning_rate": 9.93651567348928e-06,
"loss": 0.4349,
"step": 193
},
{
"epoch": 0.00566117571530706,
"grad_norm": 0.37206366658210754,
"learning_rate": 9.935781209109274e-06,
"loss": 1.2471,
"step": 194
},
{
"epoch": 0.0056903570334272,
"grad_norm": 0.4244491755962372,
"learning_rate": 9.935042547996194e-06,
"loss": 0.9581,
"step": 195
},
{
"epoch": 0.0057195383515473396,
"grad_norm": 0.35628461837768555,
"learning_rate": 9.934299690778096e-06,
"loss": 0.4569,
"step": 196
},
{
"epoch": 0.005748719669667479,
"grad_norm": 0.32731014490127563,
"learning_rate": 9.933552638086607e-06,
"loss": 0.5726,
"step": 197
},
{
"epoch": 0.005777900987787618,
"grad_norm": 0.3174387514591217,
"learning_rate": 9.932801390556926e-06,
"loss": 1.3027,
"step": 198
},
{
"epoch": 0.005807082305907758,
"grad_norm": 0.3262479305267334,
"learning_rate": 9.932045948827809e-06,
"loss": 1.1214,
"step": 199
},
{
"epoch": 0.0058362636240278975,
"grad_norm": 0.3247474431991577,
"learning_rate": 9.931286313541586e-06,
"loss": 0.4761,
"step": 200
},
{
"epoch": 0.005865444942148037,
"grad_norm": 0.38119345903396606,
"learning_rate": 9.930522485344149e-06,
"loss": 0.882,
"step": 201
},
{
"epoch": 0.005894626260268176,
"grad_norm": 0.29767370223999023,
"learning_rate": 9.929754464884958e-06,
"loss": 0.6177,
"step": 202
},
{
"epoch": 0.005923807578388316,
"grad_norm": 0.4254716634750366,
"learning_rate": 9.928982252817032e-06,
"loss": 0.6291,
"step": 203
},
{
"epoch": 0.005952988896508455,
"grad_norm": 0.39530616998672485,
"learning_rate": 9.928205849796963e-06,
"loss": 0.9351,
"step": 204
},
{
"epoch": 0.005982170214628595,
"grad_norm": 0.6373737454414368,
"learning_rate": 9.927425256484894e-06,
"loss": 0.6,
"step": 205
},
{
"epoch": 0.006011351532748734,
"grad_norm": 0.37412020564079285,
"learning_rate": 9.926640473544545e-06,
"loss": 0.6374,
"step": 206
},
{
"epoch": 0.006040532850868874,
"grad_norm": 0.33179807662963867,
"learning_rate": 9.925851501643186e-06,
"loss": 0.715,
"step": 207
},
{
"epoch": 0.006069714168989013,
"grad_norm": 0.3322802186012268,
"learning_rate": 9.925058341451659e-06,
"loss": 0.6499,
"step": 208
},
{
"epoch": 0.0060988954871091525,
"grad_norm": 0.34102705121040344,
"learning_rate": 9.924260993644357e-06,
"loss": 0.4996,
"step": 209
},
{
"epoch": 0.006128076805229293,
"grad_norm": 0.3628920614719391,
"learning_rate": 9.92345945889924e-06,
"loss": 1.0272,
"step": 210
},
{
"epoch": 0.006157258123349432,
"grad_norm": 0.3197779357433319,
"learning_rate": 9.92265373789783e-06,
"loss": 0.4657,
"step": 211
},
{
"epoch": 0.006186439441469571,
"grad_norm": 0.3385450839996338,
"learning_rate": 9.9218438313252e-06,
"loss": 0.5419,
"step": 212
},
{
"epoch": 0.00621562075958971,
"grad_norm": 0.6793949604034424,
"learning_rate": 9.921029739869993e-06,
"loss": 1.5859,
"step": 213
},
{
"epoch": 0.0062448020777098505,
"grad_norm": 0.3696063756942749,
"learning_rate": 9.920211464224398e-06,
"loss": 0.5275,
"step": 214
},
{
"epoch": 0.00627398339582999,
"grad_norm": 0.3089805841445923,
"learning_rate": 9.919389005084173e-06,
"loss": 1.0725,
"step": 215
},
{
"epoch": 0.006303164713950129,
"grad_norm": 0.35823842883110046,
"learning_rate": 9.918562363148625e-06,
"loss": 0.58,
"step": 216
},
{
"epoch": 0.006332346032070268,
"grad_norm": 0.35605373978614807,
"learning_rate": 9.917731539120623e-06,
"loss": 0.8045,
"step": 217
},
{
"epoch": 0.006361527350190408,
"grad_norm": 0.9635096192359924,
"learning_rate": 9.916896533706587e-06,
"loss": 0.6127,
"step": 218
},
{
"epoch": 0.006390708668310548,
"grad_norm": 0.44269976019859314,
"learning_rate": 9.916057347616496e-06,
"loss": 1.1253,
"step": 219
},
{
"epoch": 0.006419889986430687,
"grad_norm": 0.35200875997543335,
"learning_rate": 9.915213981563882e-06,
"loss": 0.7173,
"step": 220
},
{
"epoch": 0.006449071304550826,
"grad_norm": 0.3846692442893982,
"learning_rate": 9.914366436265834e-06,
"loss": 0.625,
"step": 221
},
{
"epoch": 0.006478252622670966,
"grad_norm": 1.3158338069915771,
"learning_rate": 9.913514712442987e-06,
"loss": 0.4931,
"step": 222
},
{
"epoch": 0.006507433940791106,
"grad_norm": 0.3677213191986084,
"learning_rate": 9.912658810819537e-06,
"loss": 0.6044,
"step": 223
},
{
"epoch": 0.006536615258911245,
"grad_norm": 0.37323689460754395,
"learning_rate": 9.911798732123231e-06,
"loss": 0.8516,
"step": 224
},
{
"epoch": 0.006565796577031384,
"grad_norm": 0.41386744379997253,
"learning_rate": 9.910934477085363e-06,
"loss": 0.633,
"step": 225
},
{
"epoch": 0.006594977895151524,
"grad_norm": 0.3256453573703766,
"learning_rate": 9.91006604644078e-06,
"loss": 0.5754,
"step": 226
},
{
"epoch": 0.0066241592132716635,
"grad_norm": 0.42724573612213135,
"learning_rate": 9.909193440927882e-06,
"loss": 0.9702,
"step": 227
},
{
"epoch": 0.006653340531391803,
"grad_norm": 0.30440038442611694,
"learning_rate": 9.908316661288617e-06,
"loss": 0.6389,
"step": 228
},
{
"epoch": 0.006682521849511943,
"grad_norm": 0.3871992528438568,
"learning_rate": 9.907435708268483e-06,
"loss": 1.7041,
"step": 229
},
{
"epoch": 0.006711703167632082,
"grad_norm": 0.3268488049507141,
"learning_rate": 9.906550582616521e-06,
"loss": 1.0783,
"step": 230
},
{
"epoch": 0.006740884485752221,
"grad_norm": 0.3559863865375519,
"learning_rate": 9.90566128508533e-06,
"loss": 1.0627,
"step": 231
},
{
"epoch": 0.006770065803872361,
"grad_norm": 0.3503190577030182,
"learning_rate": 9.904767816431043e-06,
"loss": 1.1182,
"step": 232
},
{
"epoch": 0.006799247121992501,
"grad_norm": 0.41551750898361206,
"learning_rate": 9.903870177413354e-06,
"loss": 1.0803,
"step": 233
},
{
"epoch": 0.00682842844011264,
"grad_norm": 0.32648736238479614,
"learning_rate": 9.902968368795496e-06,
"loss": 0.4153,
"step": 234
},
{
"epoch": 0.006857609758232779,
"grad_norm": 0.5350513458251953,
"learning_rate": 9.902062391344245e-06,
"loss": 1.5776,
"step": 235
},
{
"epoch": 0.0068867910763529185,
"grad_norm": 0.4450839161872864,
"learning_rate": 9.901152245829922e-06,
"loss": 0.4149,
"step": 236
},
{
"epoch": 0.006915972394473059,
"grad_norm": 0.31524381041526794,
"learning_rate": 9.900237933026397e-06,
"loss": 0.5053,
"step": 237
},
{
"epoch": 0.006945153712593198,
"grad_norm": 0.36861109733581543,
"learning_rate": 9.899319453711081e-06,
"loss": 1.1878,
"step": 238
},
{
"epoch": 0.006974335030713337,
"grad_norm": 0.43096283078193665,
"learning_rate": 9.898396808664924e-06,
"loss": 0.827,
"step": 239
},
{
"epoch": 0.007003516348833476,
"grad_norm": 0.4168417453765869,
"learning_rate": 9.89746999867242e-06,
"loss": 1.0594,
"step": 240
},
{
"epoch": 0.0070326976669536165,
"grad_norm": 0.7598991394042969,
"learning_rate": 9.89653902452161e-06,
"loss": 1.001,
"step": 241
},
{
"epoch": 0.007061878985073756,
"grad_norm": 0.4895627796649933,
"learning_rate": 9.895603887004068e-06,
"loss": 1.327,
"step": 242
},
{
"epoch": 0.007091060303193895,
"grad_norm": 0.5062536597251892,
"learning_rate": 9.894664586914911e-06,
"loss": 0.9793,
"step": 243
},
{
"epoch": 0.007120241621314035,
"grad_norm": 0.3142737150192261,
"learning_rate": 9.893721125052794e-06,
"loss": 0.6362,
"step": 244
},
{
"epoch": 0.0071494229394341744,
"grad_norm": 0.316954642534256,
"learning_rate": 9.892773502219913e-06,
"loss": 0.3909,
"step": 245
},
{
"epoch": 0.007178604257554314,
"grad_norm": 0.3828858733177185,
"learning_rate": 9.891821719222e-06,
"loss": 0.529,
"step": 246
},
{
"epoch": 0.007207785575674453,
"grad_norm": 0.35143372416496277,
"learning_rate": 9.890865776868324e-06,
"loss": 1.0562,
"step": 247
},
{
"epoch": 0.007236966893794593,
"grad_norm": 0.4023430347442627,
"learning_rate": 9.889905675971694e-06,
"loss": 0.729,
"step": 248
},
{
"epoch": 0.007266148211914732,
"grad_norm": 0.36702919006347656,
"learning_rate": 9.888941417348453e-06,
"loss": 1.1581,
"step": 249
},
{
"epoch": 0.007295329530034872,
"grad_norm": 0.36125248670578003,
"learning_rate": 9.887973001818473e-06,
"loss": 0.5188,
"step": 250
},
{
"epoch": 0.007324510848155011,
"grad_norm": 0.32154127955436707,
"learning_rate": 9.887000430205173e-06,
"loss": 0.5811,
"step": 251
},
{
"epoch": 0.007353692166275151,
"grad_norm": 0.3822373151779175,
"learning_rate": 9.886023703335493e-06,
"loss": 0.69,
"step": 252
},
{
"epoch": 0.00738287348439529,
"grad_norm": 0.4168561100959778,
"learning_rate": 9.885042822039915e-06,
"loss": 0.9765,
"step": 253
},
{
"epoch": 0.0074120548025154295,
"grad_norm": 0.4219783842563629,
"learning_rate": 9.884057787152451e-06,
"loss": 1.3698,
"step": 254
},
{
"epoch": 0.007441236120635569,
"grad_norm": 0.32248085737228394,
"learning_rate": 9.88306859951064e-06,
"loss": 0.4901,
"step": 255
},
{
"epoch": 0.007470417438755709,
"grad_norm": 0.4942518174648285,
"learning_rate": 9.88207525995556e-06,
"loss": 0.7452,
"step": 256
},
{
"epoch": 0.007499598756875848,
"grad_norm": 0.32152867317199707,
"learning_rate": 9.881077769331811e-06,
"loss": 0.6269,
"step": 257
},
{
"epoch": 0.007528780074995987,
"grad_norm": 0.4080241322517395,
"learning_rate": 9.88007612848753e-06,
"loss": 0.9697,
"step": 258
},
{
"epoch": 0.007557961393116127,
"grad_norm": 0.40665292739868164,
"learning_rate": 9.879070338274379e-06,
"loss": 0.6378,
"step": 259
},
{
"epoch": 0.007587142711236267,
"grad_norm": 0.43658050894737244,
"learning_rate": 9.878060399547547e-06,
"loss": 1.5812,
"step": 260
},
{
"epoch": 0.007616324029356406,
"grad_norm": 0.3814539611339569,
"learning_rate": 9.877046313165754e-06,
"loss": 0.8156,
"step": 261
},
{
"epoch": 0.007645505347476545,
"grad_norm": 0.40121573209762573,
"learning_rate": 9.876028079991242e-06,
"loss": 0.735,
"step": 262
},
{
"epoch": 0.007674686665596685,
"grad_norm": 0.3110452890396118,
"learning_rate": 9.875005700889782e-06,
"loss": 1.0318,
"step": 263
},
{
"epoch": 0.007703867983716825,
"grad_norm": 0.36410999298095703,
"learning_rate": 9.87397917673067e-06,
"loss": 1.0398,
"step": 264
},
{
"epoch": 0.007733049301836964,
"grad_norm": 0.3949889838695526,
"learning_rate": 9.872948508386727e-06,
"loss": 1.3404,
"step": 265
},
{
"epoch": 0.007762230619957103,
"grad_norm": 0.3452715277671814,
"learning_rate": 9.871913696734293e-06,
"loss": 1.7137,
"step": 266
},
{
"epoch": 0.007791411938077243,
"grad_norm": 0.49451926350593567,
"learning_rate": 9.870874742653238e-06,
"loss": 0.6079,
"step": 267
},
{
"epoch": 0.007820593256197382,
"grad_norm": 0.353254497051239,
"learning_rate": 9.869831647026948e-06,
"loss": 1.5703,
"step": 268
},
{
"epoch": 0.007849774574317523,
"grad_norm": 0.39976930618286133,
"learning_rate": 9.868784410742337e-06,
"loss": 0.8591,
"step": 269
},
{
"epoch": 0.007878955892437662,
"grad_norm": 0.3353979289531708,
"learning_rate": 9.867733034689828e-06,
"loss": 0.4523,
"step": 270
},
{
"epoch": 0.007908137210557801,
"grad_norm": 0.332116037607193,
"learning_rate": 9.866677519763381e-06,
"loss": 0.6274,
"step": 271
},
{
"epoch": 0.00793731852867794,
"grad_norm": 0.39066842198371887,
"learning_rate": 9.86561786686046e-06,
"loss": 1.1203,
"step": 272
},
{
"epoch": 0.00796649984679808,
"grad_norm": 0.3723788261413574,
"learning_rate": 9.864554076882055e-06,
"loss": 0.5385,
"step": 273
},
{
"epoch": 0.007995681164918219,
"grad_norm": 0.2947128713130951,
"learning_rate": 9.86348615073267e-06,
"loss": 1.0578,
"step": 274
},
{
"epoch": 0.008024862483038358,
"grad_norm": 0.34853091835975647,
"learning_rate": 9.862414089320331e-06,
"loss": 0.4852,
"step": 275
},
{
"epoch": 0.008054043801158497,
"grad_norm": 0.3926672637462616,
"learning_rate": 9.861337893556574e-06,
"loss": 1.1969,
"step": 276
},
{
"epoch": 0.008083225119278638,
"grad_norm": 0.7392916083335876,
"learning_rate": 9.860257564356452e-06,
"loss": 1.2007,
"step": 277
},
{
"epoch": 0.008112406437398778,
"grad_norm": 0.42414942383766174,
"learning_rate": 9.859173102638538e-06,
"loss": 1.0842,
"step": 278
},
{
"epoch": 0.008141587755518917,
"grad_norm": 0.35072061419487,
"learning_rate": 9.858084509324908e-06,
"loss": 1.1563,
"step": 279
},
{
"epoch": 0.008170769073639056,
"grad_norm": 0.44247132539749146,
"learning_rate": 9.856991785341164e-06,
"loss": 0.7369,
"step": 280
},
{
"epoch": 0.008199950391759196,
"grad_norm": 0.39098939299583435,
"learning_rate": 9.855894931616407e-06,
"loss": 0.6189,
"step": 281
},
{
"epoch": 0.008229131709879335,
"grad_norm": 0.3257642090320587,
"learning_rate": 9.854793949083262e-06,
"loss": 1.1748,
"step": 282
},
{
"epoch": 0.008258313027999474,
"grad_norm": 0.4087084233760834,
"learning_rate": 9.853688838677852e-06,
"loss": 0.7535,
"step": 283
},
{
"epoch": 0.008287494346119615,
"grad_norm": 0.32719582319259644,
"learning_rate": 9.852579601339821e-06,
"loss": 1.213,
"step": 284
},
{
"epoch": 0.008316675664239754,
"grad_norm": 0.3474045395851135,
"learning_rate": 9.851466238012317e-06,
"loss": 1.1946,
"step": 285
},
{
"epoch": 0.008345856982359894,
"grad_norm": 0.2800231873989105,
"learning_rate": 9.850348749641993e-06,
"loss": 0.9802,
"step": 286
},
{
"epoch": 0.008375038300480033,
"grad_norm": 0.39679044485092163,
"learning_rate": 9.849227137179015e-06,
"loss": 0.8683,
"step": 287
},
{
"epoch": 0.008404219618600172,
"grad_norm": 0.359581857919693,
"learning_rate": 9.848101401577052e-06,
"loss": 0.5004,
"step": 288
},
{
"epoch": 0.008433400936720311,
"grad_norm": 0.6436200737953186,
"learning_rate": 9.846971543793285e-06,
"loss": 1.0706,
"step": 289
},
{
"epoch": 0.00846258225484045,
"grad_norm": 0.3004380464553833,
"learning_rate": 9.845837564788387e-06,
"loss": 0.4675,
"step": 290
},
{
"epoch": 0.00849176357296059,
"grad_norm": 0.4008398652076721,
"learning_rate": 9.84469946552655e-06,
"loss": 0.6808,
"step": 291
},
{
"epoch": 0.00852094489108073,
"grad_norm": 0.3354049623012543,
"learning_rate": 9.843557246975459e-06,
"loss": 0.4668,
"step": 292
},
{
"epoch": 0.00855012620920087,
"grad_norm": 0.9540996551513672,
"learning_rate": 9.842410910106305e-06,
"loss": 0.6828,
"step": 293
},
{
"epoch": 0.00857930752732101,
"grad_norm": 0.36251530051231384,
"learning_rate": 9.841260455893784e-06,
"loss": 1.7819,
"step": 294
},
{
"epoch": 0.008608488845441149,
"grad_norm": 0.28926217555999756,
"learning_rate": 9.840105885316087e-06,
"loss": 0.4854,
"step": 295
},
{
"epoch": 0.008637670163561288,
"grad_norm": 0.3726727068424225,
"learning_rate": 9.838947199354905e-06,
"loss": 0.5524,
"step": 296
},
{
"epoch": 0.008666851481681427,
"grad_norm": 0.35486266016960144,
"learning_rate": 9.837784398995436e-06,
"loss": 1.1738,
"step": 297
},
{
"epoch": 0.008696032799801566,
"grad_norm": 0.3808845281600952,
"learning_rate": 9.836617485226368e-06,
"loss": 1.1613,
"step": 298
},
{
"epoch": 0.008725214117921707,
"grad_norm": 0.30906185507774353,
"learning_rate": 9.835446459039888e-06,
"loss": 0.6047,
"step": 299
},
{
"epoch": 0.008754395436041847,
"grad_norm": 0.37905341386795044,
"learning_rate": 9.834271321431686e-06,
"loss": 0.5108,
"step": 300
},
{
"epoch": 0.008783576754161986,
"grad_norm": 0.37152329087257385,
"learning_rate": 9.833092073400938e-06,
"loss": 1.2867,
"step": 301
},
{
"epoch": 0.008812758072282125,
"grad_norm": 0.3889938294887543,
"learning_rate": 9.831908715950325e-06,
"loss": 0.516,
"step": 302
},
{
"epoch": 0.008841939390402264,
"grad_norm": 0.31600430607795715,
"learning_rate": 9.830721250086011e-06,
"loss": 0.5475,
"step": 303
},
{
"epoch": 0.008871120708522404,
"grad_norm": 0.36560872197151184,
"learning_rate": 9.829529676817664e-06,
"loss": 0.6687,
"step": 304
},
{
"epoch": 0.008900302026642543,
"grad_norm": 0.33795249462127686,
"learning_rate": 9.828333997158438e-06,
"loss": 0.8979,
"step": 305
},
{
"epoch": 0.008929483344762682,
"grad_norm": 0.37777137756347656,
"learning_rate": 9.827134212124983e-06,
"loss": 0.4487,
"step": 306
},
{
"epoch": 0.008958664662882823,
"grad_norm": 0.38079383969306946,
"learning_rate": 9.825930322737433e-06,
"loss": 0.6556,
"step": 307
},
{
"epoch": 0.008987845981002962,
"grad_norm": 0.35069790482521057,
"learning_rate": 9.824722330019416e-06,
"loss": 0.5511,
"step": 308
},
{
"epoch": 0.009017027299123102,
"grad_norm": 0.3270156979560852,
"learning_rate": 9.823510234998052e-06,
"loss": 0.6456,
"step": 309
},
{
"epoch": 0.009046208617243241,
"grad_norm": 0.3854398727416992,
"learning_rate": 9.822294038703942e-06,
"loss": 1.1342,
"step": 310
},
{
"epoch": 0.00907538993536338,
"grad_norm": 0.5024343729019165,
"learning_rate": 9.821073742171179e-06,
"loss": 1.3163,
"step": 311
},
{
"epoch": 0.00910457125348352,
"grad_norm": 0.3697148561477661,
"learning_rate": 9.819849346437342e-06,
"loss": 0.472,
"step": 312
},
{
"epoch": 0.009133752571603659,
"grad_norm": 0.5523271560668945,
"learning_rate": 9.818620852543495e-06,
"loss": 0.8309,
"step": 313
},
{
"epoch": 0.0091629338897238,
"grad_norm": 0.49423748254776,
"learning_rate": 9.817388261534185e-06,
"loss": 1.4531,
"step": 314
},
{
"epoch": 0.009192115207843939,
"grad_norm": 0.40192165970802307,
"learning_rate": 9.816151574457444e-06,
"loss": 0.6268,
"step": 315
},
{
"epoch": 0.009221296525964078,
"grad_norm": 0.374344140291214,
"learning_rate": 9.814910792364787e-06,
"loss": 0.7519,
"step": 316
},
{
"epoch": 0.009250477844084217,
"grad_norm": 0.3867436945438385,
"learning_rate": 9.81366591631121e-06,
"loss": 0.5716,
"step": 317
},
{
"epoch": 0.009279659162204357,
"grad_norm": 0.3703562319278717,
"learning_rate": 9.812416947355189e-06,
"loss": 0.5265,
"step": 318
},
{
"epoch": 0.009308840480324496,
"grad_norm": 0.3638060390949249,
"learning_rate": 9.811163886558683e-06,
"loss": 0.6116,
"step": 319
},
{
"epoch": 0.009338021798444635,
"grad_norm": 0.308459609746933,
"learning_rate": 9.80990673498713e-06,
"loss": 0.4987,
"step": 320
},
{
"epoch": 0.009367203116564774,
"grad_norm": 0.4370526671409607,
"learning_rate": 9.80864549370944e-06,
"loss": 0.5645,
"step": 321
},
{
"epoch": 0.009396384434684915,
"grad_norm": 0.3499382436275482,
"learning_rate": 9.807380163798009e-06,
"loss": 0.5573,
"step": 322
},
{
"epoch": 0.009425565752805055,
"grad_norm": 0.4345581829547882,
"learning_rate": 9.806110746328705e-06,
"loss": 1.2744,
"step": 323
},
{
"epoch": 0.009454747070925194,
"grad_norm": 0.3100895285606384,
"learning_rate": 9.804837242380873e-06,
"loss": 0.6461,
"step": 324
},
{
"epoch": 0.009483928389045333,
"grad_norm": 0.5291584730148315,
"learning_rate": 9.803559653037328e-06,
"loss": 1.0081,
"step": 325
},
{
"epoch": 0.009513109707165473,
"grad_norm": 0.3431501090526581,
"learning_rate": 9.802277979384367e-06,
"loss": 0.5404,
"step": 326
},
{
"epoch": 0.009542291025285612,
"grad_norm": 0.38304755091667175,
"learning_rate": 9.800992222511753e-06,
"loss": 0.6265,
"step": 327
},
{
"epoch": 0.009571472343405751,
"grad_norm": 0.3770524561405182,
"learning_rate": 9.799702383512721e-06,
"loss": 0.6301,
"step": 328
},
{
"epoch": 0.00960065366152589,
"grad_norm": 0.3577704429626465,
"learning_rate": 9.798408463483982e-06,
"loss": 0.711,
"step": 329
},
{
"epoch": 0.009629834979646031,
"grad_norm": 0.4507421851158142,
"learning_rate": 9.797110463525715e-06,
"loss": 1.2938,
"step": 330
},
{
"epoch": 0.00965901629776617,
"grad_norm": 0.3144557774066925,
"learning_rate": 9.79580838474156e-06,
"loss": 0.5169,
"step": 331
},
{
"epoch": 0.00968819761588631,
"grad_norm": 0.3704127371311188,
"learning_rate": 9.794502228238638e-06,
"loss": 0.7512,
"step": 332
},
{
"epoch": 0.009717378934006449,
"grad_norm": 0.31256386637687683,
"learning_rate": 9.79319199512753e-06,
"loss": 1.0887,
"step": 333
},
{
"epoch": 0.009746560252126588,
"grad_norm": 0.3950251042842865,
"learning_rate": 9.791877686522285e-06,
"loss": 1.4947,
"step": 334
},
{
"epoch": 0.009775741570246728,
"grad_norm": 0.4597548544406891,
"learning_rate": 9.790559303540413e-06,
"loss": 1.3732,
"step": 335
},
{
"epoch": 0.009804922888366867,
"grad_norm": 0.3754691481590271,
"learning_rate": 9.789236847302896e-06,
"loss": 0.4873,
"step": 336
},
{
"epoch": 0.009834104206487008,
"grad_norm": 0.33539366722106934,
"learning_rate": 9.787910318934172e-06,
"loss": 0.5393,
"step": 337
},
{
"epoch": 0.009863285524607147,
"grad_norm": 0.31377556920051575,
"learning_rate": 9.786579719562146e-06,
"loss": 0.4385,
"step": 338
},
{
"epoch": 0.009892466842727286,
"grad_norm": 0.3356408178806305,
"learning_rate": 9.785245050318184e-06,
"loss": 0.5648,
"step": 339
},
{
"epoch": 0.009921648160847426,
"grad_norm": 0.34536507725715637,
"learning_rate": 9.78390631233711e-06,
"loss": 0.4974,
"step": 340
},
{
"epoch": 0.009950829478967565,
"grad_norm": 0.3652310371398926,
"learning_rate": 9.78256350675721e-06,
"loss": 1.1248,
"step": 341
},
{
"epoch": 0.009980010797087704,
"grad_norm": 0.4672922194004059,
"learning_rate": 9.781216634720227e-06,
"loss": 1.1616,
"step": 342
},
{
"epoch": 0.010009192115207843,
"grad_norm": 0.36882463097572327,
"learning_rate": 9.779865697371362e-06,
"loss": 0.5269,
"step": 343
},
{
"epoch": 0.010038373433327983,
"grad_norm": 0.4571130573749542,
"learning_rate": 9.778510695859274e-06,
"loss": 0.8763,
"step": 344
},
{
"epoch": 0.010067554751448124,
"grad_norm": 0.33660727739334106,
"learning_rate": 9.777151631336074e-06,
"loss": 0.5926,
"step": 345
},
{
"epoch": 0.010096736069568263,
"grad_norm": 0.4739161431789398,
"learning_rate": 9.775788504957334e-06,
"loss": 0.7474,
"step": 346
},
{
"epoch": 0.010125917387688402,
"grad_norm": 0.34859299659729004,
"learning_rate": 9.774421317882071e-06,
"loss": 0.6424,
"step": 347
},
{
"epoch": 0.010155098705808541,
"grad_norm": 0.3759802579879761,
"learning_rate": 9.773050071272764e-06,
"loss": 0.6449,
"step": 348
},
{
"epoch": 0.01018428002392868,
"grad_norm": 0.40361499786376953,
"learning_rate": 9.771674766295334e-06,
"loss": 1.6188,
"step": 349
},
{
"epoch": 0.01021346134204882,
"grad_norm": 0.3608465790748596,
"learning_rate": 9.770295404119163e-06,
"loss": 0.5166,
"step": 350
},
{
"epoch": 0.01024264266016896,
"grad_norm": 0.3379482626914978,
"learning_rate": 9.768911985917073e-06,
"loss": 0.5377,
"step": 351
},
{
"epoch": 0.0102718239782891,
"grad_norm": 0.39209169149398804,
"learning_rate": 9.767524512865342e-06,
"loss": 0.8392,
"step": 352
},
{
"epoch": 0.01030100529640924,
"grad_norm": 0.3134850859642029,
"learning_rate": 9.766132986143694e-06,
"loss": 0.6323,
"step": 353
},
{
"epoch": 0.010330186614529379,
"grad_norm": 0.409471333026886,
"learning_rate": 9.764737406935295e-06,
"loss": 0.6607,
"step": 354
},
{
"epoch": 0.010359367932649518,
"grad_norm": 0.40053945779800415,
"learning_rate": 9.763337776426762e-06,
"loss": 0.9634,
"step": 355
},
{
"epoch": 0.010388549250769657,
"grad_norm": 0.5267995595932007,
"learning_rate": 9.761934095808156e-06,
"loss": 1.3353,
"step": 356
},
{
"epoch": 0.010417730568889796,
"grad_norm": 0.3803673982620239,
"learning_rate": 9.760526366272978e-06,
"loss": 0.7678,
"step": 357
},
{
"epoch": 0.010446911887009936,
"grad_norm": 0.3714295029640198,
"learning_rate": 9.759114589018178e-06,
"loss": 0.5707,
"step": 358
},
{
"epoch": 0.010476093205130075,
"grad_norm": 0.34496983885765076,
"learning_rate": 9.75769876524414e-06,
"loss": 0.7947,
"step": 359
},
{
"epoch": 0.010505274523250216,
"grad_norm": 0.32947367429733276,
"learning_rate": 9.756278896154693e-06,
"loss": 0.9273,
"step": 360
},
{
"epoch": 0.010534455841370355,
"grad_norm": 0.399161159992218,
"learning_rate": 9.75485498295711e-06,
"loss": 1.1393,
"step": 361
},
{
"epoch": 0.010563637159490494,
"grad_norm": 1.0698360204696655,
"learning_rate": 9.753427026862092e-06,
"loss": 1.2065,
"step": 362
},
{
"epoch": 0.010592818477610634,
"grad_norm": 0.36978745460510254,
"learning_rate": 9.751995029083786e-06,
"loss": 1.0173,
"step": 363
},
{
"epoch": 0.010621999795730773,
"grad_norm": 0.4568940997123718,
"learning_rate": 9.750558990839773e-06,
"loss": 0.6282,
"step": 364
},
{
"epoch": 0.010651181113850912,
"grad_norm": 0.3994036614894867,
"learning_rate": 9.749118913351069e-06,
"loss": 0.8021,
"step": 365
},
{
"epoch": 0.010680362431971051,
"grad_norm": 0.3851848244667053,
"learning_rate": 9.747674797842124e-06,
"loss": 0.6805,
"step": 366
},
{
"epoch": 0.010709543750091192,
"grad_norm": 0.36664196848869324,
"learning_rate": 9.746226645540822e-06,
"loss": 0.6545,
"step": 367
},
{
"epoch": 0.010738725068211332,
"grad_norm": 0.3806273937225342,
"learning_rate": 9.74477445767848e-06,
"loss": 0.7037,
"step": 368
},
{
"epoch": 0.010767906386331471,
"grad_norm": 0.3356926143169403,
"learning_rate": 9.743318235489846e-06,
"loss": 0.5096,
"step": 369
},
{
"epoch": 0.01079708770445161,
"grad_norm": 0.5911070108413696,
"learning_rate": 9.741857980213101e-06,
"loss": 0.9225,
"step": 370
},
{
"epoch": 0.01082626902257175,
"grad_norm": 0.36224010586738586,
"learning_rate": 9.740393693089844e-06,
"loss": 0.6222,
"step": 371
},
{
"epoch": 0.010855450340691889,
"grad_norm": 0.35221293568611145,
"learning_rate": 9.73892537536512e-06,
"loss": 0.4876,
"step": 372
},
{
"epoch": 0.010884631658812028,
"grad_norm": 0.33988136053085327,
"learning_rate": 9.737453028287383e-06,
"loss": 0.7258,
"step": 373
},
{
"epoch": 0.010913812976932167,
"grad_norm": 0.32662105560302734,
"learning_rate": 9.735976653108527e-06,
"loss": 0.558,
"step": 374
},
{
"epoch": 0.010942994295052308,
"grad_norm": 0.33718347549438477,
"learning_rate": 9.734496251083865e-06,
"loss": 0.5788,
"step": 375
},
{
"epoch": 0.010972175613172448,
"grad_norm": 0.38472211360931396,
"learning_rate": 9.733011823472131e-06,
"loss": 0.7207,
"step": 376
},
{
"epoch": 0.011001356931292587,
"grad_norm": 0.3394148349761963,
"learning_rate": 9.731523371535488e-06,
"loss": 0.5383,
"step": 377
},
{
"epoch": 0.011030538249412726,
"grad_norm": 0.3245641589164734,
"learning_rate": 9.730030896539518e-06,
"loss": 0.517,
"step": 378
},
{
"epoch": 0.011059719567532865,
"grad_norm": 0.3804280459880829,
"learning_rate": 9.728534399753222e-06,
"loss": 0.5206,
"step": 379
},
{
"epoch": 0.011088900885653005,
"grad_norm": 0.6100478172302246,
"learning_rate": 9.727033882449023e-06,
"loss": 0.7553,
"step": 380
},
{
"epoch": 0.011118082203773144,
"grad_norm": 0.4965435266494751,
"learning_rate": 9.725529345902763e-06,
"loss": 0.574,
"step": 381
},
{
"epoch": 0.011147263521893283,
"grad_norm": 0.31076616048812866,
"learning_rate": 9.724020791393698e-06,
"loss": 0.5868,
"step": 382
},
{
"epoch": 0.011176444840013424,
"grad_norm": 0.3200540542602539,
"learning_rate": 9.722508220204501e-06,
"loss": 0.6465,
"step": 383
},
{
"epoch": 0.011205626158133563,
"grad_norm": 0.44288474321365356,
"learning_rate": 9.720991633621268e-06,
"loss": 1.0584,
"step": 384
},
{
"epoch": 0.011234807476253703,
"grad_norm": 0.33271703124046326,
"learning_rate": 9.719471032933496e-06,
"loss": 0.6802,
"step": 385
},
{
"epoch": 0.011263988794373842,
"grad_norm": 0.3422086238861084,
"learning_rate": 9.717946419434108e-06,
"loss": 0.4469,
"step": 386
},
{
"epoch": 0.011293170112493981,
"grad_norm": 0.46215254068374634,
"learning_rate": 9.716417794419428e-06,
"loss": 0.5714,
"step": 387
},
{
"epoch": 0.01132235143061412,
"grad_norm": 0.32373300194740295,
"learning_rate": 9.714885159189198e-06,
"loss": 1.0706,
"step": 388
},
{
"epoch": 0.01135153274873426,
"grad_norm": 0.3015748858451843,
"learning_rate": 9.713348515046566e-06,
"loss": 0.4271,
"step": 389
},
{
"epoch": 0.0113807140668544,
"grad_norm": 0.6078203320503235,
"learning_rate": 9.711807863298092e-06,
"loss": 0.7983,
"step": 390
},
{
"epoch": 0.01140989538497454,
"grad_norm": 0.44928136467933655,
"learning_rate": 9.710263205253743e-06,
"loss": 1.0152,
"step": 391
},
{
"epoch": 0.011439076703094679,
"grad_norm": 0.3478546440601349,
"learning_rate": 9.708714542226887e-06,
"loss": 0.6882,
"step": 392
},
{
"epoch": 0.011468258021214818,
"grad_norm": 0.6222825050354004,
"learning_rate": 9.707161875534304e-06,
"loss": 1.0782,
"step": 393
},
{
"epoch": 0.011497439339334958,
"grad_norm": 0.36261993646621704,
"learning_rate": 9.705605206496176e-06,
"loss": 0.625,
"step": 394
},
{
"epoch": 0.011526620657455097,
"grad_norm": 0.304210901260376,
"learning_rate": 9.704044536436085e-06,
"loss": 0.3843,
"step": 395
},
{
"epoch": 0.011555801975575236,
"grad_norm": 0.35136616230010986,
"learning_rate": 9.702479866681023e-06,
"loss": 1.0645,
"step": 396
},
{
"epoch": 0.011584983293695375,
"grad_norm": 0.35249078273773193,
"learning_rate": 9.700911198561371e-06,
"loss": 0.559,
"step": 397
},
{
"epoch": 0.011614164611815516,
"grad_norm": 0.28680017590522766,
"learning_rate": 9.69933853341092e-06,
"loss": 0.4269,
"step": 398
},
{
"epoch": 0.011643345929935656,
"grad_norm": 0.3262515962123871,
"learning_rate": 9.697761872566856e-06,
"loss": 0.5446,
"step": 399
},
{
"epoch": 0.011672527248055795,
"grad_norm": 0.3692777156829834,
"learning_rate": 9.69618121736976e-06,
"loss": 0.5779,
"step": 400
}
],
"logging_steps": 1,
"max_steps": 3427,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.552468222017536e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}