xinchen9's picture
Upload folder using huggingface_hub
74a62d8 verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.14378145219266714,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_ib": 64.96094512939453,
"ce_orig": 0.7435811758041382,
"epoch": 0,
"kl_loss": 3856.220703125,
"loss_ib": 192.84352111816406,
"step": 0
},
{
"ce_ib": 65.86748504638672,
"ce_orig": 1.261900782585144,
"epoch": 0.00014378145219266715,
"kl_loss": 3989.227294921875,
"loss_ib": 199.4943084716797,
"step": 1
},
{
"ce_ib": 63.70602798461914,
"ce_orig": 1.069283127784729,
"epoch": 0.0002875629043853343,
"kl_loss": 3018.861572265625,
"loss_ib": 150.9749298095703,
"step": 2
},
{
"ce_ib": 65.56930541992188,
"ce_orig": 0.9985544085502625,
"epoch": 0.00043134435657800146,
"kl_loss": 3440.156494140625,
"loss_ib": 172.04061889648438,
"step": 3
},
{
"ce_ib": 62.74740982055664,
"ce_orig": 0.7562570571899414,
"epoch": 0.0005751258087706686,
"kl_loss": 3999.942626953125,
"loss_ib": 200.02850341796875,
"step": 4
},
{
"epoch": 0.0007189072609633358,
"grad_norm": Infinity,
"learning_rate": 0.0,
"loss": 183.2429,
"step": 5
},
{
"ce_ib": 64.26500701904297,
"ce_orig": 0.704367995262146,
"epoch": 0.0007189072609633358,
"kl_loss": 3585.45947265625,
"loss_ib": 179.3050994873047,
"step": 5
},
{
"ce_ib": 64.85440826416016,
"ce_orig": 0.990234375,
"epoch": 0.0008626887131560029,
"kl_loss": 3754.7421875,
"loss_ib": 187.76953125,
"step": 6
},
{
"ce_ib": 64.96634674072266,
"ce_orig": 0.9086857438087463,
"epoch": 0.00100647016534867,
"kl_loss": 3929.58203125,
"loss_ib": 196.51158142089844,
"step": 7
},
{
"ce_ib": 64.07415771484375,
"ce_orig": 0.7150144577026367,
"epoch": 0.0011502516175413372,
"kl_loss": 3740.4150390625,
"loss_ib": 187.0527801513672,
"step": 8
},
{
"ce_ib": 65.45182800292969,
"ce_orig": 0.8336902856826782,
"epoch": 0.0012940330697340044,
"kl_loss": 3728.460205078125,
"loss_ib": 186.4557342529297,
"step": 9
},
{
"epoch": 0.0014378145219266715,
"grad_norm": 2721.89794921875,
"learning_rate": 0.0,
"loss": 185.5215,
"step": 10
},
{
"ce_ib": 64.31118774414062,
"ce_orig": 0.773049533367157,
"epoch": 0.0014378145219266715,
"kl_loss": 3938.441650390625,
"loss_ib": 196.95423889160156,
"step": 10
},
{
"ce_ib": 68.13079071044922,
"ce_orig": 1.7455261945724487,
"epoch": 0.0015815959741193387,
"kl_loss": 3521.255615234375,
"loss_ib": 176.0968475341797,
"step": 11
},
{
"ce_ib": 65.10643005371094,
"ce_orig": 1.115777850151062,
"epoch": 0.0017253774263120058,
"kl_loss": 3872.32763671875,
"loss_ib": 193.64894104003906,
"step": 12
},
{
"ce_ib": 66.39643859863281,
"ce_orig": 0.9639286994934082,
"epoch": 0.001869158878504673,
"kl_loss": 3771.317626953125,
"loss_ib": 188.59909057617188,
"step": 13
},
{
"ce_ib": 63.847900390625,
"ce_orig": 0.6167153716087341,
"epoch": 0.00201294033069734,
"kl_loss": 3233.497314453125,
"loss_ib": 161.706787109375,
"step": 14
},
{
"epoch": 0.002156721782890007,
"grad_norm": 2457.46435546875,
"learning_rate": 3.9936102236421723e-07,
"loss": 181.2678,
"step": 15
},
{
"ce_ib": 64.9281997680664,
"ce_orig": 1.2498202323913574,
"epoch": 0.002156721782890007,
"kl_loss": 3831.1611328125,
"loss_ib": 191.59051513671875,
"step": 15
},
{
"ce_ib": 65.2757797241211,
"ce_orig": 1.0968470573425293,
"epoch": 0.0023005032350826744,
"kl_loss": 3685.48193359375,
"loss_ib": 184.30674743652344,
"step": 16
},
{
"ce_ib": 64.68568420410156,
"ce_orig": 0.8841207027435303,
"epoch": 0.0024442846872753414,
"kl_loss": 3744.38134765625,
"loss_ib": 187.2514190673828,
"step": 17
},
{
"ce_ib": 64.82100677490234,
"ce_orig": 1.1195234060287476,
"epoch": 0.0025880661394680087,
"kl_loss": 3526.25927734375,
"loss_ib": 176.3453826904297,
"step": 18
},
{
"ce_ib": 63.829200744628906,
"ce_orig": 0.9802423119544983,
"epoch": 0.0027318475916606757,
"kl_loss": 3915.511474609375,
"loss_ib": 195.80747985839844,
"step": 19
},
{
"epoch": 0.002875629043853343,
"grad_norm": 2735.018310546875,
"learning_rate": 7.987220447284345e-07,
"loss": 187.5199,
"step": 20
},
{
"ce_ib": 64.30339050292969,
"ce_orig": 0.713705837726593,
"epoch": 0.002875629043853343,
"kl_loss": 3551.751220703125,
"loss_ib": 177.61972045898438,
"step": 20
},
{
"ce_ib": 65.2120132446289,
"ce_orig": 1.1765546798706055,
"epoch": 0.00301941049604601,
"kl_loss": 2725.1201171875,
"loss_ib": 136.2886199951172,
"step": 21
},
{
"ce_ib": 64.87537384033203,
"ce_orig": 1.1634544134140015,
"epoch": 0.0031631919482386773,
"kl_loss": 3579.1513671875,
"loss_ib": 178.99000549316406,
"step": 22
},
{
"ce_ib": 63.88631820678711,
"ce_orig": 0.9461633563041687,
"epoch": 0.0033069734004313443,
"kl_loss": 3845.448974609375,
"loss_ib": 192.3043975830078,
"step": 23
},
{
"ce_ib": 64.70732116699219,
"ce_orig": 1.03489351272583,
"epoch": 0.0034507548526240116,
"kl_loss": 3712.78076171875,
"loss_ib": 185.67138671875,
"step": 24
},
{
"epoch": 0.0035945363048166786,
"grad_norm": 2554.739501953125,
"learning_rate": 1.1980830670926517e-06,
"loss": 181.5605,
"step": 25
},
{
"ce_ib": 60.482200622558594,
"ce_orig": 0.3934582769870758,
"epoch": 0.0035945363048166786,
"kl_loss": 3043.28076171875,
"loss_ib": 152.19427490234375,
"step": 25
},
{
"ce_ib": 63.36570739746094,
"ce_orig": 0.7366315126419067,
"epoch": 0.003738317757009346,
"kl_loss": 3766.66796875,
"loss_ib": 188.36508178710938,
"step": 26
},
{
"ce_ib": 64.73159790039062,
"ce_orig": 0.8274144530296326,
"epoch": 0.003882099209202013,
"kl_loss": 4114.853515625,
"loss_ib": 205.77505493164062,
"step": 27
},
{
"ce_ib": 64.48138427734375,
"ce_orig": 0.9502752423286438,
"epoch": 0.00402588066139468,
"kl_loss": 3778.4033203125,
"loss_ib": 188.95240783691406,
"step": 28
},
{
"ce_ib": 65.42862701416016,
"ce_orig": 1.2197273969650269,
"epoch": 0.004169662113587347,
"kl_loss": 3769.447509765625,
"loss_ib": 188.50509643554688,
"step": 29
},
{
"epoch": 0.004313443565780014,
"grad_norm": 2604.34765625,
"learning_rate": 1.597444089456869e-06,
"loss": 185.5906,
"step": 30
},
{
"ce_ib": 66.01839447021484,
"ce_orig": 1.2358959913253784,
"epoch": 0.004313443565780014,
"kl_loss": 3728.17578125,
"loss_ib": 186.44180297851562,
"step": 30
},
{
"ce_ib": 63.86208724975586,
"ce_orig": 0.9959704875946045,
"epoch": 0.004457225017972682,
"kl_loss": 4007.545654296875,
"loss_ib": 200.4092254638672,
"step": 31
},
{
"ce_ib": 63.890628814697266,
"ce_orig": 0.785792887210846,
"epoch": 0.004601006470165349,
"kl_loss": 3334.66552734375,
"loss_ib": 166.76522827148438,
"step": 32
},
{
"ce_ib": 63.21049118041992,
"ce_orig": 0.9940950274467468,
"epoch": 0.004744787922358016,
"kl_loss": 3633.696533203125,
"loss_ib": 181.7164306640625,
"step": 33
},
{
"ce_ib": 65.66783905029297,
"ce_orig": 1.3076696395874023,
"epoch": 0.004888569374550683,
"kl_loss": 3813.81396484375,
"loss_ib": 190.72354125976562,
"step": 34
},
{
"epoch": 0.0050323508267433505,
"grad_norm": 2616.953369140625,
"learning_rate": 1.9968051118210863e-06,
"loss": 187.4055,
"step": 35
},
{
"ce_ib": 63.93610763549805,
"ce_orig": 1.0693968534469604,
"epoch": 0.0050323508267433505,
"kl_loss": 2865.432373046875,
"loss_ib": 143.3035888671875,
"step": 35
},
{
"ce_ib": 63.83279037475586,
"ce_orig": 0.9967127442359924,
"epoch": 0.0051761322789360175,
"kl_loss": 3682.119384765625,
"loss_ib": 184.1378936767578,
"step": 36
},
{
"ce_ib": 63.78058624267578,
"ce_orig": 0.7548370957374573,
"epoch": 0.005319913731128684,
"kl_loss": 3521.068359375,
"loss_ib": 176.08531188964844,
"step": 37
},
{
"ce_ib": 63.56543731689453,
"ce_orig": 0.9782358407974243,
"epoch": 0.005463695183321351,
"kl_loss": 3663.02587890625,
"loss_ib": 183.18309020996094,
"step": 38
},
{
"ce_ib": 65.04608917236328,
"ce_orig": 0.8212652802467346,
"epoch": 0.005607476635514018,
"kl_loss": 4082.810546875,
"loss_ib": 204.1730499267578,
"step": 39
},
{
"epoch": 0.005751258087706686,
"grad_norm": 2647.227294921875,
"learning_rate": 2.3961661341853035e-06,
"loss": 181.9243,
"step": 40
},
{
"ce_ib": 65.51580047607422,
"ce_orig": 1.1804547309875488,
"epoch": 0.005751258087706686,
"kl_loss": 3949.86279296875,
"loss_ib": 197.52590942382812,
"step": 40
},
{
"ce_ib": 63.80126190185547,
"ce_orig": 0.7081286907196045,
"epoch": 0.005895039539899353,
"kl_loss": 3912.12548828125,
"loss_ib": 195.63816833496094,
"step": 41
},
{
"ce_ib": 62.75702667236328,
"ce_orig": 0.7087532877922058,
"epoch": 0.00603882099209202,
"kl_loss": 3891.41259765625,
"loss_ib": 194.60202026367188,
"step": 42
},
{
"ce_ib": 64.6529541015625,
"ce_orig": 1.2302581071853638,
"epoch": 0.006182602444284687,
"kl_loss": 3733.06787109375,
"loss_ib": 186.6857147216797,
"step": 43
},
{
"ce_ib": 63.076133728027344,
"ce_orig": 0.8998420238494873,
"epoch": 0.006326383896477355,
"kl_loss": 3209.076904296875,
"loss_ib": 160.4853973388672,
"step": 44
},
{
"epoch": 0.006470165348670022,
"grad_norm": 2475.28173828125,
"learning_rate": 2.7955271565495207e-06,
"loss": 182.2709,
"step": 45
},
{
"ce_ib": 61.6526985168457,
"ce_orig": 0.6503346562385559,
"epoch": 0.006470165348670022,
"kl_loss": 3745.544921875,
"loss_ib": 187.30807495117188,
"step": 45
},
{
"ce_ib": 63.45009231567383,
"ce_orig": 1.2053508758544922,
"epoch": 0.0066139468008626886,
"kl_loss": 3523.22900390625,
"loss_ib": 176.19317626953125,
"step": 46
},
{
"ce_ib": 64.10655212402344,
"ce_orig": 1.1375271081924438,
"epoch": 0.0067577282530553555,
"kl_loss": 3449.62353515625,
"loss_ib": 172.5132293701172,
"step": 47
},
{
"ce_ib": 61.43892288208008,
"ce_orig": 0.6051114201545715,
"epoch": 0.006901509705248023,
"kl_loss": 2918.55615234375,
"loss_ib": 145.9585418701172,
"step": 48
},
{
"ce_ib": 62.07788848876953,
"ce_orig": 0.6533306837081909,
"epoch": 0.00704529115744069,
"kl_loss": 3852.226318359375,
"loss_ib": 192.64236450195312,
"step": 49
},
{
"epoch": 0.007189072609633357,
"grad_norm": 2666.057861328125,
"learning_rate": 3.194888178913738e-06,
"loss": 178.7684,
"step": 50
},
{
"ce_ib": 63.88691329956055,
"ce_orig": 0.9801137447357178,
"epoch": 0.007189072609633357,
"kl_loss": 3421.68798828125,
"loss_ib": 171.11634826660156,
"step": 50
},
{
"ce_ib": 63.39102554321289,
"ce_orig": 1.208335041999817,
"epoch": 0.007332854061826024,
"kl_loss": 3816.6484375,
"loss_ib": 190.86412048339844,
"step": 51
},
{
"ce_ib": 62.79481506347656,
"ce_orig": 1.1687792539596558,
"epoch": 0.007476635514018692,
"kl_loss": 3562.60595703125,
"loss_ib": 178.1616973876953,
"step": 52
},
{
"ce_ib": 59.30780792236328,
"ce_orig": 0.48160627484321594,
"epoch": 0.007620416966211359,
"kl_loss": 2727.423828125,
"loss_ib": 136.40084838867188,
"step": 53
},
{
"ce_ib": 60.45918655395508,
"ce_orig": 0.729110598564148,
"epoch": 0.007764198418404026,
"kl_loss": 3612.10888671875,
"loss_ib": 180.6356658935547,
"step": 54
},
{
"epoch": 0.007907979870596693,
"grad_norm": 2656.139404296875,
"learning_rate": 3.5942492012779555e-06,
"loss": 179.8713,
"step": 55
},
{
"ce_ib": 62.10517883300781,
"ce_orig": 0.8718740344047546,
"epoch": 0.007907979870596693,
"kl_loss": 3526.4736328125,
"loss_ib": 176.354736328125,
"step": 55
},
{
"ce_ib": 61.65966796875,
"ce_orig": 0.8692609071731567,
"epoch": 0.00805176132278936,
"kl_loss": 3907.55126953125,
"loss_ib": 195.4084014892578,
"step": 56
},
{
"ce_ib": 62.87982940673828,
"ce_orig": 1.1107903718948364,
"epoch": 0.008195542774982027,
"kl_loss": 3505.02880859375,
"loss_ib": 175.2828826904297,
"step": 57
},
{
"ce_ib": 61.1724853515625,
"ce_orig": 0.7401551008224487,
"epoch": 0.008339324227174694,
"kl_loss": 3798.70947265625,
"loss_ib": 189.966064453125,
"step": 58
},
{
"ce_ib": 64.25030517578125,
"ce_orig": 1.37394380569458,
"epoch": 0.008483105679367362,
"kl_loss": 3642.968994140625,
"loss_ib": 182.1805877685547,
"step": 59
},
{
"epoch": 0.008626887131560028,
"grad_norm": 2610.953369140625,
"learning_rate": 3.993610223642173e-06,
"loss": 181.3146,
"step": 60
},
{
"ce_ib": 61.306488037109375,
"ce_orig": 0.8149375915527344,
"epoch": 0.008626887131560028,
"kl_loss": 3716.3310546875,
"loss_ib": 185.8472137451172,
"step": 60
},
{
"ce_ib": 63.48124313354492,
"ce_orig": 1.404058575630188,
"epoch": 0.008770668583752696,
"kl_loss": 3322.540283203125,
"loss_ib": 166.15875244140625,
"step": 61
},
{
"ce_ib": 63.29791259765625,
"ce_orig": 1.1193113327026367,
"epoch": 0.008914450035945364,
"kl_loss": 3468.67529296875,
"loss_ib": 173.46542358398438,
"step": 62
},
{
"ce_ib": 62.13097381591797,
"ce_orig": 1.177595615386963,
"epoch": 0.00905823148813803,
"kl_loss": 3650.87646484375,
"loss_ib": 182.57489013671875,
"step": 63
},
{
"ce_ib": 60.98870849609375,
"ce_orig": 0.9005176424980164,
"epoch": 0.009202012940330698,
"kl_loss": 3419.352783203125,
"loss_ib": 170.99813842773438,
"step": 64
},
{
"epoch": 0.009345794392523364,
"grad_norm": 2418.697998046875,
"learning_rate": 4.39297124600639e-06,
"loss": 176.6997,
"step": 65
},
{
"ce_ib": 60.742515563964844,
"ce_orig": 0.8558191657066345,
"epoch": 0.009345794392523364,
"kl_loss": 3522.680908203125,
"loss_ib": 176.16441345214844,
"step": 65
},
{
"ce_ib": 61.35293960571289,
"ce_orig": 0.6822745203971863,
"epoch": 0.009489575844716032,
"kl_loss": 3174.692138671875,
"loss_ib": 158.76528930664062,
"step": 66
},
{
"ce_ib": 60.16307067871094,
"ce_orig": 0.6927408576011658,
"epoch": 0.0096333572969087,
"kl_loss": 3434.4345703125,
"loss_ib": 171.75181579589844,
"step": 67
},
{
"ce_ib": 60.551544189453125,
"ce_orig": 0.7352694272994995,
"epoch": 0.009777138749101365,
"kl_loss": 3096.6083984375,
"loss_ib": 154.86070251464844,
"step": 68
},
{
"ce_ib": 61.049774169921875,
"ce_orig": 0.8262593150138855,
"epoch": 0.009920920201294033,
"kl_loss": 3612.48095703125,
"loss_ib": 180.6545867919922,
"step": 69
},
{
"epoch": 0.010064701653486701,
"grad_norm": 2490.9189453125,
"learning_rate": 4.792332268370607e-06,
"loss": 168.4689,
"step": 70
},
{
"ce_ib": 59.70656204223633,
"ce_orig": 0.7031822204589844,
"epoch": 0.010064701653486701,
"kl_loss": 3356.534423828125,
"loss_ib": 167.85658264160156,
"step": 70
},
{
"ce_ib": 58.52351760864258,
"ce_orig": 0.8787212371826172,
"epoch": 0.010208483105679367,
"kl_loss": 3540.501953125,
"loss_ib": 177.0543670654297,
"step": 71
},
{
"ce_ib": 60.51292419433594,
"ce_orig": 0.872455894947052,
"epoch": 0.010352264557872035,
"kl_loss": 3272.396240234375,
"loss_ib": 163.6500701904297,
"step": 72
},
{
"ce_ib": 59.571720123291016,
"ce_orig": 0.9494105577468872,
"epoch": 0.010496046010064701,
"kl_loss": 3453.132568359375,
"loss_ib": 172.68641662597656,
"step": 73
},
{
"ce_ib": 60.829315185546875,
"ce_orig": 0.9209774136543274,
"epoch": 0.010639827462257369,
"kl_loss": 3232.62890625,
"loss_ib": 161.661865234375,
"step": 74
},
{
"epoch": 0.010783608914450037,
"grad_norm": 2442.193359375,
"learning_rate": 5.191693290734825e-06,
"loss": 169.7118,
"step": 75
},
{
"ce_ib": 58.91569519042969,
"ce_orig": 0.6644178628921509,
"epoch": 0.010783608914450037,
"kl_loss": 3597.986572265625,
"loss_ib": 179.9287872314453,
"step": 75
},
{
"ce_ib": 59.93379592895508,
"ce_orig": 0.6643899083137512,
"epoch": 0.010927390366642703,
"kl_loss": 3446.6064453125,
"loss_ib": 172.36029052734375,
"step": 76
},
{
"ce_ib": 61.500518798828125,
"ce_orig": 1.1189907789230347,
"epoch": 0.01107117181883537,
"kl_loss": 3529.22216796875,
"loss_ib": 176.4918670654297,
"step": 77
},
{
"ce_ib": 59.83492660522461,
"ce_orig": 0.6313321590423584,
"epoch": 0.011214953271028037,
"kl_loss": 3552.268798828125,
"loss_ib": 177.6433563232422,
"step": 78
},
{
"ce_ib": 59.81180953979492,
"ce_orig": 1.0850353240966797,
"epoch": 0.011358734723220704,
"kl_loss": 3466.029296875,
"loss_ib": 173.3313751220703,
"step": 79
},
{
"epoch": 0.011502516175413372,
"grad_norm": 2574.475341796875,
"learning_rate": 5.591054313099041e-06,
"loss": 170.7064,
"step": 80
},
{
"ce_ib": 62.353084564208984,
"ce_orig": 1.3420498371124268,
"epoch": 0.011502516175413372,
"kl_loss": 3235.225341796875,
"loss_ib": 161.79244995117188,
"step": 80
},
{
"ce_ib": 60.79109573364258,
"ce_orig": 1.3365955352783203,
"epoch": 0.011646297627606038,
"kl_loss": 3334.964599609375,
"loss_ib": 166.77862548828125,
"step": 81
},
{
"ce_ib": 60.66354751586914,
"ce_orig": 0.7855740785598755,
"epoch": 0.011790079079798706,
"kl_loss": 3520.52783203125,
"loss_ib": 176.05673217773438,
"step": 82
},
{
"ce_ib": 59.765869140625,
"ce_orig": 0.8332124352455139,
"epoch": 0.011933860531991374,
"kl_loss": 3197.080078125,
"loss_ib": 159.88389587402344,
"step": 83
},
{
"ce_ib": 60.493858337402344,
"ce_orig": 1.5375986099243164,
"epoch": 0.01207764198418404,
"kl_loss": 3220.095703125,
"loss_ib": 161.0350341796875,
"step": 84
},
{
"epoch": 0.012221423436376708,
"grad_norm": 2434.75390625,
"learning_rate": 5.990415335463259e-06,
"loss": 165.5288,
"step": 85
},
{
"ce_ib": 57.47407150268555,
"ce_orig": 0.8656359910964966,
"epoch": 0.012221423436376708,
"kl_loss": 3253.564453125,
"loss_ib": 162.7069549560547,
"step": 85
},
{
"ce_ib": 59.44199752807617,
"ce_orig": 0.8775683045387268,
"epoch": 0.012365204888569374,
"kl_loss": 3335.959716796875,
"loss_ib": 166.8277130126953,
"step": 86
},
{
"ce_ib": 59.450687408447266,
"ce_orig": 0.7520811557769775,
"epoch": 0.012508986340762042,
"kl_loss": 3219.025634765625,
"loss_ib": 160.9810028076172,
"step": 87
},
{
"ce_ib": 58.30461120605469,
"ce_orig": 0.9134323000907898,
"epoch": 0.01265276779295471,
"kl_loss": 2271.015625,
"loss_ib": 113.5799331665039,
"step": 88
},
{
"ce_ib": 58.684146881103516,
"ce_orig": 1.100233793258667,
"epoch": 0.012796549245147375,
"kl_loss": 3356.873779296875,
"loss_ib": 167.87303161621094,
"step": 89
},
{
"epoch": 0.012940330697340043,
"grad_norm": 2395.843994140625,
"learning_rate": 6.389776357827476e-06,
"loss": 153.9006,
"step": 90
},
{
"ce_ib": 57.74800491333008,
"ce_orig": 1.038017988204956,
"epoch": 0.012940330697340043,
"kl_loss": 3335.373046875,
"loss_ib": 166.7975311279297,
"step": 90
},
{
"ce_ib": 57.7064323425293,
"ce_orig": 0.921914279460907,
"epoch": 0.013084112149532711,
"kl_loss": 3284.214111328125,
"loss_ib": 164.23956298828125,
"step": 91
},
{
"ce_ib": 57.82907485961914,
"ce_orig": 0.9391928911209106,
"epoch": 0.013227893601725377,
"kl_loss": 3100.417724609375,
"loss_ib": 155.0498046875,
"step": 92
},
{
"ce_ib": 58.58163833618164,
"ce_orig": 1.0686829090118408,
"epoch": 0.013371675053918045,
"kl_loss": 3164.060546875,
"loss_ib": 158.23233032226562,
"step": 93
},
{
"ce_ib": 56.14845657348633,
"ce_orig": 0.4495549201965332,
"epoch": 0.013515456506110711,
"kl_loss": 2954.31982421875,
"loss_ib": 147.7440643310547,
"step": 94
},
{
"epoch": 0.013659237958303379,
"grad_norm": 2321.239013671875,
"learning_rate": 6.789137380191693e-06,
"loss": 156.8229,
"step": 95
},
{
"ce_ib": 57.893184661865234,
"ce_orig": 1.1613727807998657,
"epoch": 0.013659237958303379,
"kl_loss": 3191.733154296875,
"loss_ib": 159.6156005859375,
"step": 95
},
{
"ce_ib": 58.80827713012695,
"ce_orig": 1.103546380996704,
"epoch": 0.013803019410496047,
"kl_loss": 2989.6494140625,
"loss_ib": 149.5118865966797,
"step": 96
},
{
"ce_ib": 57.372257232666016,
"ce_orig": 0.7489521503448486,
"epoch": 0.013946800862688713,
"kl_loss": 3076.787109375,
"loss_ib": 153.8680419921875,
"step": 97
},
{
"ce_ib": 58.20832061767578,
"ce_orig": 1.0948829650878906,
"epoch": 0.01409058231488138,
"kl_loss": 3079.382080078125,
"loss_ib": 153.9982147216797,
"step": 98
},
{
"ce_ib": 57.56401443481445,
"ce_orig": 1.252577543258667,
"epoch": 0.014234363767074048,
"kl_loss": 2953.885498046875,
"loss_ib": 147.7230682373047,
"step": 99
},
{
"epoch": 0.014378145219266714,
"grad_norm": 2354.333251953125,
"learning_rate": 7.188498402555911e-06,
"loss": 151.9125,
"step": 100
},
{
"ce_ib": 57.552894592285156,
"ce_orig": 1.0784616470336914,
"epoch": 0.014378145219266714,
"kl_loss": 3071.802734375,
"loss_ib": 153.61891174316406,
"step": 100
},
{
"ce_ib": 56.86176681518555,
"ce_orig": 0.6961782574653625,
"epoch": 0.014521926671459382,
"kl_loss": 3063.923095703125,
"loss_ib": 153.22459411621094,
"step": 101
},
{
"ce_ib": 54.33999252319336,
"ce_orig": 0.5625413656234741,
"epoch": 0.014665708123652048,
"kl_loss": 2982.339111328125,
"loss_ib": 149.14413452148438,
"step": 102
},
{
"ce_ib": 55.64839553833008,
"ce_orig": 0.7663992047309875,
"epoch": 0.014809489575844716,
"kl_loss": 3048.7392578125,
"loss_ib": 152.46478271484375,
"step": 103
},
{
"ce_ib": 58.38852310180664,
"ce_orig": 1.1540484428405762,
"epoch": 0.014953271028037384,
"kl_loss": 2707.0400390625,
"loss_ib": 135.38121032714844,
"step": 104
},
{
"epoch": 0.01509705248023005,
"grad_norm": 2202.033447265625,
"learning_rate": 7.5878594249201285e-06,
"loss": 148.0272,
"step": 105
},
{
"ce_ib": 58.33769607543945,
"ce_orig": 1.282652735710144,
"epoch": 0.01509705248023005,
"kl_loss": 2638.0634765625,
"loss_ib": 131.93235778808594,
"step": 105
},
{
"ce_ib": 57.01485061645508,
"ce_orig": 1.1682567596435547,
"epoch": 0.015240833932422718,
"kl_loss": 2910.34326171875,
"loss_ib": 145.54566955566406,
"step": 106
},
{
"ce_ib": 56.478248596191406,
"ce_orig": 1.093648910522461,
"epoch": 0.015384615384615385,
"kl_loss": 2749.603759765625,
"loss_ib": 137.5084228515625,
"step": 107
},
{
"ce_ib": 55.969581604003906,
"ce_orig": 0.8221930861473083,
"epoch": 0.015528396836808052,
"kl_loss": 2742.017822265625,
"loss_ib": 137.12889099121094,
"step": 108
},
{
"ce_ib": 55.39336013793945,
"ce_orig": 1.138152837753296,
"epoch": 0.015672178289000718,
"kl_loss": 2064.01513671875,
"loss_ib": 103.22845458984375,
"step": 109
},
{
"epoch": 0.015815959741193385,
"grad_norm": 1960.8631591796875,
"learning_rate": 7.987220447284345e-06,
"loss": 139.8337,
"step": 110
},
{
"ce_ib": 57.7611198425293,
"ce_orig": 0.8048346042633057,
"epoch": 0.015815959741193385,
"kl_loss": 2746.037353515625,
"loss_ib": 137.33074951171875,
"step": 110
},
{
"ce_ib": 54.72801208496094,
"ce_orig": 0.9340922236442566,
"epoch": 0.015959741193386053,
"kl_loss": 2660.56982421875,
"loss_ib": 133.0558624267578,
"step": 111
},
{
"ce_ib": 56.28373336791992,
"ce_orig": 1.2209872007369995,
"epoch": 0.01610352264557872,
"kl_loss": 2672.66015625,
"loss_ib": 133.66114807128906,
"step": 112
},
{
"ce_ib": 55.83729553222656,
"ce_orig": 1.1345103979110718,
"epoch": 0.01624730409777139,
"kl_loss": 2844.884765625,
"loss_ib": 142.27215576171875,
"step": 113
},
{
"ce_ib": 55.07097625732422,
"ce_orig": 1.2232747077941895,
"epoch": 0.016391085549964053,
"kl_loss": 2053.98974609375,
"loss_ib": 102.7270278930664,
"step": 114
},
{
"epoch": 0.01653486700215672,
"grad_norm": 2072.072998046875,
"learning_rate": 8.386581469648563e-06,
"loss": 135.2724,
"step": 115
},
{
"ce_ib": 56.160423278808594,
"ce_orig": 0.8365716338157654,
"epoch": 0.01653486700215672,
"kl_loss": 2660.091796875,
"loss_ib": 133.0326690673828,
"step": 115
},
{
"ce_ib": 54.47496795654297,
"ce_orig": 1.0809355974197388,
"epoch": 0.01667864845434939,
"kl_loss": 2690.080078125,
"loss_ib": 134.53123474121094,
"step": 116
},
{
"ce_ib": 53.27537155151367,
"ce_orig": 1.1820772886276245,
"epoch": 0.016822429906542057,
"kl_loss": 2644.28173828125,
"loss_ib": 132.24072265625,
"step": 117
},
{
"ce_ib": 54.39961242675781,
"ce_orig": 1.2899582386016846,
"epoch": 0.016966211358734724,
"kl_loss": 2602.2080078125,
"loss_ib": 130.13760375976562,
"step": 118
},
{
"ce_ib": 52.184234619140625,
"ce_orig": 0.8353484869003296,
"epoch": 0.01710999281092739,
"kl_loss": 2481.124267578125,
"loss_ib": 124.0822982788086,
"step": 119
},
{
"epoch": 0.017253774263120056,
"grad_norm": 2074.370361328125,
"learning_rate": 8.78594249201278e-06,
"loss": 132.3383,
"step": 120
},
{
"ce_ib": 53.626792907714844,
"ce_orig": 0.689016580581665,
"epoch": 0.017253774263120056,
"kl_loss": 2638.05810546875,
"loss_ib": 131.9297332763672,
"step": 120
},
{
"ce_ib": 52.17591857910156,
"ce_orig": 0.7896418571472168,
"epoch": 0.017397555715312724,
"kl_loss": 1895.728515625,
"loss_ib": 94.81251525878906,
"step": 121
},
{
"ce_ib": 52.534664154052734,
"ce_orig": 0.7579004764556885,
"epoch": 0.017541337167505392,
"kl_loss": 2552.994873046875,
"loss_ib": 127.67601776123047,
"step": 122
},
{
"ce_ib": 52.31755065917969,
"ce_orig": 0.9803644418716431,
"epoch": 0.01768511861969806,
"kl_loss": 2607.78662109375,
"loss_ib": 130.41549682617188,
"step": 123
},
{
"ce_ib": 53.674556732177734,
"ce_orig": 0.8895677328109741,
"epoch": 0.017828900071890728,
"kl_loss": 2425.487548828125,
"loss_ib": 121.30122375488281,
"step": 124
},
{
"epoch": 0.017972681524083392,
"grad_norm": 1952.8394775390625,
"learning_rate": 9.185303514376996e-06,
"loss": 122.1088,
"step": 125
},
{
"ce_ib": 53.564842224121094,
"ce_orig": 1.256234049797058,
"epoch": 0.017972681524083392,
"kl_loss": 2441.9111328125,
"loss_ib": 122.1223373413086,
"step": 125
},
{
"ce_ib": 52.68410110473633,
"ce_orig": 1.2045953273773193,
"epoch": 0.01811646297627606,
"kl_loss": 2293.639404296875,
"loss_ib": 114.70831298828125,
"step": 126
},
{
"ce_ib": 54.133914947509766,
"ce_orig": 1.514359712600708,
"epoch": 0.018260244428468728,
"kl_loss": 2311.011962890625,
"loss_ib": 115.5776596069336,
"step": 127
},
{
"ce_ib": 52.44422149658203,
"ce_orig": 0.9751385450363159,
"epoch": 0.018404025880661395,
"kl_loss": 2449.00048828125,
"loss_ib": 122.47624969482422,
"step": 128
},
{
"ce_ib": 50.06406021118164,
"ce_orig": 0.7257946729660034,
"epoch": 0.018547807332854063,
"kl_loss": 2270.14794921875,
"loss_ib": 113.53243255615234,
"step": 129
},
{
"epoch": 0.018691588785046728,
"grad_norm": 1946.055419921875,
"learning_rate": 9.584664536741214e-06,
"loss": 119.3412,
"step": 130
},
{
"ce_ib": 53.00954055786133,
"ce_orig": 1.0300298929214478,
"epoch": 0.018691588785046728,
"kl_loss": 2245.056640625,
"loss_ib": 112.27933502197266,
"step": 130
},
{
"ce_ib": 50.96159362792969,
"ce_orig": 0.8871611952781677,
"epoch": 0.018835370237239395,
"kl_loss": 2287.427734375,
"loss_ib": 114.39686584472656,
"step": 131
},
{
"ce_ib": 52.4853515625,
"ce_orig": 1.039488673210144,
"epoch": 0.018979151689432063,
"kl_loss": 2313.669921875,
"loss_ib": 115.7097396850586,
"step": 132
},
{
"ce_ib": 51.21004104614258,
"ce_orig": 0.845194935798645,
"epoch": 0.01912293314162473,
"kl_loss": 2127.791259765625,
"loss_ib": 106.41517639160156,
"step": 133
},
{
"ce_ib": 52.13531494140625,
"ce_orig": 1.1372461318969727,
"epoch": 0.0192667145938174,
"kl_loss": 2183.52783203125,
"loss_ib": 109.20245361328125,
"step": 134
},
{
"epoch": 0.019410496046010063,
"grad_norm": 1868.0174560546875,
"learning_rate": 9.984025559105432e-06,
"loss": 111.8021,
"step": 135
},
{
"ce_ib": 50.40250015258789,
"ce_orig": 0.5799722075462341,
"epoch": 0.019410496046010063,
"kl_loss": 2218.98046875,
"loss_ib": 110.9742202758789,
"step": 135
},
{
"ce_ib": 49.68068313598633,
"ce_orig": 0.9591949582099915,
"epoch": 0.01955427749820273,
"kl_loss": 2150.03564453125,
"loss_ib": 107.52662658691406,
"step": 136
},
{
"ce_ib": 49.44940185546875,
"ce_orig": 0.8390080332756042,
"epoch": 0.0196980589503954,
"kl_loss": 1845.8321533203125,
"loss_ib": 92.31633758544922,
"step": 137
},
{
"ce_ib": 52.21055221557617,
"ce_orig": 1.4367856979370117,
"epoch": 0.019841840402588067,
"kl_loss": 2051.351806640625,
"loss_ib": 102.59370422363281,
"step": 138
},
{
"ce_ib": 50.15422058105469,
"ce_orig": 1.2788512706756592,
"epoch": 0.019985621854780734,
"kl_loss": 2027.796875,
"loss_ib": 101.41492462158203,
"step": 139
},
{
"epoch": 0.020129403306973402,
"grad_norm": 1761.1328125,
"learning_rate": 1.038338658146965e-05,
"loss": 102.0922,
"step": 140
},
{
"ce_ib": 50.65812301635742,
"ce_orig": 1.2148778438568115,
"epoch": 0.020129403306973402,
"kl_loss": 1903.0653076171875,
"loss_ib": 95.17859649658203,
"step": 140
},
{
"ce_ib": 49.68356704711914,
"ce_orig": 1.0471272468566895,
"epoch": 0.020273184759166066,
"kl_loss": 1945.1756591796875,
"loss_ib": 97.28362274169922,
"step": 141
},
{
"ce_ib": 47.467594146728516,
"ce_orig": 0.6498449444770813,
"epoch": 0.020416966211358734,
"kl_loss": 1910.508544921875,
"loss_ib": 95.54916381835938,
"step": 142
},
{
"ce_ib": 47.57759094238281,
"ce_orig": 0.8552805781364441,
"epoch": 0.020560747663551402,
"kl_loss": 1907.689697265625,
"loss_ib": 95.40827941894531,
"step": 143
},
{
"ce_ib": 50.04168701171875,
"ce_orig": 1.1924092769622803,
"epoch": 0.02070452911574407,
"kl_loss": 1690.5194091796875,
"loss_ib": 84.55098724365234,
"step": 144
},
{
"epoch": 0.020848310567936738,
"grad_norm": 1661.7161865234375,
"learning_rate": 1.0782747603833867e-05,
"loss": 95.0661,
"step": 145
},
{
"ce_ib": 48.34769821166992,
"ce_orig": 1.0676274299621582,
"epoch": 0.020848310567936738,
"kl_loss": 1760.777099609375,
"loss_ib": 88.06303405761719,
"step": 145
},
{
"ce_ib": 47.0106201171875,
"ce_orig": 1.2353283166885376,
"epoch": 0.020992092020129402,
"kl_loss": 1798.0538330078125,
"loss_ib": 89.92620086669922,
"step": 146
},
{
"ce_ib": 48.50465393066406,
"ce_orig": 1.0733487606048584,
"epoch": 0.02113587347232207,
"kl_loss": 1734.2774658203125,
"loss_ib": 86.73812866210938,
"step": 147
},
{
"ce_ib": 45.04286193847656,
"ce_orig": 0.8353049159049988,
"epoch": 0.021279654924514738,
"kl_loss": 1773.991943359375,
"loss_ib": 88.72212219238281,
"step": 148
},
{
"ce_ib": 47.320526123046875,
"ce_orig": 1.219022512435913,
"epoch": 0.021423436376707405,
"kl_loss": 1750.5101318359375,
"loss_ib": 87.54916381835938,
"step": 149
},
{
"epoch": 0.021567217828900073,
"grad_norm": 1564.1060791015625,
"learning_rate": 1.1182108626198083e-05,
"loss": 87.3973,
"step": 150
},
{
"ce_ib": 47.08059310913086,
"ce_orig": 0.8302248120307922,
"epoch": 0.021567217828900073,
"kl_loss": 1589.67822265625,
"loss_ib": 79.50745391845703,
"step": 150
},
{
"ce_ib": 47.74165725708008,
"ce_orig": 0.7171430587768555,
"epoch": 0.021710999281092738,
"kl_loss": 1636.097412109375,
"loss_ib": 81.82874298095703,
"step": 151
},
{
"ce_ib": 46.43238830566406,
"ce_orig": 0.7868078947067261,
"epoch": 0.021854780733285405,
"kl_loss": 1594.2950439453125,
"loss_ib": 79.73796844482422,
"step": 152
},
{
"ce_ib": 44.30559539794922,
"ce_orig": 1.0339301824569702,
"epoch": 0.021998562185478073,
"kl_loss": 1573.078857421875,
"loss_ib": 78.67610168457031,
"step": 153
},
{
"ce_ib": 44.82426834106445,
"ce_orig": 1.0889431238174438,
"epoch": 0.02214234363767074,
"kl_loss": 1503.557861328125,
"loss_ib": 75.20030975341797,
"step": 154
},
{
"epoch": 0.02228612508986341,
"grad_norm": 1438.786376953125,
"learning_rate": 1.1581469648562302e-05,
"loss": 77.0769,
"step": 155
},
{
"ce_ib": 46.833580017089844,
"ce_orig": 1.3186697959899902,
"epoch": 0.02228612508986341,
"kl_loss": 1401.23486328125,
"loss_ib": 70.08516693115234,
"step": 155
},
{
"ce_ib": 46.787254333496094,
"ce_orig": 0.9530097246170044,
"epoch": 0.022429906542056073,
"kl_loss": 1483.8673095703125,
"loss_ib": 74.21675872802734,
"step": 156
},
{
"ce_ib": 46.81201171875,
"ce_orig": 1.1565759181976318,
"epoch": 0.02257368799424874,
"kl_loss": 1341.5926513671875,
"loss_ib": 67.10304260253906,
"step": 157
},
{
"ce_ib": 45.95816421508789,
"ce_orig": 1.2070651054382324,
"epoch": 0.02271746944644141,
"kl_loss": 1238.02880859375,
"loss_ib": 61.92441940307617,
"step": 158
},
{
"ce_ib": 43.655330657958984,
"ce_orig": 0.7560437321662903,
"epoch": 0.022861250898634077,
"kl_loss": 1213.171875,
"loss_ib": 60.680419921875,
"step": 159
},
{
"epoch": 0.023005032350826744,
"grad_norm": 1232.082763671875,
"learning_rate": 1.1980830670926518e-05,
"loss": 66.2022,
"step": 160
},
{
"ce_ib": 46.1483268737793,
"ce_orig": 1.2238253355026245,
"epoch": 0.023005032350826744,
"kl_loss": 1228.1053466796875,
"loss_ib": 61.428340911865234,
"step": 160
},
{
"ce_ib": 47.45783996582031,
"ce_orig": 1.587011694908142,
"epoch": 0.023148813803019412,
"kl_loss": 1157.474365234375,
"loss_ib": 57.8974494934082,
"step": 161
},
{
"ce_ib": 45.567657470703125,
"ce_orig": 1.3450465202331543,
"epoch": 0.023292595255212076,
"kl_loss": 1117.0738525390625,
"loss_ib": 55.8764762878418,
"step": 162
},
{
"ce_ib": 44.280250549316406,
"ce_orig": 1.2833943367004395,
"epoch": 0.023436376707404744,
"kl_loss": 1064.94677734375,
"loss_ib": 53.26947784423828,
"step": 163
},
{
"ce_ib": 43.552276611328125,
"ce_orig": 0.7282137870788574,
"epoch": 0.023580158159597412,
"kl_loss": 1048.7855224609375,
"loss_ib": 52.461055755615234,
"step": 164
},
{
"epoch": 0.02372393961179008,
"grad_norm": 1095.414794921875,
"learning_rate": 1.2380191693290735e-05,
"loss": 56.1672,
"step": 165
},
{
"ce_ib": 43.77597427368164,
"ce_orig": 1.015390157699585,
"epoch": 0.02372393961179008,
"kl_loss": 996.73828125,
"loss_ib": 49.858802795410156,
"step": 165
},
{
"ce_ib": 43.67955017089844,
"ce_orig": 1.0842498540878296,
"epoch": 0.023867721063982748,
"kl_loss": 947.7979736328125,
"loss_ib": 47.411739349365234,
"step": 166
},
{
"ce_ib": 42.23929214477539,
"ce_orig": 0.9063572287559509,
"epoch": 0.024011502516175412,
"kl_loss": 897.7328491210938,
"loss_ib": 44.90776062011719,
"step": 167
},
{
"ce_ib": 45.15175247192383,
"ce_orig": 1.2839014530181885,
"epoch": 0.02415528396836808,
"kl_loss": 786.1021728515625,
"loss_ib": 39.32768630981445,
"step": 168
},
{
"ce_ib": 43.47494125366211,
"ce_orig": 0.9718037247657776,
"epoch": 0.024299065420560748,
"kl_loss": 860.2787475585938,
"loss_ib": 43.035675048828125,
"step": 169
},
{
"epoch": 0.024442846872753415,
"grad_norm": 1042.965087890625,
"learning_rate": 1.2779552715654951e-05,
"loss": 47.1098,
"step": 170
},
{
"ce_ib": 43.44833755493164,
"ce_orig": 1.3617149591445923,
"epoch": 0.024442846872753415,
"kl_loss": 803.04443359375,
"loss_ib": 40.173946380615234,
"step": 170
},
{
"ce_ib": 43.578792572021484,
"ce_orig": 0.9456126093864441,
"epoch": 0.024586628324946083,
"kl_loss": 699.483642578125,
"loss_ib": 34.9959716796875,
"step": 171
},
{
"ce_ib": 47.23983383178711,
"ce_orig": 1.8550169467926025,
"epoch": 0.024730409777138748,
"kl_loss": 675.0830688476562,
"loss_ib": 33.777774810791016,
"step": 172
},
{
"ce_ib": 43.85140609741211,
"ce_orig": 1.138585090637207,
"epoch": 0.024874191229331415,
"kl_loss": 659.050537109375,
"loss_ib": 32.97445297241211,
"step": 173
},
{
"ce_ib": 41.722373962402344,
"ce_orig": 1.243048906326294,
"epoch": 0.025017972681524083,
"kl_loss": 590.1033325195312,
"loss_ib": 29.526029586791992,
"step": 174
},
{
"epoch": 0.02516175413371675,
"grad_norm": 766.4675903320312,
"learning_rate": 1.3178913738019169e-05,
"loss": 35.4625,
"step": 175
},
{
"ce_ib": 44.18523406982422,
"ce_orig": 1.255010962486267,
"epoch": 0.02516175413371675,
"kl_loss": 592.2353515625,
"loss_ib": 29.633859634399414,
"step": 175
},
{
"ce_ib": 46.14919662475586,
"ce_orig": 0.9867228865623474,
"epoch": 0.02530553558590942,
"kl_loss": 509.3572692871094,
"loss_ib": 25.490938186645508,
"step": 176
},
{
"ce_ib": 46.58631134033203,
"ce_orig": 1.1491925716400146,
"epoch": 0.025449317038102087,
"kl_loss": 459.72491455078125,
"loss_ib": 23.009538650512695,
"step": 177
},
{
"ce_ib": 40.569705963134766,
"ce_orig": 0.7460018396377563,
"epoch": 0.02559309849029475,
"kl_loss": 352.4381103515625,
"loss_ib": 17.64219093322754,
"step": 178
},
{
"ce_ib": 46.841224670410156,
"ce_orig": 1.1470024585723877,
"epoch": 0.02573687994248742,
"kl_loss": 389.03765869140625,
"loss_ib": 19.475303649902344,
"step": 179
},
{
"epoch": 0.025880661394680086,
"grad_norm": 621.7545776367188,
"learning_rate": 1.3578274760383387e-05,
"loss": 24.304,
"step": 180
},
{
"ce_ib": 47.13188552856445,
"ce_orig": 1.330838680267334,
"epoch": 0.025880661394680086,
"kl_loss": 379.19989013671875,
"loss_ib": 18.98356056213379,
"step": 180
},
{
"ce_ib": 52.806861877441406,
"ce_orig": 2.114457130432129,
"epoch": 0.026024442846872754,
"kl_loss": 284.24114990234375,
"loss_ib": 14.2384614944458,
"step": 181
},
{
"ce_ib": 64.3398208618164,
"ce_orig": 0.9137269854545593,
"epoch": 0.026168224299065422,
"kl_loss": 240.51211547851562,
"loss_ib": 12.05777645111084,
"step": 182
},
{
"ce_ib": 62.500736236572266,
"ce_orig": 1.4484410285949707,
"epoch": 0.026312005751258086,
"kl_loss": 223.3756103515625,
"loss_ib": 11.200030326843262,
"step": 183
},
{
"ce_ib": 70.71409606933594,
"ce_orig": 1.7095311880111694,
"epoch": 0.026455787203450754,
"kl_loss": 207.35211181640625,
"loss_ib": 10.402962684631348,
"step": 184
},
{
"epoch": 0.026599568655643422,
"grad_norm": 365.3516845703125,
"learning_rate": 1.3977635782747606e-05,
"loss": 14.3906,
"step": 185
},
{
"ce_ib": 63.17552947998047,
"ce_orig": 2.531080484390259,
"epoch": 0.026599568655643422,
"kl_loss": 165.71923828125,
"loss_ib": 8.317549705505371,
"step": 185
},
{
"ce_ib": 84.16175842285156,
"ce_orig": 2.0548126697540283,
"epoch": 0.02674335010783609,
"kl_loss": 140.68856811523438,
"loss_ib": 7.07650899887085,
"step": 186
},
{
"ce_ib": 77.55998992919922,
"ce_orig": 1.809746503829956,
"epoch": 0.026887131560028758,
"kl_loss": 121.14314270019531,
"loss_ib": 6.095937252044678,
"step": 187
},
{
"ce_ib": 77.81259155273438,
"ce_orig": 1.9566165208816528,
"epoch": 0.027030913012221422,
"kl_loss": 101.40348815917969,
"loss_ib": 5.109080791473389,
"step": 188
},
{
"ce_ib": 69.3860092163086,
"ce_orig": 1.703246831893921,
"epoch": 0.02717469446441409,
"kl_loss": 93.15800476074219,
"loss_ib": 4.692593574523926,
"step": 189
},
{
"epoch": 0.027318475916606758,
"grad_norm": 189.81011962890625,
"learning_rate": 1.4376996805111822e-05,
"loss": 8.0904,
"step": 190
},
{
"ce_ib": 60.081260681152344,
"ce_orig": 1.8992141485214233,
"epoch": 0.027318475916606758,
"kl_loss": 100.07512664794922,
"loss_ib": 5.033797264099121,
"step": 190
},
{
"ce_ib": 58.84217834472656,
"ce_orig": 1.018802285194397,
"epoch": 0.027462257368799425,
"kl_loss": 72.03784942626953,
"loss_ib": 3.6313138008117676,
"step": 191
},
{
"ce_ib": 58.045005798339844,
"ce_orig": 1.0986676216125488,
"epoch": 0.027606038820992093,
"kl_loss": 78.45074462890625,
"loss_ib": 3.9515597820281982,
"step": 192
},
{
"ce_ib": 53.310672760009766,
"ce_orig": 1.2137010097503662,
"epoch": 0.027749820273184757,
"kl_loss": 56.91468811035156,
"loss_ib": 2.872389793395996,
"step": 193
},
{
"ce_ib": 47.84365463256836,
"ce_orig": 1.141069769859314,
"epoch": 0.027893601725377425,
"kl_loss": 49.71630096435547,
"loss_ib": 2.5097367763519287,
"step": 194
},
{
"epoch": 0.028037383177570093,
"grad_norm": 102.58991241455078,
"learning_rate": 1.477635782747604e-05,
"loss": 4.9077,
"step": 195
},
{
"ce_ib": 47.62435531616211,
"ce_orig": 1.2880587577819824,
"epoch": 0.028037383177570093,
"kl_loss": 51.41084289550781,
"loss_ib": 2.5943543910980225,
"step": 195
},
{
"ce_ib": 45.36109161376953,
"ce_orig": 0.8635546565055847,
"epoch": 0.02818116462976276,
"kl_loss": 44.00239181518555,
"loss_ib": 2.2228000164031982,
"step": 196
},
{
"ce_ib": 41.63899230957031,
"ce_orig": 1.1706253290176392,
"epoch": 0.02832494608195543,
"kl_loss": 40.91154479980469,
"loss_ib": 2.066396713256836,
"step": 197
},
{
"ce_ib": 40.076194763183594,
"ce_orig": 0.9194042682647705,
"epoch": 0.028468727534148097,
"kl_loss": 38.050926208496094,
"loss_ib": 1.9225844144821167,
"step": 198
},
{
"ce_ib": 35.384429931640625,
"ce_orig": 0.8929119110107422,
"epoch": 0.02861250898634076,
"kl_loss": 40.8455924987793,
"loss_ib": 2.059971809387207,
"step": 199
},
{
"epoch": 0.02875629043853343,
"grad_norm": 53.01378631591797,
"learning_rate": 1.5175718849840257e-05,
"loss": 3.3711,
"step": 200
},
{
"ce_ib": 37.04123306274414,
"ce_orig": 0.9671800136566162,
"epoch": 0.02875629043853343,
"kl_loss": 35.982879638671875,
"loss_ib": 1.8176645040512085,
"step": 200
},
{
"ce_ib": 37.61642837524414,
"ce_orig": 1.3750962018966675,
"epoch": 0.028900071890726096,
"kl_loss": 30.0595703125,
"loss_ib": 1.5217866897583008,
"step": 201
},
{
"ce_ib": 40.238006591796875,
"ce_orig": 1.7766847610473633,
"epoch": 0.029043853342918764,
"kl_loss": 26.227649688720703,
"loss_ib": 1.3315014839172363,
"step": 202
},
{
"ce_ib": 38.055755615234375,
"ce_orig": 1.213000774383545,
"epoch": 0.029187634795111432,
"kl_loss": 29.874595642089844,
"loss_ib": 1.512757658958435,
"step": 203
},
{
"ce_ib": 32.915287017822266,
"ce_orig": 0.86496502161026,
"epoch": 0.029331416247304096,
"kl_loss": 30.330623626708984,
"loss_ib": 1.5329889059066772,
"step": 204
},
{
"epoch": 0.029475197699496764,
"grad_norm": 23.908042907714844,
"learning_rate": 1.5575079872204475e-05,
"loss": 2.7418,
"step": 205
},
{
"ce_ib": 35.01918411254883,
"ce_orig": 1.2721844911575317,
"epoch": 0.029475197699496764,
"kl_loss": 26.31290054321289,
"loss_ib": 1.333154559135437,
"step": 205
},
{
"ce_ib": 32.87287139892578,
"ce_orig": 1.1400614976882935,
"epoch": 0.029618979151689432,
"kl_loss": 26.177352905273438,
"loss_ib": 1.3253041505813599,
"step": 206
},
{
"ce_ib": 35.85771942138672,
"ce_orig": 1.3444130420684814,
"epoch": 0.0297627606038821,
"kl_loss": 34.98625183105469,
"loss_ib": 1.7672414779663086,
"step": 207
},
{
"ce_ib": 31.547334671020508,
"ce_orig": 0.9652504920959473,
"epoch": 0.029906542056074768,
"kl_loss": 20.096208572387695,
"loss_ib": 1.0205841064453125,
"step": 208
},
{
"ce_ib": 32.527896881103516,
"ce_orig": 1.0664165019989014,
"epoch": 0.030050323508267432,
"kl_loss": 20.749666213989258,
"loss_ib": 1.053747296333313,
"step": 209
},
{
"epoch": 0.0301941049604601,
"grad_norm": 22.29879379272461,
"learning_rate": 1.597444089456869e-05,
"loss": 2.3639,
"step": 210
},
{
"ce_ib": 32.71118927001953,
"ce_orig": 1.2022026777267456,
"epoch": 0.0301941049604601,
"kl_loss": 20.01553726196289,
"loss_ib": 1.0171325206756592,
"step": 210
},
{
"ce_ib": 30.730520248413086,
"ce_orig": 0.8476402163505554,
"epoch": 0.030337886412652768,
"kl_loss": 17.856157302856445,
"loss_ib": 0.9081730842590332,
"step": 211
},
{
"ce_ib": 29.36618995666504,
"ce_orig": 0.7845667004585266,
"epoch": 0.030481667864845435,
"kl_loss": 18.906429290771484,
"loss_ib": 0.9600045084953308,
"step": 212
},
{
"ce_ib": 27.0706844329834,
"ce_orig": 0.3370642066001892,
"epoch": 0.030625449317038103,
"kl_loss": 18.621246337890625,
"loss_ib": 0.9445976614952087,
"step": 213
},
{
"ce_ib": 34.2156867980957,
"ce_orig": 0.6062073111534119,
"epoch": 0.03076923076923077,
"kl_loss": 19.93109893798828,
"loss_ib": 1.0136628150939941,
"step": 214
},
{
"epoch": 0.030913012221423435,
"grad_norm": 16.176197052001953,
"learning_rate": 1.6373801916932906e-05,
"loss": 1.8299,
"step": 215
},
{
"ce_ib": 28.275493621826172,
"ce_orig": 1.0786405801773071,
"epoch": 0.030913012221423435,
"kl_loss": 18.11065673828125,
"loss_ib": 0.919670581817627,
"step": 215
},
{
"ce_ib": 32.22303009033203,
"ce_orig": 0.8485822081565857,
"epoch": 0.031056793673616103,
"kl_loss": 19.91845703125,
"loss_ib": 1.012034296989441,
"step": 216
},
{
"ce_ib": 27.742530822753906,
"ce_orig": 0.8794713020324707,
"epoch": 0.03120057512580877,
"kl_loss": 14.383018493652344,
"loss_ib": 0.7330222129821777,
"step": 217
},
{
"ce_ib": 31.794010162353516,
"ce_orig": 0.6954683661460876,
"epoch": 0.031344356578001435,
"kl_loss": 16.9901180267334,
"loss_ib": 0.86540287733078,
"step": 218
},
{
"ce_ib": 31.032941818237305,
"ce_orig": 1.1841130256652832,
"epoch": 0.0314881380301941,
"kl_loss": 13.406427383422852,
"loss_ib": 0.6858378648757935,
"step": 219
},
{
"epoch": 0.03163191948238677,
"grad_norm": 14.094661712646484,
"learning_rate": 1.6773162939297126e-05,
"loss": 1.8049,
"step": 220
},
{
"ce_ib": 32.347557067871094,
"ce_orig": 0.7493203282356262,
"epoch": 0.03163191948238677,
"kl_loss": 18.79709815979004,
"loss_ib": 0.9560286402702332,
"step": 220
},
{
"ce_ib": 31.1737060546875,
"ce_orig": 1.0758991241455078,
"epoch": 0.03177570093457944,
"kl_loss": 20.572391510009766,
"loss_ib": 1.0442065000534058,
"step": 221
},
{
"ce_ib": 30.046796798706055,
"ce_orig": 0.6758080124855042,
"epoch": 0.031919482386772106,
"kl_loss": 13.740577697753906,
"loss_ib": 0.7020522952079773,
"step": 222
},
{
"ce_ib": 30.532987594604492,
"ce_orig": 1.0658819675445557,
"epoch": 0.032063263838964774,
"kl_loss": 11.958425521850586,
"loss_ib": 0.6131877899169922,
"step": 223
},
{
"ce_ib": 31.667829513549805,
"ce_orig": 1.1956491470336914,
"epoch": 0.03220704529115744,
"kl_loss": 12.607803344726562,
"loss_ib": 0.6462240815162659,
"step": 224
},
{
"epoch": 0.03235082674335011,
"grad_norm": 5.280292987823486,
"learning_rate": 1.7172523961661345e-05,
"loss": 1.6789,
"step": 225
},
{
"ce_ib": 31.769432067871094,
"ce_orig": 1.1676932573318481,
"epoch": 0.03235082674335011,
"kl_loss": 10.932788848876953,
"loss_ib": 0.5625241994857788,
"step": 225
},
{
"ce_ib": 30.539987564086914,
"ce_orig": 1.3033503293991089,
"epoch": 0.03249460819554278,
"kl_loss": 12.105158805847168,
"loss_ib": 0.6205279231071472,
"step": 226
},
{
"ce_ib": 28.660764694213867,
"ce_orig": 0.6191550493240356,
"epoch": 0.032638389647735445,
"kl_loss": 9.103292465209961,
"loss_ib": 0.46949502825737,
"step": 227
},
{
"ce_ib": 29.167789459228516,
"ce_orig": 0.7975085973739624,
"epoch": 0.032782171099928106,
"kl_loss": 10.556224822998047,
"loss_ib": 0.5423951148986816,
"step": 228
},
{
"ce_ib": 29.07682991027832,
"ce_orig": 1.0861161947250366,
"epoch": 0.032925952552120774,
"kl_loss": 9.905879974365234,
"loss_ib": 0.5098324418067932,
"step": 229
},
{
"epoch": 0.03306973400431344,
"grad_norm": 5.249032974243164,
"learning_rate": 1.757188498402556e-05,
"loss": 1.5289,
"step": 230
},
{
"ce_ib": 28.152671813964844,
"ce_orig": 0.8374654054641724,
"epoch": 0.03306973400431344,
"kl_loss": 10.608142852783203,
"loss_ib": 0.544483482837677,
"step": 230
},
{
"ce_ib": 26.917634963989258,
"ce_orig": 1.0145775079727173,
"epoch": 0.03321351545650611,
"kl_loss": 12.240139961242676,
"loss_ib": 0.6254658102989197,
"step": 231
},
{
"ce_ib": 29.13026237487793,
"ce_orig": 0.8860113024711609,
"epoch": 0.03335729690869878,
"kl_loss": 10.390130043029785,
"loss_ib": 0.5340716242790222,
"step": 232
},
{
"ce_ib": 27.842693328857422,
"ce_orig": 1.1651474237442017,
"epoch": 0.033501078360891445,
"kl_loss": 9.602649688720703,
"loss_ib": 0.49405384063720703,
"step": 233
},
{
"ce_ib": 24.930410385131836,
"ce_orig": 0.6166008710861206,
"epoch": 0.03364485981308411,
"kl_loss": 10.271495819091797,
"loss_ib": 0.5260400176048279,
"step": 234
},
{
"epoch": 0.03378864126527678,
"grad_norm": 5.078007698059082,
"learning_rate": 1.7971246006389777e-05,
"loss": 1.4736,
"step": 235
},
{
"ce_ib": 27.262588500976562,
"ce_orig": 1.0158146619796753,
"epoch": 0.03378864126527678,
"kl_loss": 10.76059627532959,
"loss_ib": 0.5516611337661743,
"step": 235
},
{
"ce_ib": 28.12372398376465,
"ce_orig": 0.9383306503295898,
"epoch": 0.03393242271746945,
"kl_loss": 9.931174278259277,
"loss_ib": 0.5106205940246582,
"step": 236
},
{
"ce_ib": 27.61408233642578,
"ce_orig": 1.3611609935760498,
"epoch": 0.034076204169662117,
"kl_loss": 11.292994499206543,
"loss_ib": 0.5784568190574646,
"step": 237
},
{
"ce_ib": 25.42107582092285,
"ce_orig": 0.8392754197120667,
"epoch": 0.03421998562185478,
"kl_loss": 10.153192520141602,
"loss_ib": 0.5203701853752136,
"step": 238
},
{
"ce_ib": 26.299903869628906,
"ce_orig": 0.8138580918312073,
"epoch": 0.034363767074047445,
"kl_loss": 10.076581001281738,
"loss_ib": 0.5169789791107178,
"step": 239
},
{
"epoch": 0.03450754852624011,
"grad_norm": 1.6015193462371826,
"learning_rate": 1.8370607028753993e-05,
"loss": 1.371,
"step": 240
},
{
"ce_ib": 25.303178787231445,
"ce_orig": 0.9828527569770813,
"epoch": 0.03450754852624011,
"kl_loss": 9.301372528076172,
"loss_ib": 0.4777202308177948,
"step": 240
},
{
"ce_ib": 17.445037841796875,
"ce_orig": 0.447143018245697,
"epoch": 0.03465132997843278,
"kl_loss": 7.682253837585449,
"loss_ib": 0.39283522963523865,
"step": 241
},
{
"ce_ib": 25.544435501098633,
"ce_orig": 0.531434953212738,
"epoch": 0.03479511143062545,
"kl_loss": 9.930435180664062,
"loss_ib": 0.5092939734458923,
"step": 242
},
{
"ce_ib": 26.459491729736328,
"ce_orig": 1.5432560443878174,
"epoch": 0.034938892882818116,
"kl_loss": 10.38063907623291,
"loss_ib": 0.5322617292404175,
"step": 243
},
{
"ce_ib": 22.71752166748047,
"ce_orig": 0.8145064115524292,
"epoch": 0.035082674335010784,
"kl_loss": 9.922914505004883,
"loss_ib": 0.5075044631958008,
"step": 244
},
{
"epoch": 0.03522645578720345,
"grad_norm": 3.806140184402466,
"learning_rate": 1.8769968051118212e-05,
"loss": 1.4145,
"step": 245
},
{
"ce_ib": 26.927804946899414,
"ce_orig": 0.44644680619239807,
"epoch": 0.03522645578720345,
"kl_loss": 9.268022537231445,
"loss_ib": 0.4768650233745575,
"step": 245
},
{
"ce_ib": 25.422399520874023,
"ce_orig": 1.1885180473327637,
"epoch": 0.03537023723939612,
"kl_loss": 8.077836990356445,
"loss_ib": 0.41660305857658386,
"step": 246
},
{
"ce_ib": 25.16580581665039,
"ce_orig": 0.6401370763778687,
"epoch": 0.03551401869158879,
"kl_loss": 8.176619529724121,
"loss_ib": 0.4214138984680176,
"step": 247
},
{
"ce_ib": 21.501310348510742,
"ce_orig": 0.7412813901901245,
"epoch": 0.035657800143781455,
"kl_loss": 10.056595802307129,
"loss_ib": 0.5135805010795593,
"step": 248
},
{
"ce_ib": 24.469106674194336,
"ce_orig": 0.9428795576095581,
"epoch": 0.035801581595974116,
"kl_loss": 8.948450088500977,
"loss_ib": 0.45965704321861267,
"step": 249
},
{
"epoch": 0.035945363048166784,
"grad_norm": 2.816962957382202,
"learning_rate": 1.9169329073482428e-05,
"loss": 1.397,
"step": 250
},
{
"ce_ib": 22.195499420166016,
"ce_orig": 0.7263768911361694,
"epoch": 0.035945363048166784,
"kl_loss": 8.741401672363281,
"loss_ib": 0.4481678605079651,
"step": 250
},
{
"ce_ib": 22.70380973815918,
"ce_orig": 0.6979135274887085,
"epoch": 0.03608914450035945,
"kl_loss": 9.159427642822266,
"loss_ib": 0.4693233072757721,
"step": 251
},
{
"ce_ib": 24.72095489501953,
"ce_orig": 0.8336602449417114,
"epoch": 0.03623292595255212,
"kl_loss": 8.704825401306152,
"loss_ib": 0.4476017653942108,
"step": 252
},
{
"ce_ib": 25.478559494018555,
"ce_orig": 1.2140023708343506,
"epoch": 0.03637670740474479,
"kl_loss": 8.463350296020508,
"loss_ib": 0.4359067976474762,
"step": 253
},
{
"ce_ib": 24.30400276184082,
"ce_orig": 1.159781813621521,
"epoch": 0.036520488856937455,
"kl_loss": 9.485654830932617,
"loss_ib": 0.4864347577095032,
"step": 254
},
{
"epoch": 0.03666427030913012,
"grad_norm": 2.2414402961730957,
"learning_rate": 1.9568690095846644e-05,
"loss": 1.421,
"step": 255
},
{
"ce_ib": 22.23937225341797,
"ce_orig": 0.5963650941848755,
"epoch": 0.03666427030913012,
"kl_loss": 8.78721809387207,
"loss_ib": 0.4504806101322174,
"step": 255
},
{
"ce_ib": 23.723373413085938,
"ce_orig": 1.1367262601852417,
"epoch": 0.03680805176132279,
"kl_loss": 8.774513244628906,
"loss_ib": 0.45058736205101013,
"step": 256
},
{
"ce_ib": 25.06109619140625,
"ce_orig": 1.4180657863616943,
"epoch": 0.03695183321351546,
"kl_loss": 8.770709991455078,
"loss_ib": 0.4510660767555237,
"step": 257
},
{
"ce_ib": 23.2429141998291,
"ce_orig": 1.0652941465377808,
"epoch": 0.037095614665708126,
"kl_loss": 8.671271324157715,
"loss_ib": 0.4451850354671478,
"step": 258
},
{
"ce_ib": 21.69077491760254,
"ce_orig": 0.3725070655345917,
"epoch": 0.03723939611790079,
"kl_loss": 8.793878555297852,
"loss_ib": 0.45053932070732117,
"step": 259
},
{
"epoch": 0.037383177570093455,
"grad_norm": 3.798487663269043,
"learning_rate": 1.9968051118210863e-05,
"loss": 1.4654,
"step": 260
},
{
"ce_ib": 23.512039184570312,
"ce_orig": 1.0731854438781738,
"epoch": 0.037383177570093455,
"kl_loss": 8.859106063842773,
"loss_ib": 0.45471134781837463,
"step": 260
},
{
"ce_ib": 22.52284049987793,
"ce_orig": 1.0415581464767456,
"epoch": 0.03752695902228612,
"kl_loss": 8.765377044677734,
"loss_ib": 0.4495302736759186,
"step": 261
},
{
"ce_ib": 24.735340118408203,
"ce_orig": 1.2017862796783447,
"epoch": 0.03767074047447879,
"kl_loss": 8.242500305175781,
"loss_ib": 0.4244926869869232,
"step": 262
},
{
"ce_ib": 21.546173095703125,
"ce_orig": 1.224229097366333,
"epoch": 0.03781452192667146,
"kl_loss": 8.425148963928223,
"loss_ib": 0.4320305287837982,
"step": 263
},
{
"ce_ib": 21.48952293395996,
"ce_orig": 0.8686205148696899,
"epoch": 0.037958303378864126,
"kl_loss": 8.037482261657715,
"loss_ib": 0.41261887550354004,
"step": 264
},
{
"epoch": 0.038102084831056794,
"grad_norm": 3.353695869445801,
"learning_rate": 2.0367412140575082e-05,
"loss": 1.396,
"step": 265
},
{
"ce_ib": 21.23736000061035,
"ce_orig": 1.0968440771102905,
"epoch": 0.038102084831056794,
"kl_loss": 8.250904083251953,
"loss_ib": 0.42316389083862305,
"step": 265
},
{
"ce_ib": 23.215932846069336,
"ce_orig": 1.1477916240692139,
"epoch": 0.03824586628324946,
"kl_loss": 8.169689178466797,
"loss_ib": 0.4200924336910248,
"step": 266
},
{
"ce_ib": 23.65303611755371,
"ce_orig": 0.9717550277709961,
"epoch": 0.03838964773544213,
"kl_loss": 7.602072715759277,
"loss_ib": 0.39193016290664673,
"step": 267
},
{
"ce_ib": 20.33776092529297,
"ce_orig": 0.7842280864715576,
"epoch": 0.0385334291876348,
"kl_loss": 7.547301769256592,
"loss_ib": 0.3875339925289154,
"step": 268
},
{
"ce_ib": 21.66446304321289,
"ce_orig": 0.9328132271766663,
"epoch": 0.038677210639827465,
"kl_loss": 7.6735310554504395,
"loss_ib": 0.39450880885124207,
"step": 269
},
{
"epoch": 0.038820992092020126,
"grad_norm": 3.106287717819214,
"learning_rate": 2.07667731629393e-05,
"loss": 1.361,
"step": 270
},
{
"ce_ib": 14.545405387878418,
"ce_orig": 0.4456147849559784,
"epoch": 0.038820992092020126,
"kl_loss": 6.566807746887207,
"loss_ib": 0.33561310172080994,
"step": 270
},
{
"ce_ib": 22.359256744384766,
"ce_orig": 1.0915645360946655,
"epoch": 0.038964773544212794,
"kl_loss": 7.441349029541016,
"loss_ib": 0.38324710726737976,
"step": 271
},
{
"ce_ib": 19.833614349365234,
"ce_orig": 0.6568068861961365,
"epoch": 0.03910855499640546,
"kl_loss": 7.507279396057129,
"loss_ib": 0.3852807879447937,
"step": 272
},
{
"ce_ib": 18.543352127075195,
"ce_orig": 0.6701676845550537,
"epoch": 0.03925233644859813,
"kl_loss": 7.031156539916992,
"loss_ib": 0.36082950234413147,
"step": 273
},
{
"ce_ib": 19.24417495727539,
"ce_orig": 0.9314641952514648,
"epoch": 0.0393961179007908,
"kl_loss": 7.164027214050293,
"loss_ib": 0.36782345175743103,
"step": 274
},
{
"epoch": 0.039539899352983465,
"grad_norm": 3.4644718170166016,
"learning_rate": 2.1166134185303514e-05,
"loss": 1.2343,
"step": 275
},
{
"ce_ib": 20.395038604736328,
"ce_orig": 0.9956320524215698,
"epoch": 0.039539899352983465,
"kl_loss": 7.304188251495361,
"loss_ib": 0.375406950712204,
"step": 275
},
{
"ce_ib": 22.580080032348633,
"ce_orig": 1.0877642631530762,
"epoch": 0.03968368080517613,
"kl_loss": 6.740540504455566,
"loss_ib": 0.34831708669662476,
"step": 276
},
{
"ce_ib": 17.001928329467773,
"ce_orig": 0.6867518424987793,
"epoch": 0.0398274622573688,
"kl_loss": 6.005027770996094,
"loss_ib": 0.3087523579597473,
"step": 277
},
{
"ce_ib": 20.226699829101562,
"ce_orig": 0.5907849669456482,
"epoch": 0.03997124370956147,
"kl_loss": 6.040300369262695,
"loss_ib": 0.3121283948421478,
"step": 278
},
{
"ce_ib": 20.84942626953125,
"ce_orig": 0.8668643832206726,
"epoch": 0.040115025161754136,
"kl_loss": 5.51943302154541,
"loss_ib": 0.2863963544368744,
"step": 279
},
{
"epoch": 0.040258806613946804,
"grad_norm": 2.836003541946411,
"learning_rate": 2.1565495207667734e-05,
"loss": 1.2083,
"step": 280
},
{
"ce_ib": 21.198511123657227,
"ce_orig": 1.0262176990509033,
"epoch": 0.040258806613946804,
"kl_loss": 5.828641414642334,
"loss_ib": 0.30203133821487427,
"step": 280
},
{
"ce_ib": 20.3875675201416,
"ce_orig": 1.1043504476547241,
"epoch": 0.040402588066139465,
"kl_loss": 5.128015041351318,
"loss_ib": 0.2665945291519165,
"step": 281
},
{
"ce_ib": 21.915267944335938,
"ce_orig": 0.9482531547546387,
"epoch": 0.04054636951833213,
"kl_loss": 4.9252400398254395,
"loss_ib": 0.2572196424007416,
"step": 282
},
{
"ce_ib": 20.219453811645508,
"ce_orig": 0.9774989485740662,
"epoch": 0.0406901509705248,
"kl_loss": 4.067303657531738,
"loss_ib": 0.21347491443157196,
"step": 283
},
{
"ce_ib": 20.512344360351562,
"ce_orig": 0.8953350186347961,
"epoch": 0.04083393242271747,
"kl_loss": 2.980595588684082,
"loss_ib": 0.15928594768047333,
"step": 284
},
{
"epoch": 0.040977713874910136,
"grad_norm": 3.126970052719116,
"learning_rate": 2.196485623003195e-05,
"loss": 1.0977,
"step": 285
},
{
"ce_ib": 22.42814826965332,
"ce_orig": 0.8264601230621338,
"epoch": 0.040977713874910136,
"kl_loss": 2.260897397994995,
"loss_ib": 0.12425895035266876,
"step": 285
},
{
"ce_ib": 19.28723907470703,
"ce_orig": 1.0294511318206787,
"epoch": 0.041121495327102804,
"kl_loss": 1.4155142307281494,
"loss_ib": 0.08041933178901672,
"step": 286
},
{
"ce_ib": 22.32074546813965,
"ce_orig": 1.2082335948944092,
"epoch": 0.04126527677929547,
"kl_loss": 1.4922515153884888,
"loss_ib": 0.08577295392751694,
"step": 287
},
{
"ce_ib": 21.85085105895996,
"ce_orig": 1.1711450815200806,
"epoch": 0.04140905823148814,
"kl_loss": 1.0158560276031494,
"loss_ib": 0.06171822547912598,
"step": 288
},
{
"ce_ib": 19.448490142822266,
"ce_orig": 0.8083485960960388,
"epoch": 0.04155283968368081,
"kl_loss": 0.8125163316726685,
"loss_ib": 0.050350066274404526,
"step": 289
},
{
"epoch": 0.041696621135873475,
"grad_norm": 0.41338738799095154,
"learning_rate": 2.2364217252396165e-05,
"loss": 0.9838,
"step": 290
},
{
"ce_ib": 19.226776123046875,
"ce_orig": 1.1908280849456787,
"epoch": 0.041696621135873475,
"kl_loss": 0.7504492998123169,
"loss_ib": 0.0471358560025692,
"step": 290
},
{
"ce_ib": 19.87446403503418,
"ce_orig": 1.02711021900177,
"epoch": 0.041840402588066136,
"kl_loss": 0.6533533930778503,
"loss_ib": 0.04260490462183952,
"step": 291
},
{
"ce_ib": 17.87228775024414,
"ce_orig": 1.0570042133331299,
"epoch": 0.041984184040258804,
"kl_loss": 0.5579368472099304,
"loss_ib": 0.036832984536886215,
"step": 292
},
{
"ce_ib": 16.936511993408203,
"ce_orig": 0.5302789807319641,
"epoch": 0.04212796549245147,
"kl_loss": 0.6725109219551086,
"loss_ib": 0.042093802243471146,
"step": 293
},
{
"ce_ib": 18.851577758789062,
"ce_orig": 1.0163416862487793,
"epoch": 0.04227174694464414,
"kl_loss": 0.5098384618759155,
"loss_ib": 0.03491771221160889,
"step": 294
},
{
"epoch": 0.04241552839683681,
"grad_norm": 0.21297426521778107,
"learning_rate": 2.2763578274760385e-05,
"loss": 0.9126,
"step": 295
},
{
"ce_ib": 15.781968116760254,
"ce_orig": 0.5711618661880493,
"epoch": 0.04241552839683681,
"kl_loss": 0.4373496174812317,
"loss_ib": 0.02975846640765667,
"step": 295
},
{
"ce_ib": 22.636154174804688,
"ce_orig": 1.496230959892273,
"epoch": 0.042559309849029475,
"kl_loss": 0.4643814265727997,
"loss_ib": 0.03453714773058891,
"step": 296
},
{
"ce_ib": 14.683899879455566,
"ce_orig": 0.6033921241760254,
"epoch": 0.04270309130122214,
"kl_loss": 0.39723098278045654,
"loss_ib": 0.027203500270843506,
"step": 297
},
{
"ce_ib": 17.57473373413086,
"ce_orig": 0.7110596895217896,
"epoch": 0.04284687275341481,
"kl_loss": 0.4147634506225586,
"loss_ib": 0.02952553890645504,
"step": 298
},
{
"ce_ib": 16.251811981201172,
"ce_orig": 0.7895355820655823,
"epoch": 0.04299065420560748,
"kl_loss": 0.4082551896572113,
"loss_ib": 0.028538664802908897,
"step": 299
},
{
"epoch": 0.043134435657800146,
"grad_norm": 0.1568593829870224,
"learning_rate": 2.3162939297124604e-05,
"loss": 0.9485,
"step": 300
},
{
"ce_ib": 18.58441162109375,
"ce_orig": 1.089181900024414,
"epoch": 0.043134435657800146,
"kl_loss": 0.4467368721961975,
"loss_ib": 0.0316290520131588,
"step": 300
},
{
"ce_ib": 18.67475700378418,
"ce_orig": 0.566021740436554,
"epoch": 0.043278217109992814,
"kl_loss": 0.3963052034378052,
"loss_ib": 0.029152637347579002,
"step": 301
},
{
"ce_ib": 16.824068069458008,
"ce_orig": 0.7251248359680176,
"epoch": 0.043421998562185475,
"kl_loss": 0.34888386726379395,
"loss_ib": 0.025856226682662964,
"step": 302
},
{
"ce_ib": 18.8211669921875,
"ce_orig": 0.6579341888427734,
"epoch": 0.04356578001437814,
"kl_loss": 0.3751975893974304,
"loss_ib": 0.02817046456038952,
"step": 303
},
{
"ce_ib": 18.44203758239746,
"ce_orig": 0.8343374729156494,
"epoch": 0.04370956146657081,
"kl_loss": 0.3514009118080139,
"loss_ib": 0.02679106593132019,
"step": 304
},
{
"epoch": 0.04385334291876348,
"grad_norm": 0.15141652524471283,
"learning_rate": 2.356230031948882e-05,
"loss": 0.9599,
"step": 305
},
{
"ce_ib": 16.4177188873291,
"ce_orig": 0.6326029896736145,
"epoch": 0.04385334291876348,
"kl_loss": 0.331454873085022,
"loss_ib": 0.02478160336613655,
"step": 305
},
{
"ce_ib": 21.196409225463867,
"ce_orig": 1.230805516242981,
"epoch": 0.043997124370956146,
"kl_loss": 0.40000325441360474,
"loss_ib": 0.030598366633057594,
"step": 306
},
{
"ce_ib": 17.367687225341797,
"ce_orig": 0.863945484161377,
"epoch": 0.044140905823148814,
"kl_loss": 0.31744185090065,
"loss_ib": 0.024555936455726624,
"step": 307
},
{
"ce_ib": 18.942859649658203,
"ce_orig": 1.148938536643982,
"epoch": 0.04428468727534148,
"kl_loss": 0.3378192186355591,
"loss_ib": 0.02636238932609558,
"step": 308
},
{
"ce_ib": 19.47002410888672,
"ce_orig": 1.0959943532943726,
"epoch": 0.04442846872753415,
"kl_loss": 0.31226494908332825,
"loss_ib": 0.025348259136080742,
"step": 309
},
{
"epoch": 0.04457225017972682,
"grad_norm": 0.13533012568950653,
"learning_rate": 2.3961661341853036e-05,
"loss": 1.015,
"step": 310
},
{
"ce_ib": 17.681161880493164,
"ce_orig": 0.9902611374855042,
"epoch": 0.04457225017972682,
"kl_loss": 0.26686644554138184,
"loss_ib": 0.02218390442430973,
"step": 310
},
{
"ce_ib": 16.987899780273438,
"ce_orig": 0.6236902475357056,
"epoch": 0.044716031631919485,
"kl_loss": 0.33637407422065735,
"loss_ib": 0.025312652811408043,
"step": 311
},
{
"ce_ib": 18.65533447265625,
"ce_orig": 0.8152080774307251,
"epoch": 0.044859813084112146,
"kl_loss": 0.35383230447769165,
"loss_ib": 0.027019282802939415,
"step": 312
},
{
"ce_ib": 20.103788375854492,
"ce_orig": 0.8615632653236389,
"epoch": 0.045003594536304814,
"kl_loss": 0.2777783274650574,
"loss_ib": 0.023940810933709145,
"step": 313
},
{
"ce_ib": 16.64844512939453,
"ce_orig": 0.9147619605064392,
"epoch": 0.04514737598849748,
"kl_loss": 0.27109894156455994,
"loss_ib": 0.0218791700899601,
"step": 314
},
{
"epoch": 0.04529115744069015,
"grad_norm": 0.11459668725728989,
"learning_rate": 2.4361022364217255e-05,
"loss": 0.8602,
"step": 315
},
{
"ce_ib": 11.063411712646484,
"ce_orig": 0.27872321009635925,
"epoch": 0.04529115744069015,
"kl_loss": 0.3863391876220703,
"loss_ib": 0.024848666042089462,
"step": 315
},
{
"ce_ib": 17.38253402709961,
"ce_orig": 1.05631422996521,
"epoch": 0.04543493889288282,
"kl_loss": 0.29607954621315,
"loss_ib": 0.023495245724916458,
"step": 316
},
{
"ce_ib": 17.782546997070312,
"ce_orig": 0.9817179441452026,
"epoch": 0.045578720345075485,
"kl_loss": 0.28195664286613464,
"loss_ib": 0.02298910729587078,
"step": 317
},
{
"ce_ib": 17.297069549560547,
"ce_orig": 0.6919381618499756,
"epoch": 0.04572250179726815,
"kl_loss": 0.29801255464553833,
"loss_ib": 0.023549163714051247,
"step": 318
},
{
"ce_ib": 17.82324981689453,
"ce_orig": 1.2199736833572388,
"epoch": 0.04586628324946082,
"kl_loss": 0.2285030335187912,
"loss_ib": 0.020336776971817017,
"step": 319
},
{
"epoch": 0.04601006470165349,
"grad_norm": 0.10239739716053009,
"learning_rate": 2.476038338658147e-05,
"loss": 0.7986,
"step": 320
},
{
"ce_ib": 16.177907943725586,
"ce_orig": 0.7963648438453674,
"epoch": 0.04601006470165349,
"kl_loss": 0.2211613953113556,
"loss_ib": 0.019147023558616638,
"step": 320
},
{
"ce_ib": 17.704599380493164,
"ce_orig": 0.9023761749267578,
"epoch": 0.046153846153846156,
"kl_loss": 0.28145700693130493,
"loss_ib": 0.022925151512026787,
"step": 321
},
{
"ce_ib": 17.584495544433594,
"ce_orig": 0.8088329434394836,
"epoch": 0.046297627606038824,
"kl_loss": 0.2326435148715973,
"loss_ib": 0.02042442373931408,
"step": 322
},
{
"ce_ib": 16.041574478149414,
"ce_orig": 0.7201854586601257,
"epoch": 0.046441409058231485,
"kl_loss": 0.3453258275985718,
"loss_ib": 0.02528708055615425,
"step": 323
},
{
"ce_ib": 17.880464553833008,
"ce_orig": 0.917682409286499,
"epoch": 0.04658519051042415,
"kl_loss": 0.2218429148197174,
"loss_ib": 0.020032377913594246,
"step": 324
},
{
"epoch": 0.04672897196261682,
"grad_norm": 0.12532667815685272,
"learning_rate": 2.515974440894569e-05,
"loss": 0.8682,
"step": 325
},
{
"ce_ib": 15.092937469482422,
"ce_orig": 0.7272103428840637,
"epoch": 0.04672897196261682,
"kl_loss": 0.31795835494995117,
"loss_ib": 0.023444388061761856,
"step": 325
},
{
"ce_ib": 15.842549324035645,
"ce_orig": 0.9047788977622986,
"epoch": 0.04687275341480949,
"kl_loss": 0.2592480480670929,
"loss_ib": 0.020883677527308464,
"step": 326
},
{
"ce_ib": 14.095027923583984,
"ce_orig": 0.7625716328620911,
"epoch": 0.047016534867002156,
"kl_loss": 0.17521969974040985,
"loss_ib": 0.015808498486876488,
"step": 327
},
{
"ce_ib": 16.17887306213379,
"ce_orig": 0.7734029293060303,
"epoch": 0.047160316319194824,
"kl_loss": 0.23375201225280762,
"loss_ib": 0.01977703720331192,
"step": 328
},
{
"ce_ib": 18.957979202270508,
"ce_orig": 0.8559271097183228,
"epoch": 0.04730409777138749,
"kl_loss": 0.33970198035240173,
"loss_ib": 0.026464087888598442,
"step": 329
},
{
"epoch": 0.04744787922358016,
"grad_norm": 0.09043259918689728,
"learning_rate": 2.5559105431309903e-05,
"loss": 0.7934,
"step": 330
},
{
"ce_ib": 14.730635643005371,
"ce_orig": 0.8875608444213867,
"epoch": 0.04744787922358016,
"kl_loss": 0.2284042239189148,
"loss_ib": 0.01878552883863449,
"step": 330
},
{
"ce_ib": 12.517528533935547,
"ce_orig": 0.6206594705581665,
"epoch": 0.04759166067577283,
"kl_loss": 0.21303001046180725,
"loss_ib": 0.016910264268517494,
"step": 331
},
{
"ce_ib": 14.753087043762207,
"ce_orig": 0.841992199420929,
"epoch": 0.047735442127965495,
"kl_loss": 0.20966459810733795,
"loss_ib": 0.017859773710370064,
"step": 332
},
{
"ce_ib": 14.960477828979492,
"ce_orig": 1.0839353799819946,
"epoch": 0.047879223580158156,
"kl_loss": 0.15192289650440216,
"loss_ib": 0.01507638394832611,
"step": 333
},
{
"ce_ib": 15.1857328414917,
"ce_orig": 0.9817880392074585,
"epoch": 0.048023005032350824,
"kl_loss": 0.19470617175102234,
"loss_ib": 0.017328176647424698,
"step": 334
},
{
"epoch": 0.04816678648454349,
"grad_norm": 0.10444720834493637,
"learning_rate": 2.5958466453674125e-05,
"loss": 0.8294,
"step": 335
},
{
"ce_ib": 15.169656753540039,
"ce_orig": 0.8038283586502075,
"epoch": 0.04816678648454349,
"kl_loss": 0.2266630232334137,
"loss_ib": 0.018917979672551155,
"step": 335
},
{
"ce_ib": 14.643211364746094,
"ce_orig": 0.7942169904708862,
"epoch": 0.04831056793673616,
"kl_loss": 0.22511643171310425,
"loss_ib": 0.01857742667198181,
"step": 336
},
{
"ce_ib": 12.96651554107666,
"ce_orig": 0.8289546370506287,
"epoch": 0.04845434938892883,
"kl_loss": 0.17975889146327972,
"loss_ib": 0.015471202321350574,
"step": 337
},
{
"ce_ib": 16.585371017456055,
"ce_orig": 1.165753722190857,
"epoch": 0.048598130841121495,
"kl_loss": 0.190724715590477,
"loss_ib": 0.017828920856118202,
"step": 338
},
{
"ce_ib": 14.718335151672363,
"ce_orig": 0.8502370119094849,
"epoch": 0.04874191229331416,
"kl_loss": 0.20640771090984344,
"loss_ib": 0.017679553478956223,
"step": 339
},
{
"epoch": 0.04888569374550683,
"grad_norm": 0.09888631105422974,
"learning_rate": 2.6357827476038338e-05,
"loss": 0.8718,
"step": 340
},
{
"ce_ib": 13.714351654052734,
"ce_orig": 1.0317654609680176,
"epoch": 0.04888569374550683,
"kl_loss": 0.19221842288970947,
"loss_ib": 0.016468096524477005,
"step": 340
},
{
"ce_ib": 15.859162330627441,
"ce_orig": 0.6472983956336975,
"epoch": 0.0490294751976995,
"kl_loss": 0.2313256412744522,
"loss_ib": 0.01949586346745491,
"step": 341
},
{
"ce_ib": 16.512859344482422,
"ce_orig": 1.1302889585494995,
"epoch": 0.049173256649892166,
"kl_loss": 0.1906917691230774,
"loss_ib": 0.0177910178899765,
"step": 342
},
{
"ce_ib": 16.816452026367188,
"ce_orig": 0.8666338920593262,
"epoch": 0.049317038102084834,
"kl_loss": 0.17613860964775085,
"loss_ib": 0.01721515692770481,
"step": 343
},
{
"ce_ib": 13.920699119567871,
"ce_orig": 0.719912588596344,
"epoch": 0.049460819554277495,
"kl_loss": 0.18724943697452545,
"loss_ib": 0.016322821378707886,
"step": 344
},
{
"epoch": 0.04960460100647016,
"grad_norm": 0.10115125775337219,
"learning_rate": 2.6757188498402557e-05,
"loss": 0.843,
"step": 345
},
{
"ce_ib": 16.091005325317383,
"ce_orig": 1.0146100521087646,
"epoch": 0.04960460100647016,
"kl_loss": 0.1512732356786728,
"loss_ib": 0.015609164722263813,
"step": 345
},
{
"ce_ib": 15.5990571975708,
"ce_orig": 0.7898549437522888,
"epoch": 0.04974838245866283,
"kl_loss": 0.25755369663238525,
"loss_ib": 0.020677214488387108,
"step": 346
},
{
"ce_ib": 17.706438064575195,
"ce_orig": 1.2053872346878052,
"epoch": 0.0498921639108555,
"kl_loss": 0.21856242418289185,
"loss_ib": 0.019781339913606644,
"step": 347
},
{
"ce_ib": 10.220382690429688,
"ce_orig": 0.5169559121131897,
"epoch": 0.050035945363048166,
"kl_loss": 0.2967785596847534,
"loss_ib": 0.01994911953806877,
"step": 348
},
{
"ce_ib": 14.920300483703613,
"ce_orig": 0.7747300267219543,
"epoch": 0.050179726815240834,
"kl_loss": 0.13559795916080475,
"loss_ib": 0.01424004789441824,
"step": 349
},
{
"epoch": 0.0503235082674335,
"grad_norm": 0.09113696962594986,
"learning_rate": 2.7156549520766773e-05,
"loss": 0.898,
"step": 350
},
{
"ce_ib": 16.00478172302246,
"ce_orig": 1.2635383605957031,
"epoch": 0.0503235082674335,
"kl_loss": 0.2005615234375,
"loss_ib": 0.018030468374490738,
"step": 350
},
{
"ce_ib": 16.192142486572266,
"ce_orig": 0.9619611501693726,
"epoch": 0.05046728971962617,
"kl_loss": 0.25098395347595215,
"loss_ib": 0.020645270124077797,
"step": 351
},
{
"ce_ib": 13.817422866821289,
"ce_orig": 0.6711569428443909,
"epoch": 0.05061107117181884,
"kl_loss": 0.21178191900253296,
"loss_ib": 0.01749780774116516,
"step": 352
},
{
"ce_ib": 15.532769203186035,
"ce_orig": 0.5863211750984192,
"epoch": 0.050754852624011505,
"kl_loss": 0.33604636788368225,
"loss_ib": 0.024568704888224602,
"step": 353
},
{
"ce_ib": 14.479625701904297,
"ce_orig": 1.0277538299560547,
"epoch": 0.05089863407620417,
"kl_loss": 0.251324862241745,
"loss_ib": 0.01980605535209179,
"step": 354
},
{
"epoch": 0.051042415528396834,
"grad_norm": 0.10617897659540176,
"learning_rate": 2.7555910543130992e-05,
"loss": 0.8813,
"step": 355
},
{
"ce_ib": 16.734331130981445,
"ce_orig": 0.9177318215370178,
"epoch": 0.051042415528396834,
"kl_loss": 0.2959950864315033,
"loss_ib": 0.023166919127106667,
"step": 355
},
{
"ce_ib": 14.452986717224121,
"ce_orig": 0.8091175556182861,
"epoch": 0.0511861969805895,
"kl_loss": 0.19851821660995483,
"loss_ib": 0.01715240441262722,
"step": 356
},
{
"ce_ib": 15.050978660583496,
"ce_orig": 1.065936803817749,
"epoch": 0.05132997843278217,
"kl_loss": 0.1562933474779129,
"loss_ib": 0.015340156853199005,
"step": 357
},
{
"ce_ib": 16.42909812927246,
"ce_orig": 0.6200724840164185,
"epoch": 0.05147375988497484,
"kl_loss": 0.22564369440078735,
"loss_ib": 0.0194967333227396,
"step": 358
},
{
"ce_ib": 15.036846160888672,
"ce_orig": 0.8413035273551941,
"epoch": 0.051617541337167505,
"kl_loss": 0.15089106559753418,
"loss_ib": 0.015062975697219372,
"step": 359
},
{
"epoch": 0.05176132278936017,
"grad_norm": 0.0994093269109726,
"learning_rate": 2.7955271565495212e-05,
"loss": 0.8787,
"step": 360
},
{
"ce_ib": 16.430448532104492,
"ce_orig": 1.058258295059204,
"epoch": 0.05176132278936017,
"kl_loss": 0.19877898693084717,
"loss_ib": 0.018154174089431763,
"step": 360
},
{
"ce_ib": 16.932239532470703,
"ce_orig": 1.2644236087799072,
"epoch": 0.05190510424155284,
"kl_loss": 0.19584302604198456,
"loss_ib": 0.018258271738886833,
"step": 361
},
{
"ce_ib": 14.099466323852539,
"ce_orig": 0.9995023608207703,
"epoch": 0.05204888569374551,
"kl_loss": 0.16139906644821167,
"loss_ib": 0.015119686722755432,
"step": 362
},
{
"ce_ib": 15.650708198547363,
"ce_orig": 0.9404743313789368,
"epoch": 0.052192667145938176,
"kl_loss": 0.21341916918754578,
"loss_ib": 0.018496312201023102,
"step": 363
},
{
"ce_ib": 16.308624267578125,
"ce_orig": 1.1696351766586304,
"epoch": 0.052336448598130844,
"kl_loss": 0.18094685673713684,
"loss_ib": 0.017201654613018036,
"step": 364
},
{
"epoch": 0.052480230050323505,
"grad_norm": 0.08824347704648972,
"learning_rate": 2.8354632587859424e-05,
"loss": 0.92,
"step": 365
},
{
"ce_ib": 16.583515167236328,
"ce_orig": 1.182395577430725,
"epoch": 0.052480230050323505,
"kl_loss": 0.19043317437171936,
"loss_ib": 0.017813416197896004,
"step": 365
},
{
"ce_ib": 13.93136978149414,
"ce_orig": 0.8936623930931091,
"epoch": 0.05262401150251617,
"kl_loss": 0.20374388992786407,
"loss_ib": 0.017152879387140274,
"step": 366
},
{
"ce_ib": 13.256582260131836,
"ce_orig": 0.6269909143447876,
"epoch": 0.05276779295470884,
"kl_loss": 0.16010135412216187,
"loss_ib": 0.014633359387516975,
"step": 367
},
{
"ce_ib": 12.879414558410645,
"ce_orig": 0.6315767765045166,
"epoch": 0.05291157440690151,
"kl_loss": 0.16557064652442932,
"loss_ib": 0.014718241058290005,
"step": 368
},
{
"ce_ib": 15.146659851074219,
"ce_orig": 1.0897746086120605,
"epoch": 0.053055355859094176,
"kl_loss": 0.16495399177074432,
"loss_ib": 0.015821030363440514,
"step": 369
},
{
"epoch": 0.053199137311286844,
"grad_norm": 0.09635983407497406,
"learning_rate": 2.8753993610223644e-05,
"loss": 0.933,
"step": 370
},
{
"ce_ib": 14.545082092285156,
"ce_orig": 0.903797447681427,
"epoch": 0.053199137311286844,
"kl_loss": 0.12444749474525452,
"loss_ib": 0.013494915328919888,
"step": 370
},
{
"ce_ib": 12.932695388793945,
"ce_orig": 0.6494324207305908,
"epoch": 0.05334291876347951,
"kl_loss": 0.1315731406211853,
"loss_ib": 0.013045004568994045,
"step": 371
},
{
"ce_ib": 15.617569923400879,
"ce_orig": 0.7816907167434692,
"epoch": 0.05348670021567218,
"kl_loss": 0.16312208771705627,
"loss_ib": 0.015964889898896217,
"step": 372
},
{
"ce_ib": 15.400293350219727,
"ce_orig": 1.033065915107727,
"epoch": 0.05363048166786485,
"kl_loss": 0.17930516600608826,
"loss_ib": 0.01666540466248989,
"step": 373
},
{
"ce_ib": 11.445207595825195,
"ce_orig": 0.5388709902763367,
"epoch": 0.053774263120057515,
"kl_loss": 0.2616915702819824,
"loss_ib": 0.0188071820884943,
"step": 374
},
{
"epoch": 0.05391804457225018,
"grad_norm": 0.0901573896408081,
"learning_rate": 2.915335463258786e-05,
"loss": 0.926,
"step": 375
},
{
"ce_ib": 13.34332275390625,
"ce_orig": 1.1417230367660522,
"epoch": 0.05391804457225018,
"kl_loss": 0.1469050794839859,
"loss_ib": 0.014016914181411266,
"step": 375
},
{
"ce_ib": 11.211030006408691,
"ce_orig": 0.6393249034881592,
"epoch": 0.054061826024442844,
"kl_loss": 0.13886746764183044,
"loss_ib": 0.012548888102173805,
"step": 376
},
{
"ce_ib": 15.887382507324219,
"ce_orig": 0.9176316261291504,
"epoch": 0.05420560747663551,
"kl_loss": 0.2912940979003906,
"loss_ib": 0.022508395835757256,
"step": 377
},
{
"ce_ib": 11.42358112335205,
"ce_orig": 0.8122538924217224,
"epoch": 0.05434938892882818,
"kl_loss": 0.1490350216627121,
"loss_ib": 0.013163541443645954,
"step": 378
},
{
"ce_ib": 14.985864639282227,
"ce_orig": 0.9277105927467346,
"epoch": 0.05449317038102085,
"kl_loss": 0.14583294093608856,
"loss_ib": 0.014784579165279865,
"step": 379
},
{
"epoch": 0.054636951833213515,
"grad_norm": 0.12827961146831512,
"learning_rate": 2.955271565495208e-05,
"loss": 0.9204,
"step": 380
},
{
"ce_ib": 13.780610084533691,
"ce_orig": 0.8345714807510376,
"epoch": 0.054636951833213515,
"kl_loss": 0.1690724641084671,
"loss_ib": 0.015343928709626198,
"step": 380
},
{
"ce_ib": 11.765593528747559,
"ce_orig": 0.7576747536659241,
"epoch": 0.05478073328540618,
"kl_loss": 0.15963752567768097,
"loss_ib": 0.013864672742784023,
"step": 381
},
{
"ce_ib": 12.893147468566895,
"ce_orig": 1.0378029346466064,
"epoch": 0.05492451473759885,
"kl_loss": 0.13528262078762054,
"loss_ib": 0.013210705481469631,
"step": 382
},
{
"ce_ib": 15.782855987548828,
"ce_orig": 0.8279376029968262,
"epoch": 0.05506829618979152,
"kl_loss": 0.16392827033996582,
"loss_ib": 0.016087843105196953,
"step": 383
},
{
"ce_ib": 12.443214416503906,
"ce_orig": 0.8166038990020752,
"epoch": 0.055212077641984186,
"kl_loss": 0.14554069936275482,
"loss_ib": 0.013498641550540924,
"step": 384
},
{
"epoch": 0.055355859094176854,
"grad_norm": 0.13148367404937744,
"learning_rate": 2.9952076677316295e-05,
"loss": 0.8973,
"step": 385
},
{
"ce_ib": 16.574996948242188,
"ce_orig": 1.3873276710510254,
"epoch": 0.055355859094176854,
"kl_loss": 0.206925630569458,
"loss_ib": 0.018633781000971794,
"step": 385
},
{
"ce_ib": 14.954483032226562,
"ce_orig": 1.4074153900146484,
"epoch": 0.055499640546369515,
"kl_loss": 0.1725064069032669,
"loss_ib": 0.016102561727166176,
"step": 386
},
{
"ce_ib": 13.222760200500488,
"ce_orig": 0.581721842288971,
"epoch": 0.05564342199856218,
"kl_loss": 0.3106327950954437,
"loss_ib": 0.02214301936328411,
"step": 387
},
{
"ce_ib": 12.130496978759766,
"ce_orig": 0.8030052185058594,
"epoch": 0.05578720345075485,
"kl_loss": 0.19192326068878174,
"loss_ib": 0.015661410987377167,
"step": 388
},
{
"ce_ib": 12.304028511047363,
"ce_orig": 0.838097095489502,
"epoch": 0.05593098490294752,
"kl_loss": 0.17265933752059937,
"loss_ib": 0.0147849814966321,
"step": 389
},
{
"epoch": 0.056074766355140186,
"grad_norm": 0.1129549965262413,
"learning_rate": 3.0351437699680514e-05,
"loss": 0.9147,
"step": 390
},
{
"ce_ib": 12.584757804870605,
"ce_orig": 0.6829859018325806,
"epoch": 0.056074766355140186,
"kl_loss": 0.15448611974716187,
"loss_ib": 0.014016685076057911,
"step": 390
},
{
"ce_ib": 14.888505935668945,
"ce_orig": 0.8695336580276489,
"epoch": 0.056218547807332854,
"kl_loss": 0.18638572096824646,
"loss_ib": 0.016763538122177124,
"step": 391
},
{
"ce_ib": 13.158818244934082,
"ce_orig": 0.724577009677887,
"epoch": 0.05636232925952552,
"kl_loss": 0.15236912667751312,
"loss_ib": 0.01419786550104618,
"step": 392
},
{
"ce_ib": 14.405329704284668,
"ce_orig": 0.4904825985431671,
"epoch": 0.05650611071171819,
"kl_loss": 0.21818403899669647,
"loss_ib": 0.01811186783015728,
"step": 393
},
{
"ce_ib": 10.370551109313965,
"ce_orig": 0.6885640621185303,
"epoch": 0.05664989216391086,
"kl_loss": 0.18041831254959106,
"loss_ib": 0.014206192456185818,
"step": 394
},
{
"epoch": 0.056793673616103525,
"grad_norm": 0.09922255575656891,
"learning_rate": 3.075079872204473e-05,
"loss": 0.8916,
"step": 395
},
{
"ce_ib": 15.597278594970703,
"ce_orig": 1.2381712198257446,
"epoch": 0.056793673616103525,
"kl_loss": 0.1798793077468872,
"loss_ib": 0.016792604699730873,
"step": 395
},
{
"ce_ib": 16.811328887939453,
"ce_orig": 1.2628995180130005,
"epoch": 0.05693745506829619,
"kl_loss": 0.171632319688797,
"loss_ib": 0.016987280920147896,
"step": 396
},
{
"ce_ib": 14.330126762390137,
"ce_orig": 0.842546284198761,
"epoch": 0.057081236520488854,
"kl_loss": 0.17398859560489655,
"loss_ib": 0.01586449332535267,
"step": 397
},
{
"ce_ib": 13.21159553527832,
"ce_orig": 0.8423411250114441,
"epoch": 0.05722501797268152,
"kl_loss": 0.13025188446044922,
"loss_ib": 0.013118392787873745,
"step": 398
},
{
"ce_ib": 12.565587043762207,
"ce_orig": 0.7189036011695862,
"epoch": 0.05736879942487419,
"kl_loss": 0.14162641763687134,
"loss_ib": 0.013364115729928017,
"step": 399
},
{
"epoch": 0.05751258087706686,
"grad_norm": 0.0966155007481575,
"learning_rate": 3.115015974440895e-05,
"loss": 0.9267,
"step": 400
},
{
"ce_ib": 11.058226585388184,
"ce_orig": 0.9342263340950012,
"epoch": 0.05751258087706686,
"kl_loss": 0.11544251441955566,
"loss_ib": 0.011301239021122456,
"step": 400
},
{
"ce_ib": 14.200401306152344,
"ce_orig": 1.0684270858764648,
"epoch": 0.057656362329259525,
"kl_loss": 0.14682269096374512,
"loss_ib": 0.014441335573792458,
"step": 401
},
{
"ce_ib": 14.785656929016113,
"ce_orig": 1.1560802459716797,
"epoch": 0.05780014378145219,
"kl_loss": 0.1572328507900238,
"loss_ib": 0.015254470519721508,
"step": 402
},
{
"ce_ib": 16.529001235961914,
"ce_orig": 1.4409286975860596,
"epoch": 0.05794392523364486,
"kl_loss": 0.1712377667427063,
"loss_ib": 0.016826389357447624,
"step": 403
},
{
"ce_ib": 8.062843322753906,
"ce_orig": 0.4845752716064453,
"epoch": 0.05808770668583753,
"kl_loss": 0.114130899310112,
"loss_ib": 0.009737967513501644,
"step": 404
},
{
"epoch": 0.058231488138030196,
"grad_norm": 0.10344849526882172,
"learning_rate": 3.154952076677317e-05,
"loss": 0.9143,
"step": 405
},
{
"ce_ib": 14.247758865356445,
"ce_orig": 0.8494449853897095,
"epoch": 0.058231488138030196,
"kl_loss": 0.14982560276985168,
"loss_ib": 0.014615160413086414,
"step": 405
},
{
"ce_ib": 15.090539932250977,
"ce_orig": 1.255419135093689,
"epoch": 0.058375269590222864,
"kl_loss": 0.2056526243686676,
"loss_ib": 0.017827901989221573,
"step": 406
},
{
"ce_ib": 13.584562301635742,
"ce_orig": 1.0737160444259644,
"epoch": 0.058519051042415525,
"kl_loss": 0.16253307461738586,
"loss_ib": 0.014918935485184193,
"step": 407
},
{
"ce_ib": 10.844743728637695,
"ce_orig": 0.7258655428886414,
"epoch": 0.05866283249460819,
"kl_loss": 0.2175343632698059,
"loss_ib": 0.016299089416861534,
"step": 408
},
{
"ce_ib": 9.838624000549316,
"ce_orig": 0.5341205596923828,
"epoch": 0.05880661394680086,
"kl_loss": 0.24159343540668488,
"loss_ib": 0.016998983919620514,
"step": 409
},
{
"epoch": 0.05895039539899353,
"grad_norm": 0.0828595831990242,
"learning_rate": 3.194888178913738e-05,
"loss": 0.8595,
"step": 410
},
{
"ce_ib": 11.394186019897461,
"ce_orig": 0.6941292881965637,
"epoch": 0.05895039539899353,
"kl_loss": 0.13403424620628357,
"loss_ib": 0.012398804537951946,
"step": 410
},
{
"ce_ib": 13.786474227905273,
"ce_orig": 1.0304478406906128,
"epoch": 0.059094176851186196,
"kl_loss": 0.24797815084457397,
"loss_ib": 0.01929214410483837,
"step": 411
},
{
"ce_ib": 12.280767440795898,
"ce_orig": 0.9082537889480591,
"epoch": 0.059237958303378864,
"kl_loss": 0.09719130396842957,
"loss_ib": 0.010999949648976326,
"step": 412
},
{
"ce_ib": 11.551681518554688,
"ce_orig": 0.9754782915115356,
"epoch": 0.05938173975557153,
"kl_loss": 0.1405172348022461,
"loss_ib": 0.0128017021343112,
"step": 413
},
{
"ce_ib": 13.329681396484375,
"ce_orig": 0.9015910625457764,
"epoch": 0.0595255212077642,
"kl_loss": 0.15253770351409912,
"loss_ib": 0.014291726052761078,
"step": 414
},
{
"epoch": 0.05966930265995687,
"grad_norm": 0.09882552921772003,
"learning_rate": 3.23482428115016e-05,
"loss": 0.8577,
"step": 415
},
{
"ce_ib": 12.369913101196289,
"ce_orig": 0.8101427555084229,
"epoch": 0.05966930265995687,
"kl_loss": 0.2113226056098938,
"loss_ib": 0.016751086339354515,
"step": 415
},
{
"ce_ib": 14.39426040649414,
"ce_orig": 1.3613587617874146,
"epoch": 0.059813084112149535,
"kl_loss": 0.1314837634563446,
"loss_ib": 0.013771317899227142,
"step": 416
},
{
"ce_ib": 12.081097602844238,
"ce_orig": 0.4347302317619324,
"epoch": 0.0599568655643422,
"kl_loss": 0.2995225489139557,
"loss_ib": 0.021016675978899002,
"step": 417
},
{
"ce_ib": 15.218514442443848,
"ce_orig": 1.2289142608642578,
"epoch": 0.060100647016534864,
"kl_loss": 0.26874852180480957,
"loss_ib": 0.021046683192253113,
"step": 418
},
{
"ce_ib": 13.258194923400879,
"ce_orig": 1.0039843320846558,
"epoch": 0.06024442846872753,
"kl_loss": 0.12897028028964996,
"loss_ib": 0.013077611103653908,
"step": 419
},
{
"epoch": 0.0603882099209202,
"grad_norm": 0.1349947154521942,
"learning_rate": 3.274760383386581e-05,
"loss": 0.899,
"step": 420
},
{
"ce_ib": 12.171891212463379,
"ce_orig": 0.9144300222396851,
"epoch": 0.0603882099209202,
"kl_loss": 0.21492531895637512,
"loss_ib": 0.01683221198618412,
"step": 420
},
{
"ce_ib": 10.995501518249512,
"ce_orig": 0.699188768863678,
"epoch": 0.06053199137311287,
"kl_loss": 0.12099233269691467,
"loss_ib": 0.011547367088496685,
"step": 421
},
{
"ce_ib": 12.991347312927246,
"ce_orig": 0.9281318187713623,
"epoch": 0.060675772825305535,
"kl_loss": 0.1598033308982849,
"loss_ib": 0.014485838823020458,
"step": 422
},
{
"ce_ib": 10.423280715942383,
"ce_orig": 0.9821050763130188,
"epoch": 0.0608195542774982,
"kl_loss": 0.10947795957326889,
"loss_ib": 0.010685537941753864,
"step": 423
},
{
"ce_ib": 11.12364387512207,
"ce_orig": 0.7817228436470032,
"epoch": 0.06096333572969087,
"kl_loss": 0.17594116926193237,
"loss_ib": 0.014358880929648876,
"step": 424
},
{
"epoch": 0.06110711718188354,
"grad_norm": 0.0929858386516571,
"learning_rate": 3.314696485623003e-05,
"loss": 0.7994,
"step": 425
},
{
"ce_ib": 12.660994529724121,
"ce_orig": 0.9210802912712097,
"epoch": 0.06110711718188354,
"kl_loss": 0.15979456901550293,
"loss_ib": 0.014320224523544312,
"step": 425
},
{
"ce_ib": 15.475061416625977,
"ce_orig": 1.6302592754364014,
"epoch": 0.061250898634076206,
"kl_loss": 0.1844199150800705,
"loss_ib": 0.01695852540433407,
"step": 426
},
{
"ce_ib": 9.331029891967773,
"ce_orig": 0.5564351081848145,
"epoch": 0.061394680086268874,
"kl_loss": 0.1388329267501831,
"loss_ib": 0.011607161723077297,
"step": 427
},
{
"ce_ib": 12.874106407165527,
"ce_orig": 0.9861687421798706,
"epoch": 0.06153846153846154,
"kl_loss": 0.12479162216186523,
"loss_ib": 0.012676633894443512,
"step": 428
},
{
"ce_ib": 7.756659507751465,
"ce_orig": 0.28384384512901306,
"epoch": 0.0616822429906542,
"kl_loss": 0.2630873918533325,
"loss_ib": 0.017032699659466743,
"step": 429
},
{
"epoch": 0.06182602444284687,
"grad_norm": 0.09535211324691772,
"learning_rate": 3.354632587859425e-05,
"loss": 0.9043,
"step": 430
},
{
"ce_ib": 10.622055053710938,
"ce_orig": 0.6629616022109985,
"epoch": 0.06182602444284687,
"kl_loss": 0.13496457040309906,
"loss_ib": 0.012059256434440613,
"step": 430
},
{
"ce_ib": 11.811662673950195,
"ce_orig": 0.7327677011489868,
"epoch": 0.06196980589503954,
"kl_loss": 0.14013449847698212,
"loss_ib": 0.012912556529045105,
"step": 431
},
{
"ce_ib": 8.620430946350098,
"ce_orig": 0.7203670144081116,
"epoch": 0.062113587347232206,
"kl_loss": 0.11925005167722702,
"loss_ib": 0.010272718034684658,
"step": 432
},
{
"ce_ib": 14.616909980773926,
"ce_orig": 1.4517083168029785,
"epoch": 0.062257368799424874,
"kl_loss": 0.15538114309310913,
"loss_ib": 0.01507751177996397,
"step": 433
},
{
"ce_ib": 9.763717651367188,
"ce_orig": 0.6260893940925598,
"epoch": 0.06240115025161754,
"kl_loss": 0.13390487432479858,
"loss_ib": 0.011577102355659008,
"step": 434
},
{
"epoch": 0.0625449317038102,
"grad_norm": 0.11180251836776733,
"learning_rate": 3.394568690095847e-05,
"loss": 0.9071,
"step": 435
},
{
"ce_ib": 12.881009101867676,
"ce_orig": 0.5546009540557861,
"epoch": 0.0625449317038102,
"kl_loss": 0.13927477598190308,
"loss_ib": 0.013404244557023048,
"step": 435
},
{
"ce_ib": 12.253645896911621,
"ce_orig": 0.7509746551513672,
"epoch": 0.06268871315600287,
"kl_loss": 0.16948378086090088,
"loss_ib": 0.014601011760532856,
"step": 436
},
{
"ce_ib": 13.29328441619873,
"ce_orig": 0.9583929777145386,
"epoch": 0.06283249460819554,
"kl_loss": 0.15764841437339783,
"loss_ib": 0.014529063366353512,
"step": 437
},
{
"ce_ib": 12.615095138549805,
"ce_orig": 1.1630975008010864,
"epoch": 0.0629762760603882,
"kl_loss": 0.12097503244876862,
"loss_ib": 0.012356298975646496,
"step": 438
},
{
"ce_ib": 11.95744514465332,
"ce_orig": 0.734953761100769,
"epoch": 0.06312005751258087,
"kl_loss": 0.13797758519649506,
"loss_ib": 0.012877601198852062,
"step": 439
},
{
"epoch": 0.06326383896477354,
"grad_norm": 0.09555409848690033,
"learning_rate": 3.434504792332269e-05,
"loss": 0.8284,
"step": 440
},
{
"ce_ib": 17.494842529296875,
"ce_orig": 1.2540117502212524,
"epoch": 0.06326383896477354,
"kl_loss": 0.20184318721294403,
"loss_ib": 0.018839580938220024,
"step": 440
},
{
"ce_ib": 13.520644187927246,
"ce_orig": 1.173345923423767,
"epoch": 0.06340762041696621,
"kl_loss": 0.224016010761261,
"loss_ib": 0.01796112395823002,
"step": 441
},
{
"ce_ib": 6.444005489349365,
"ce_orig": 0.33801600337028503,
"epoch": 0.06355140186915888,
"kl_loss": 0.24252083897590637,
"loss_ib": 0.015348044224083424,
"step": 442
},
{
"ce_ib": 11.194876670837402,
"ce_orig": 0.8596982359886169,
"epoch": 0.06369518332135155,
"kl_loss": 0.1475781947374344,
"loss_ib": 0.012976348400115967,
"step": 443
},
{
"ce_ib": 14.788161277770996,
"ce_orig": 0.9179244041442871,
"epoch": 0.06383896477354421,
"kl_loss": 0.23837195336818695,
"loss_ib": 0.019312677904963493,
"step": 444
},
{
"epoch": 0.06398274622573688,
"grad_norm": 0.10049393773078918,
"learning_rate": 3.47444089456869e-05,
"loss": 0.915,
"step": 445
},
{
"ce_ib": 12.728458404541016,
"ce_orig": 0.6808370351791382,
"epoch": 0.06398274622573688,
"kl_loss": 0.16207855939865112,
"loss_ib": 0.014468157663941383,
"step": 445
},
{
"ce_ib": 9.595919609069824,
"ce_orig": 0.48967745900154114,
"epoch": 0.06412652767792955,
"kl_loss": 0.12488089501857758,
"loss_ib": 0.011042005382478237,
"step": 446
},
{
"ce_ib": 15.164140701293945,
"ce_orig": 0.8277769684791565,
"epoch": 0.06427030913012222,
"kl_loss": 0.1784917414188385,
"loss_ib": 0.016506657004356384,
"step": 447
},
{
"ce_ib": 11.631290435791016,
"ce_orig": 0.786353588104248,
"epoch": 0.06441409058231488,
"kl_loss": 0.13490960001945496,
"loss_ib": 0.012561124749481678,
"step": 448
},
{
"ce_ib": 11.316841125488281,
"ce_orig": 0.6659090518951416,
"epoch": 0.06455787203450755,
"kl_loss": 0.1083206981420517,
"loss_ib": 0.011074455454945564,
"step": 449
},
{
"epoch": 0.06470165348670022,
"grad_norm": 0.10170278698205948,
"learning_rate": 3.514376996805112e-05,
"loss": 0.8419,
"step": 450
},
{
"ce_ib": 9.49474048614502,
"ce_orig": 0.7802785038948059,
"epoch": 0.06470165348670022,
"kl_loss": 0.1756356954574585,
"loss_ib": 0.013529154472053051,
"step": 450
},
{
"ce_ib": 11.215967178344727,
"ce_orig": 0.4214544892311096,
"epoch": 0.06484543493889289,
"kl_loss": 0.12542136013507843,
"loss_ib": 0.01187905203551054,
"step": 451
},
{
"ce_ib": 6.312502861022949,
"ce_orig": 0.3134852945804596,
"epoch": 0.06498921639108556,
"kl_loss": 0.2386016845703125,
"loss_ib": 0.015086335130035877,
"step": 452
},
{
"ce_ib": 13.910443305969238,
"ce_orig": 0.7964897155761719,
"epoch": 0.06513299784327822,
"kl_loss": 0.19260820746421814,
"loss_ib": 0.016585631296038628,
"step": 453
},
{
"ce_ib": 10.858504295349121,
"ce_orig": 0.8178758025169373,
"epoch": 0.06527677929547089,
"kl_loss": 0.1271795630455017,
"loss_ib": 0.011788229458034039,
"step": 454
},
{
"epoch": 0.06542056074766354,
"grad_norm": 0.1267521232366562,
"learning_rate": 3.5543130990415334e-05,
"loss": 0.8513,
"step": 455
},
{
"ce_ib": 11.637347221374512,
"ce_orig": 1.0193455219268799,
"epoch": 0.06542056074766354,
"kl_loss": 0.144621342420578,
"loss_ib": 0.01304974127560854,
"step": 455
},
{
"ce_ib": 10.80041217803955,
"ce_orig": 0.6328637599945068,
"epoch": 0.06556434219985621,
"kl_loss": 0.10495860129594803,
"loss_ib": 0.010648136027157307,
"step": 456
},
{
"ce_ib": 7.313602447509766,
"ce_orig": 0.42815887928009033,
"epoch": 0.06570812365204888,
"kl_loss": 0.17510683834552765,
"loss_ib": 0.01241214293986559,
"step": 457
},
{
"ce_ib": 13.05362606048584,
"ce_orig": 1.0646302700042725,
"epoch": 0.06585190510424155,
"kl_loss": 0.14800792932510376,
"loss_ib": 0.013927209191024303,
"step": 458
},
{
"ce_ib": 8.705698013305664,
"ce_orig": 0.5751362442970276,
"epoch": 0.06599568655643422,
"kl_loss": 0.19291532039642334,
"loss_ib": 0.013998615555465221,
"step": 459
},
{
"epoch": 0.06613946800862688,
"grad_norm": 0.14026452600955963,
"learning_rate": 3.5942492012779554e-05,
"loss": 0.8978,
"step": 460
},
{
"ce_ib": 12.629561424255371,
"ce_orig": 1.248939871788025,
"epoch": 0.06613946800862688,
"kl_loss": 0.1431877613067627,
"loss_ib": 0.013474169187247753,
"step": 460
},
{
"ce_ib": 13.466840744018555,
"ce_orig": 1.1314830780029297,
"epoch": 0.06628324946081955,
"kl_loss": 0.11893537640571594,
"loss_ib": 0.012680189684033394,
"step": 461
},
{
"ce_ib": 12.272945404052734,
"ce_orig": 0.5334405303001404,
"epoch": 0.06642703091301222,
"kl_loss": 0.19608467817306519,
"loss_ib": 0.015940707176923752,
"step": 462
},
{
"ce_ib": 11.584327697753906,
"ce_orig": 0.5882666707038879,
"epoch": 0.06657081236520489,
"kl_loss": 0.15428690612316132,
"loss_ib": 0.013506509363651276,
"step": 463
},
{
"ce_ib": 10.483445167541504,
"ce_orig": 0.5081559419631958,
"epoch": 0.06671459381739756,
"kl_loss": 0.23190432786941528,
"loss_ib": 0.01683693937957287,
"step": 464
},
{
"epoch": 0.06685837526959022,
"grad_norm": 0.11186351627111435,
"learning_rate": 3.6341853035143766e-05,
"loss": 0.977,
"step": 465
},
{
"ce_ib": 10.818644523620605,
"ce_orig": 0.8423200249671936,
"epoch": 0.06685837526959022,
"kl_loss": 0.12322719395160675,
"loss_ib": 0.011570681817829609,
"step": 465
},
{
"ce_ib": 13.477171897888184,
"ce_orig": 1.135223627090454,
"epoch": 0.06700215672178289,
"kl_loss": 0.1358594447374344,
"loss_ib": 0.013531558215618134,
"step": 466
},
{
"ce_ib": 12.029156684875488,
"ce_orig": 0.925537645816803,
"epoch": 0.06714593817397556,
"kl_loss": 0.1674036681652069,
"loss_ib": 0.014384761452674866,
"step": 467
},
{
"ce_ib": 8.591270446777344,
"ce_orig": 0.6351872086524963,
"epoch": 0.06728971962616823,
"kl_loss": 0.16200634837150574,
"loss_ib": 0.012395952828228474,
"step": 468
},
{
"ce_ib": 12.34648609161377,
"ce_orig": 0.8252216577529907,
"epoch": 0.0674335010783609,
"kl_loss": 0.1306806206703186,
"loss_ib": 0.012707273475825787,
"step": 469
},
{
"epoch": 0.06757728253055356,
"grad_norm": 0.11462409794330597,
"learning_rate": 3.6741214057507985e-05,
"loss": 0.8112,
"step": 470
},
{
"ce_ib": 12.145110130310059,
"ce_orig": 0.7569481730461121,
"epoch": 0.06757728253055356,
"kl_loss": 0.1333114206790924,
"loss_ib": 0.012738126330077648,
"step": 470
},
{
"ce_ib": 10.791728019714355,
"ce_orig": 0.8886812329292297,
"epoch": 0.06772106398274623,
"kl_loss": 0.14122334122657776,
"loss_ib": 0.012457030825316906,
"step": 471
},
{
"ce_ib": 11.75979232788086,
"ce_orig": 0.93720543384552,
"epoch": 0.0678648454349389,
"kl_loss": 0.08433859050273895,
"loss_ib": 0.010096825659275055,
"step": 472
},
{
"ce_ib": 7.816238880157471,
"ce_orig": 0.5898436903953552,
"epoch": 0.06800862688713157,
"kl_loss": 0.26155394315719604,
"loss_ib": 0.0169858168810606,
"step": 473
},
{
"ce_ib": 12.64213752746582,
"ce_orig": 1.2193433046340942,
"epoch": 0.06815240833932423,
"kl_loss": 0.1382063627243042,
"loss_ib": 0.013231388293206692,
"step": 474
},
{
"epoch": 0.0682961897915169,
"grad_norm": 0.12322834134101868,
"learning_rate": 3.714057507987221e-05,
"loss": 0.942,
"step": 475
},
{
"ce_ib": 12.208565711975098,
"ce_orig": 0.6283319592475891,
"epoch": 0.0682961897915169,
"kl_loss": 0.1437569409608841,
"loss_ib": 0.013292129151523113,
"step": 475
},
{
"ce_ib": 10.480301856994629,
"ce_orig": 0.6875295042991638,
"epoch": 0.06843997124370955,
"kl_loss": 0.171269491314888,
"loss_ib": 0.013803625479340553,
"step": 476
},
{
"ce_ib": 12.140584945678711,
"ce_orig": 0.686497151851654,
"epoch": 0.06858375269590222,
"kl_loss": 0.14714768528938293,
"loss_ib": 0.013427676633000374,
"step": 477
},
{
"ce_ib": 13.12353515625,
"ce_orig": 1.330522060394287,
"epoch": 0.06872753414809489,
"kl_loss": 0.18113256990909576,
"loss_ib": 0.015618395991623402,
"step": 478
},
{
"ce_ib": 12.710488319396973,
"ce_orig": 1.4100775718688965,
"epoch": 0.06887131560028756,
"kl_loss": 0.12010614573955536,
"loss_ib": 0.01236055139452219,
"step": 479
},
{
"epoch": 0.06901509705248023,
"grad_norm": 0.11526408791542053,
"learning_rate": 3.7539936102236424e-05,
"loss": 0.8998,
"step": 480
},
{
"ce_ib": 13.879955291748047,
"ce_orig": 1.1341381072998047,
"epoch": 0.06901509705248023,
"kl_loss": 0.1282053291797638,
"loss_ib": 0.013350243680179119,
"step": 480
},
{
"ce_ib": 8.764423370361328,
"ce_orig": 0.6200535893440247,
"epoch": 0.0691588785046729,
"kl_loss": 0.1320233792066574,
"loss_ib": 0.010983380489051342,
"step": 481
},
{
"ce_ib": 9.943157196044922,
"ce_orig": 0.6673835515975952,
"epoch": 0.06930265995686556,
"kl_loss": 0.10792216658592224,
"loss_ib": 0.010367686860263348,
"step": 482
},
{
"ce_ib": 10.92377758026123,
"ce_orig": 0.7028371095657349,
"epoch": 0.06944644140905823,
"kl_loss": 0.15012815594673157,
"loss_ib": 0.012968296185135841,
"step": 483
},
{
"ce_ib": 9.512238502502441,
"ce_orig": 0.5816277265548706,
"epoch": 0.0695902228612509,
"kl_loss": 0.13318368792533875,
"loss_ib": 0.011415303684771061,
"step": 484
},
{
"epoch": 0.06973400431344356,
"grad_norm": 0.12581641972064972,
"learning_rate": 3.793929712460064e-05,
"loss": 0.8585,
"step": 485
},
{
"ce_ib": 13.113508224487305,
"ce_orig": 1.0122153759002686,
"epoch": 0.06973400431344356,
"kl_loss": 0.17498700320720673,
"loss_ib": 0.01530610304325819,
"step": 485
},
{
"ce_ib": 11.20240592956543,
"ce_orig": 0.8718814253807068,
"epoch": 0.06987778576563623,
"kl_loss": 0.12174628674983978,
"loss_ib": 0.011688517406582832,
"step": 486
},
{
"ce_ib": 12.784674644470215,
"ce_orig": 0.8871896266937256,
"epoch": 0.0700215672178289,
"kl_loss": 0.16703587770462036,
"loss_ib": 0.014744131825864315,
"step": 487
},
{
"ce_ib": 10.580418586730957,
"ce_orig": 0.8577698469161987,
"epoch": 0.07016534867002157,
"kl_loss": 0.11169049143791199,
"loss_ib": 0.010874733328819275,
"step": 488
},
{
"ce_ib": 10.39923095703125,
"ce_orig": 0.6622049808502197,
"epoch": 0.07030913012221424,
"kl_loss": 0.17256517708301544,
"loss_ib": 0.013827874325215816,
"step": 489
},
{
"epoch": 0.0704529115744069,
"grad_norm": 0.11361895501613617,
"learning_rate": 3.8338658146964856e-05,
"loss": 0.901,
"step": 490
},
{
"ce_ib": 13.566216468811035,
"ce_orig": 0.9003996849060059,
"epoch": 0.0704529115744069,
"kl_loss": 0.18744704127311707,
"loss_ib": 0.016155460849404335,
"step": 490
},
{
"ce_ib": 11.603694915771484,
"ce_orig": 1.0972646474838257,
"epoch": 0.07059669302659957,
"kl_loss": 0.09514350444078445,
"loss_ib": 0.010559022426605225,
"step": 491
},
{
"ce_ib": 12.866926193237305,
"ce_orig": 1.1191866397857666,
"epoch": 0.07074047447879224,
"kl_loss": 0.12916871905326843,
"loss_ib": 0.012891898863017559,
"step": 492
},
{
"ce_ib": 11.685700416564941,
"ce_orig": 1.0439685583114624,
"epoch": 0.07088425593098491,
"kl_loss": 0.13916221261024475,
"loss_ib": 0.012800960801541805,
"step": 493
},
{
"ce_ib": 8.240974426269531,
"ce_orig": 0.6664552092552185,
"epoch": 0.07102803738317758,
"kl_loss": 0.0913599506020546,
"loss_ib": 0.008688484318554401,
"step": 494
},
{
"epoch": 0.07117181883537024,
"grad_norm": 0.11460109055042267,
"learning_rate": 3.8738019169329075e-05,
"loss": 0.8749,
"step": 495
},
{
"ce_ib": 15.161900520324707,
"ce_orig": 1.5418399572372437,
"epoch": 0.07117181883537024,
"kl_loss": 0.14499804377555847,
"loss_ib": 0.014830851927399635,
"step": 495
},
{
"ce_ib": 12.499137878417969,
"ce_orig": 0.7715917229652405,
"epoch": 0.07131560028756291,
"kl_loss": 0.13650982081890106,
"loss_ib": 0.013075060211122036,
"step": 496
},
{
"ce_ib": 10.516082763671875,
"ce_orig": 0.810151219367981,
"epoch": 0.07145938173975556,
"kl_loss": 0.1658136546611786,
"loss_ib": 0.013548724353313446,
"step": 497
},
{
"ce_ib": 12.26677131652832,
"ce_orig": 0.4871862828731537,
"epoch": 0.07160316319194823,
"kl_loss": 0.15727362036705017,
"loss_ib": 0.013997065834701061,
"step": 498
},
{
"ce_ib": 8.879415512084961,
"ce_orig": 0.5100986957550049,
"epoch": 0.0717469446441409,
"kl_loss": 0.10066086053848267,
"loss_ib": 0.009472750127315521,
"step": 499
},
{
"epoch": 0.07189072609633357,
"grad_norm": 0.1443518102169037,
"learning_rate": 3.913738019169329e-05,
"loss": 0.8769,
"step": 500
},
{
"ce_ib": 11.191156387329102,
"ce_orig": 0.7616560459136963,
"epoch": 0.07189072609633357,
"kl_loss": 0.13632240891456604,
"loss_ib": 0.012411698698997498,
"step": 500
},
{
"ce_ib": 11.555066108703613,
"ce_orig": 0.8566383123397827,
"epoch": 0.07203450754852624,
"kl_loss": 0.162722647190094,
"loss_ib": 0.013913665898144245,
"step": 501
},
{
"ce_ib": 10.90505599975586,
"ce_orig": 0.9519692063331604,
"epoch": 0.0721782890007189,
"kl_loss": 0.12059365957975388,
"loss_ib": 0.011482210829854012,
"step": 502
},
{
"ce_ib": 9.185860633850098,
"ce_orig": 0.5769492387771606,
"epoch": 0.07232207045291157,
"kl_loss": 0.14534525573253632,
"loss_ib": 0.011860193684697151,
"step": 503
},
{
"ce_ib": 12.747846603393555,
"ce_orig": 0.893709123134613,
"epoch": 0.07246585190510424,
"kl_loss": 0.12308457493782043,
"loss_ib": 0.012528151273727417,
"step": 504
},
{
"epoch": 0.07260963335729691,
"grad_norm": 0.11177890002727509,
"learning_rate": 3.953674121405751e-05,
"loss": 0.8203,
"step": 505
},
{
"ce_ib": 10.646434783935547,
"ce_orig": 1.0113996267318726,
"epoch": 0.07260963335729691,
"kl_loss": 0.12359193712472916,
"loss_ib": 0.01150281447917223,
"step": 505
},
{
"ce_ib": 10.180166244506836,
"ce_orig": 1.0264123678207397,
"epoch": 0.07275341480948957,
"kl_loss": 0.1544847935438156,
"loss_ib": 0.012814322486519814,
"step": 506
},
{
"ce_ib": 11.716635704040527,
"ce_orig": 0.9286133646965027,
"epoch": 0.07289719626168224,
"kl_loss": 0.1243153065443039,
"loss_ib": 0.012074083089828491,
"step": 507
},
{
"ce_ib": 9.650020599365234,
"ce_orig": 0.9953688383102417,
"epoch": 0.07304097771387491,
"kl_loss": 0.12280933558940887,
"loss_ib": 0.010965476743876934,
"step": 508
},
{
"ce_ib": 9.697562217712402,
"ce_orig": 0.4963395297527313,
"epoch": 0.07318475916606758,
"kl_loss": 0.11651341617107391,
"loss_ib": 0.010674451477825642,
"step": 509
},
{
"epoch": 0.07332854061826025,
"grad_norm": 0.10411624610424042,
"learning_rate": 3.9936102236421726e-05,
"loss": 0.8555,
"step": 510
},
{
"ce_ib": 11.558809280395508,
"ce_orig": 1.02761971950531,
"epoch": 0.07332854061826025,
"kl_loss": 0.15587545931339264,
"loss_ib": 0.01357317715883255,
"step": 510
},
{
"ce_ib": 12.694424629211426,
"ce_orig": 1.1861635446548462,
"epoch": 0.07347232207045291,
"kl_loss": 0.1220870316028595,
"loss_ib": 0.012451563961803913,
"step": 511
},
{
"ce_ib": 8.967998504638672,
"ce_orig": 0.8245717883110046,
"epoch": 0.07361610352264558,
"kl_loss": 0.11289598047733307,
"loss_ib": 0.010128798894584179,
"step": 512
},
{
"ce_ib": 10.940625190734863,
"ce_orig": 0.742675244808197,
"epoch": 0.07375988497483825,
"kl_loss": 0.11433502286672592,
"loss_ib": 0.011187063530087471,
"step": 513
},
{
"ce_ib": 11.216033935546875,
"ce_orig": 0.7051765322685242,
"epoch": 0.07390366642703092,
"kl_loss": 0.17939868569374084,
"loss_ib": 0.014577952213585377,
"step": 514
},
{
"epoch": 0.07404744787922359,
"grad_norm": 0.12375368177890778,
"learning_rate": 4.0335463258785946e-05,
"loss": 0.8018,
"step": 515
},
{
"ce_ib": 12.851394653320312,
"ce_orig": 0.959805965423584,
"epoch": 0.07404744787922359,
"kl_loss": 0.11663807928562164,
"loss_ib": 0.012257601134479046,
"step": 515
},
{
"ce_ib": 9.45693302154541,
"ce_orig": 0.6901520490646362,
"epoch": 0.07419122933141625,
"kl_loss": 0.16356007754802704,
"loss_ib": 0.012906471267342567,
"step": 516
},
{
"ce_ib": 13.640007972717285,
"ce_orig": 1.0324355363845825,
"epoch": 0.07433501078360892,
"kl_loss": 0.14768928289413452,
"loss_ib": 0.01420446764677763,
"step": 517
},
{
"ce_ib": 10.631475448608398,
"ce_orig": 0.7622382044792175,
"epoch": 0.07447879223580157,
"kl_loss": 0.1414967179298401,
"loss_ib": 0.012390573509037495,
"step": 518
},
{
"ce_ib": 13.440409660339355,
"ce_orig": 1.4887185096740723,
"epoch": 0.07462257368799424,
"kl_loss": 0.13175088167190552,
"loss_ib": 0.013307749293744564,
"step": 519
},
{
"epoch": 0.07476635514018691,
"grad_norm": 0.12905798852443695,
"learning_rate": 4.0734824281150165e-05,
"loss": 0.9488,
"step": 520
},
{
"ce_ib": 11.399759292602539,
"ce_orig": 0.9179264307022095,
"epoch": 0.07476635514018691,
"kl_loss": 0.1584293693304062,
"loss_ib": 0.013621347956359386,
"step": 520
},
{
"ce_ib": 9.124701499938965,
"ce_orig": 0.6920540928840637,
"epoch": 0.07491013659237958,
"kl_loss": 0.1023043617606163,
"loss_ib": 0.009677569381892681,
"step": 521
},
{
"ce_ib": 11.09079647064209,
"ce_orig": 1.124171257019043,
"epoch": 0.07505391804457225,
"kl_loss": 0.1274327039718628,
"loss_ib": 0.011917034164071083,
"step": 522
},
{
"ce_ib": 11.651871681213379,
"ce_orig": 1.1573556661605835,
"epoch": 0.07519769949676491,
"kl_loss": 0.1257188469171524,
"loss_ib": 0.012111878953874111,
"step": 523
},
{
"ce_ib": 10.919167518615723,
"ce_orig": 1.096139907836914,
"epoch": 0.07534148094895758,
"kl_loss": 0.14454081654548645,
"loss_ib": 0.012686625123023987,
"step": 524
},
{
"epoch": 0.07548526240115025,
"grad_norm": 0.09990247339010239,
"learning_rate": 4.113418530351438e-05,
"loss": 0.8695,
"step": 525
},
{
"ce_ib": 13.44006633758545,
"ce_orig": 1.1762773990631104,
"epoch": 0.07548526240115025,
"kl_loss": 0.1347535103559494,
"loss_ib": 0.013457709923386574,
"step": 525
},
{
"ce_ib": 10.870895385742188,
"ce_orig": 0.6810830235481262,
"epoch": 0.07562904385334292,
"kl_loss": 0.1084175780415535,
"loss_ib": 0.010856325738132,
"step": 526
},
{
"ce_ib": 11.001641273498535,
"ce_orig": 0.5774558782577515,
"epoch": 0.07577282530553558,
"kl_loss": 0.17218388617038727,
"loss_ib": 0.014110015705227852,
"step": 527
},
{
"ce_ib": 12.118193626403809,
"ce_orig": 1.1861246824264526,
"epoch": 0.07591660675772825,
"kl_loss": 0.12062694877386093,
"loss_ib": 0.012090444564819336,
"step": 528
},
{
"ce_ib": 10.118423461914062,
"ce_orig": 0.9129387736320496,
"epoch": 0.07606038820992092,
"kl_loss": 0.09990894794464111,
"loss_ib": 0.010054659098386765,
"step": 529
},
{
"epoch": 0.07620416966211359,
"grad_norm": 0.11624792218208313,
"learning_rate": 4.15335463258786e-05,
"loss": 0.8571,
"step": 530
},
{
"ce_ib": 10.449731826782227,
"ce_orig": 0.5511190891265869,
"epoch": 0.07620416966211359,
"kl_loss": 0.18196257948875427,
"loss_ib": 0.014322995208203793,
"step": 530
},
{
"ce_ib": 10.00999641418457,
"ce_orig": 0.7014174461364746,
"epoch": 0.07634795111430626,
"kl_loss": 0.14452239871025085,
"loss_ib": 0.01223111804574728,
"step": 531
},
{
"ce_ib": 12.196045875549316,
"ce_orig": 1.0053819417953491,
"epoch": 0.07649173256649892,
"kl_loss": 0.12312173843383789,
"loss_ib": 0.012254110537469387,
"step": 532
},
{
"ce_ib": 11.569887161254883,
"ce_orig": 0.9444661140441895,
"epoch": 0.07663551401869159,
"kl_loss": 0.12740397453308105,
"loss_ib": 0.012155142612755299,
"step": 533
},
{
"ce_ib": 9.093774795532227,
"ce_orig": 0.7246710658073425,
"epoch": 0.07677929547088426,
"kl_loss": 0.1298743486404419,
"loss_ib": 0.011040604673326015,
"step": 534
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.13377481698989868,
"learning_rate": 4.193290734824281e-05,
"loss": 0.8601,
"step": 535
},
{
"ce_ib": 13.371556282043457,
"ce_orig": 1.1632790565490723,
"epoch": 0.07692307692307693,
"kl_loss": 0.13051572442054749,
"loss_ib": 0.013211563229560852,
"step": 535
},
{
"ce_ib": 11.797460556030273,
"ce_orig": 0.9421883821487427,
"epoch": 0.0770668583752696,
"kl_loss": 0.13678061962127686,
"loss_ib": 0.012737761251628399,
"step": 536
},
{
"ce_ib": 12.24150562286377,
"ce_orig": 0.7441449761390686,
"epoch": 0.07721063982746226,
"kl_loss": 0.11821313202381134,
"loss_ib": 0.012031408958137035,
"step": 537
},
{
"ce_ib": 10.22864818572998,
"ce_orig": 0.9380773305892944,
"epoch": 0.07735442127965493,
"kl_loss": 0.10538578778505325,
"loss_ib": 0.010383613407611847,
"step": 538
},
{
"ce_ib": 12.875476837158203,
"ce_orig": 0.8242666721343994,
"epoch": 0.0774982027318476,
"kl_loss": 0.15831780433654785,
"loss_ib": 0.014353628270328045,
"step": 539
},
{
"epoch": 0.07764198418404025,
"grad_norm": 0.10647567361593246,
"learning_rate": 4.233226837060703e-05,
"loss": 0.8364,
"step": 540
},
{
"ce_ib": 11.523818016052246,
"ce_orig": 0.8004295229911804,
"epoch": 0.07764198418404025,
"kl_loss": 0.1284905970096588,
"loss_ib": 0.012186438776552677,
"step": 540
},
{
"ce_ib": 11.77370548248291,
"ce_orig": 0.6032363176345825,
"epoch": 0.07778576563623292,
"kl_loss": 0.17018523812294006,
"loss_ib": 0.014396115206182003,
"step": 541
},
{
"ce_ib": 10.745952606201172,
"ce_orig": 0.6720147132873535,
"epoch": 0.07792954708842559,
"kl_loss": 0.13083013892173767,
"loss_ib": 0.011914483271539211,
"step": 542
},
{
"ce_ib": 12.431722640991211,
"ce_orig": 1.328582525253296,
"epoch": 0.07807332854061826,
"kl_loss": 0.11531039327383041,
"loss_ib": 0.011981381103396416,
"step": 543
},
{
"ce_ib": 10.487591743469238,
"ce_orig": 0.8851913809776306,
"epoch": 0.07821710999281092,
"kl_loss": 0.1059553325176239,
"loss_ib": 0.010541562922298908,
"step": 544
},
{
"epoch": 0.07836089144500359,
"grad_norm": 0.12910747528076172,
"learning_rate": 4.273162939297125e-05,
"loss": 0.8664,
"step": 545
},
{
"ce_ib": 10.0007905960083,
"ce_orig": 0.9234699010848999,
"epoch": 0.07836089144500359,
"kl_loss": 0.1631077080965042,
"loss_ib": 0.013155780732631683,
"step": 545
},
{
"ce_ib": 10.215255737304688,
"ce_orig": 0.8365392088890076,
"epoch": 0.07850467289719626,
"kl_loss": 0.11404451727867126,
"loss_ib": 0.010809853672981262,
"step": 546
},
{
"ce_ib": 10.295679092407227,
"ce_orig": 0.6545149683952332,
"epoch": 0.07864845434938893,
"kl_loss": 0.12140201032161713,
"loss_ib": 0.011217939667403698,
"step": 547
},
{
"ce_ib": 9.238651275634766,
"ce_orig": 0.7438675761222839,
"epoch": 0.0787922358015816,
"kl_loss": 0.10375800728797913,
"loss_ib": 0.009807226248085499,
"step": 548
},
{
"ce_ib": 11.177849769592285,
"ce_orig": 0.9599023461341858,
"epoch": 0.07893601725377426,
"kl_loss": 0.15443462133407593,
"loss_ib": 0.013310655951499939,
"step": 549
},
{
"epoch": 0.07907979870596693,
"grad_norm": 0.11976125091314316,
"learning_rate": 4.313099041533547e-05,
"loss": 0.9199,
"step": 550
},
{
"ce_ib": 8.745708465576172,
"ce_orig": 0.745059072971344,
"epoch": 0.07907979870596693,
"kl_loss": 0.10965421795845032,
"loss_ib": 0.00985556561499834,
"step": 550
},
{
"ce_ib": 9.332277297973633,
"ce_orig": 0.7904736995697021,
"epoch": 0.0792235801581596,
"kl_loss": 0.14583170413970947,
"loss_ib": 0.011957723647356033,
"step": 551
},
{
"ce_ib": 10.741596221923828,
"ce_orig": 0.9755902290344238,
"epoch": 0.07936736161035227,
"kl_loss": 0.11897563934326172,
"loss_ib": 0.011319580487906933,
"step": 552
},
{
"ce_ib": 8.226643562316895,
"ce_orig": 0.6986103057861328,
"epoch": 0.07951114306254493,
"kl_loss": 0.11197762191295624,
"loss_ib": 0.009712203405797482,
"step": 553
},
{
"ce_ib": 11.091338157653809,
"ce_orig": 0.7560675740242004,
"epoch": 0.0796549245147376,
"kl_loss": 0.12657985091209412,
"loss_ib": 0.011874661780893803,
"step": 554
},
{
"epoch": 0.07979870596693027,
"grad_norm": 0.1107710599899292,
"learning_rate": 4.3530351437699686e-05,
"loss": 0.8873,
"step": 555
},
{
"ce_ib": 10.592079162597656,
"ce_orig": 1.1071833372116089,
"epoch": 0.07979870596693027,
"kl_loss": 0.13393369317054749,
"loss_ib": 0.011992724612355232,
"step": 555
},
{
"ce_ib": 10.618276596069336,
"ce_orig": 0.9089037775993347,
"epoch": 0.07994248741912294,
"kl_loss": 0.16187895834445953,
"loss_ib": 0.013403086923062801,
"step": 556
},
{
"ce_ib": 10.179542541503906,
"ce_orig": 0.7911267876625061,
"epoch": 0.0800862688713156,
"kl_loss": 0.19424620270729065,
"loss_ib": 0.01480208057910204,
"step": 557
},
{
"ce_ib": 9.025123596191406,
"ce_orig": 0.8696843385696411,
"epoch": 0.08023005032350827,
"kl_loss": 0.10537652671337128,
"loss_ib": 0.009781388565897942,
"step": 558
},
{
"ce_ib": 10.719731330871582,
"ce_orig": 1.1363403797149658,
"epoch": 0.08037383177570094,
"kl_loss": 0.12376905977725983,
"loss_ib": 0.011548318900167942,
"step": 559
},
{
"epoch": 0.08051761322789361,
"grad_norm": 0.11026783287525177,
"learning_rate": 4.39297124600639e-05,
"loss": 0.9008,
"step": 560
},
{
"ce_ib": 11.907670021057129,
"ce_orig": 1.1177096366882324,
"epoch": 0.08051761322789361,
"kl_loss": 0.13737264275550842,
"loss_ib": 0.012822466902434826,
"step": 560
},
{
"ce_ib": 8.7591552734375,
"ce_orig": 0.5712915658950806,
"epoch": 0.08066139468008626,
"kl_loss": 0.0947001650929451,
"loss_ib": 0.009114585816860199,
"step": 561
},
{
"ce_ib": 7.738790988922119,
"ce_orig": 0.688946545124054,
"epoch": 0.08080517613227893,
"kl_loss": 0.10011854022741318,
"loss_ib": 0.00887532252818346,
"step": 562
},
{
"ce_ib": 9.232172012329102,
"ce_orig": 0.485678106546402,
"epoch": 0.0809489575844716,
"kl_loss": 0.14091522991657257,
"loss_ib": 0.011661847122013569,
"step": 563
},
{
"ce_ib": 11.895172119140625,
"ce_orig": 1.1715614795684814,
"epoch": 0.08109273903666427,
"kl_loss": 0.1400681585073471,
"loss_ib": 0.012950994074344635,
"step": 564
},
{
"epoch": 0.08123652048885693,
"grad_norm": 0.12719914317131042,
"learning_rate": 4.432907348242812e-05,
"loss": 0.9266,
"step": 565
},
{
"ce_ib": 11.971985816955566,
"ce_orig": 1.0788679122924805,
"epoch": 0.08123652048885693,
"kl_loss": 0.1462956666946411,
"loss_ib": 0.013300776481628418,
"step": 565
},
{
"ce_ib": 10.301172256469727,
"ce_orig": 1.0739414691925049,
"epoch": 0.0813803019410496,
"kl_loss": 0.13510483503341675,
"loss_ib": 0.01190582849085331,
"step": 566
},
{
"ce_ib": 13.213077545166016,
"ce_orig": 1.0148154497146606,
"epoch": 0.08152408339324227,
"kl_loss": 0.14191558957099915,
"loss_ib": 0.01370231807231903,
"step": 567
},
{
"ce_ib": 8.01417350769043,
"ce_orig": 0.836399257183075,
"epoch": 0.08166786484543494,
"kl_loss": 0.11862446367740631,
"loss_ib": 0.009938309900462627,
"step": 568
},
{
"ce_ib": 9.659282684326172,
"ce_orig": 0.9532420039176941,
"epoch": 0.0818116462976276,
"kl_loss": 0.15856535732746124,
"loss_ib": 0.01275790948420763,
"step": 569
},
{
"epoch": 0.08195542774982027,
"grad_norm": 0.11036694049835205,
"learning_rate": 4.472843450479233e-05,
"loss": 0.8765,
"step": 570
},
{
"ce_ib": 6.825167655944824,
"ce_orig": 0.5682023763656616,
"epoch": 0.08195542774982027,
"kl_loss": 0.17675429582595825,
"loss_ib": 0.012250298634171486,
"step": 570
},
{
"ce_ib": 9.991971969604492,
"ce_orig": 0.8481072783470154,
"epoch": 0.08209920920201294,
"kl_loss": 0.17780299484729767,
"loss_ib": 0.013886136002838612,
"step": 571
},
{
"ce_ib": 11.411465644836426,
"ce_orig": 1.0281726121902466,
"epoch": 0.08224299065420561,
"kl_loss": 0.11431736499071121,
"loss_ib": 0.01142160128802061,
"step": 572
},
{
"ce_ib": 13.019828796386719,
"ce_orig": 1.1337321996688843,
"epoch": 0.08238677210639828,
"kl_loss": 0.2963365316390991,
"loss_ib": 0.021326741203665733,
"step": 573
},
{
"ce_ib": 8.582294464111328,
"ce_orig": 0.5892922878265381,
"epoch": 0.08253055355859094,
"kl_loss": 0.10176214575767517,
"loss_ib": 0.009379254654049873,
"step": 574
},
{
"epoch": 0.08267433501078361,
"grad_norm": 0.1296963393688202,
"learning_rate": 4.512779552715655e-05,
"loss": 0.9216,
"step": 575
},
{
"ce_ib": 8.016057014465332,
"ce_orig": 0.8886985778808594,
"epoch": 0.08267433501078361,
"kl_loss": 0.11895924806594849,
"loss_ib": 0.009955990128219128,
"step": 575
},
{
"ce_ib": 8.56618595123291,
"ce_orig": 0.9640144109725952,
"epoch": 0.08281811646297628,
"kl_loss": 0.10030095279216766,
"loss_ib": 0.009298141114413738,
"step": 576
},
{
"ce_ib": 11.222845077514648,
"ce_orig": 0.7690722346305847,
"epoch": 0.08296189791516895,
"kl_loss": 0.15022683143615723,
"loss_ib": 0.01312276441603899,
"step": 577
},
{
"ce_ib": 8.99817943572998,
"ce_orig": 0.8388349413871765,
"epoch": 0.08310567936736162,
"kl_loss": 0.1211586445569992,
"loss_ib": 0.01055702194571495,
"step": 578
},
{
"ce_ib": 8.70596694946289,
"ce_orig": 0.8935554623603821,
"epoch": 0.08324946081955428,
"kl_loss": 0.18791162967681885,
"loss_ib": 0.013748565688729286,
"step": 579
},
{
"epoch": 0.08339324227174695,
"grad_norm": 0.09863687306642532,
"learning_rate": 4.552715654952077e-05,
"loss": 0.8877,
"step": 580
},
{
"ce_ib": 9.075626373291016,
"ce_orig": 0.6573423743247986,
"epoch": 0.08339324227174695,
"kl_loss": 0.11877353489398956,
"loss_ib": 0.010476489551365376,
"step": 580
},
{
"ce_ib": 9.447694778442383,
"ce_orig": 0.8443095684051514,
"epoch": 0.08353702372393962,
"kl_loss": 0.10504357516765594,
"loss_ib": 0.009976026602089405,
"step": 581
},
{
"ce_ib": 10.347243309020996,
"ce_orig": 0.8741527795791626,
"epoch": 0.08368080517613227,
"kl_loss": 0.1390937864780426,
"loss_ib": 0.012128311209380627,
"step": 582
},
{
"ce_ib": 10.385614395141602,
"ce_orig": 0.9572657346725464,
"epoch": 0.08382458662832494,
"kl_loss": 0.11069104075431824,
"loss_ib": 0.010727359913289547,
"step": 583
},
{
"ce_ib": 9.45213508605957,
"ce_orig": 0.8818128705024719,
"epoch": 0.08396836808051761,
"kl_loss": 0.11259730160236359,
"loss_ib": 0.0103559335693717,
"step": 584
},
{
"epoch": 0.08411214953271028,
"grad_norm": 0.13552848994731903,
"learning_rate": 4.592651757188499e-05,
"loss": 0.8925,
"step": 585
},
{
"ce_ib": 11.096572875976562,
"ce_orig": 1.0407826900482178,
"epoch": 0.08411214953271028,
"kl_loss": 0.12498383969068527,
"loss_ib": 0.01179747935384512,
"step": 585
},
{
"ce_ib": 8.041762351989746,
"ce_orig": 0.5626028776168823,
"epoch": 0.08425593098490294,
"kl_loss": 0.10758772492408752,
"loss_ib": 0.009400267153978348,
"step": 586
},
{
"ce_ib": 9.080262184143066,
"ce_orig": 0.8330552577972412,
"epoch": 0.08439971243709561,
"kl_loss": 0.11845473945140839,
"loss_ib": 0.01046286802738905,
"step": 587
},
{
"ce_ib": 9.385934829711914,
"ce_orig": 0.9699596166610718,
"epoch": 0.08454349388928828,
"kl_loss": 0.11468707025051117,
"loss_ib": 0.010427321307361126,
"step": 588
},
{
"ce_ib": 8.70030689239502,
"ce_orig": 0.8794450163841248,
"epoch": 0.08468727534148095,
"kl_loss": 0.09058257937431335,
"loss_ib": 0.008879282511770725,
"step": 589
},
{
"epoch": 0.08483105679367361,
"grad_norm": 0.12226880341768265,
"learning_rate": 4.632587859424921e-05,
"loss": 0.8537,
"step": 590
},
{
"ce_ib": 11.954180717468262,
"ce_orig": 0.6831358075141907,
"epoch": 0.08483105679367361,
"kl_loss": 0.11714005470275879,
"loss_ib": 0.011834092438220978,
"step": 590
},
{
"ce_ib": 11.130764961242676,
"ce_orig": 1.1798628568649292,
"epoch": 0.08497483824586628,
"kl_loss": 0.11755703389644623,
"loss_ib": 0.01144323404878378,
"step": 591
},
{
"ce_ib": 9.00490665435791,
"ce_orig": 1.0365432500839233,
"epoch": 0.08511861969805895,
"kl_loss": 0.10278277099132538,
"loss_ib": 0.009641592390835285,
"step": 592
},
{
"ce_ib": 10.016988754272461,
"ce_orig": 0.7779159545898438,
"epoch": 0.08526240115025162,
"kl_loss": 0.1287814825773239,
"loss_ib": 0.01144756842404604,
"step": 593
},
{
"ce_ib": 8.835247039794922,
"ce_orig": 0.7125930190086365,
"epoch": 0.08540618260244429,
"kl_loss": 0.14099836349487305,
"loss_ib": 0.011467541567981243,
"step": 594
},
{
"epoch": 0.08554996405463695,
"grad_norm": 0.11100345104932785,
"learning_rate": 4.672523961661342e-05,
"loss": 0.8406,
"step": 595
},
{
"ce_ib": 11.02431583404541,
"ce_orig": 0.7663914561271667,
"epoch": 0.08554996405463695,
"kl_loss": 0.0822073221206665,
"loss_ib": 0.009622524492442608,
"step": 595
},
{
"ce_ib": 10.236846923828125,
"ce_orig": 0.5589779615402222,
"epoch": 0.08569374550682962,
"kl_loss": 0.1600915789604187,
"loss_ib": 0.013123002834618092,
"step": 596
},
{
"ce_ib": 8.800088882446289,
"ce_orig": 0.6275121569633484,
"epoch": 0.08583752695902229,
"kl_loss": 0.12892715632915497,
"loss_ib": 0.010846401564776897,
"step": 597
},
{
"ce_ib": 8.204318046569824,
"ce_orig": 0.6452017426490784,
"epoch": 0.08598130841121496,
"kl_loss": 0.10103371739387512,
"loss_ib": 0.00915384478867054,
"step": 598
},
{
"ce_ib": 8.391890525817871,
"ce_orig": 0.7417363524436951,
"epoch": 0.08612508986340763,
"kl_loss": 0.13140645623207092,
"loss_ib": 0.010766267776489258,
"step": 599
},
{
"epoch": 0.08626887131560029,
"grad_norm": 0.132738396525383,
"learning_rate": 4.712460063897764e-05,
"loss": 0.8082,
"step": 600
},
{
"ce_ib": 10.40978717803955,
"ce_orig": 0.8029245734214783,
"epoch": 0.08626887131560029,
"kl_loss": 0.09170211851596832,
"loss_ib": 0.009789999574422836,
"step": 600
},
{
"ce_ib": 8.616018295288086,
"ce_orig": 1.1102370023727417,
"epoch": 0.08641265276779296,
"kl_loss": 0.11172410845756531,
"loss_ib": 0.009894214570522308,
"step": 601
},
{
"ce_ib": 10.94422435760498,
"ce_orig": 0.9171683192253113,
"epoch": 0.08655643421998563,
"kl_loss": 0.13117164373397827,
"loss_ib": 0.01203069370239973,
"step": 602
},
{
"ce_ib": 11.655625343322754,
"ce_orig": 1.1917483806610107,
"epoch": 0.08670021567217828,
"kl_loss": 0.13371366262435913,
"loss_ib": 0.012513495981693268,
"step": 603
},
{
"ce_ib": 7.700047969818115,
"ce_orig": 0.6811072826385498,
"epoch": 0.08684399712437095,
"kl_loss": 0.13317114114761353,
"loss_ib": 0.010508581064641476,
"step": 604
},
{
"epoch": 0.08698777857656362,
"grad_norm": 0.10468501597642899,
"learning_rate": 4.752396166134185e-05,
"loss": 0.8558,
"step": 605
},
{
"ce_ib": 9.883849143981934,
"ce_orig": 0.9263736605644226,
"epoch": 0.08698777857656362,
"kl_loss": 0.21517115831375122,
"loss_ib": 0.015700481832027435,
"step": 605
},
{
"ce_ib": 9.480666160583496,
"ce_orig": 1.0162242650985718,
"epoch": 0.08713156002875629,
"kl_loss": 0.11819291114807129,
"loss_ib": 0.010649978183209896,
"step": 606
},
{
"ce_ib": 9.99695873260498,
"ce_orig": 0.7871977090835571,
"epoch": 0.08727534148094895,
"kl_loss": 0.21086569130420685,
"loss_ib": 0.015541763976216316,
"step": 607
},
{
"ce_ib": 10.787432670593262,
"ce_orig": 1.3771346807479858,
"epoch": 0.08741912293314162,
"kl_loss": 0.11459565162658691,
"loss_ib": 0.01112349983304739,
"step": 608
},
{
"ce_ib": 7.926780700683594,
"ce_orig": 0.6084997057914734,
"epoch": 0.08756290438533429,
"kl_loss": 0.11265780031681061,
"loss_ib": 0.009596280753612518,
"step": 609
},
{
"epoch": 0.08770668583752696,
"grad_norm": 0.1861116737127304,
"learning_rate": 4.792332268370607e-05,
"loss": 0.841,
"step": 610
},
{
"ce_ib": 11.350239753723145,
"ce_orig": 1.3010127544403076,
"epoch": 0.08770668583752696,
"kl_loss": 0.1271829605102539,
"loss_ib": 0.012034268118441105,
"step": 610
},
{
"ce_ib": 9.161566734313965,
"ce_orig": 1.089638590812683,
"epoch": 0.08785046728971962,
"kl_loss": 0.1308199018239975,
"loss_ib": 0.0111217787489295,
"step": 611
},
{
"ce_ib": 9.268600463867188,
"ce_orig": 0.6986385583877563,
"epoch": 0.08799424874191229,
"kl_loss": 0.1047590896487236,
"loss_ib": 0.00987225491553545,
"step": 612
},
{
"ce_ib": 9.51627254486084,
"ce_orig": 0.7158625721931458,
"epoch": 0.08813803019410496,
"kl_loss": 0.13369254767894745,
"loss_ib": 0.011442764662206173,
"step": 613
},
{
"ce_ib": 8.772662162780762,
"ce_orig": 1.0912283658981323,
"epoch": 0.08828181164629763,
"kl_loss": 0.12375178933143616,
"loss_ib": 0.010573920793831348,
"step": 614
},
{
"epoch": 0.0884255930984903,
"grad_norm": 0.11201420426368713,
"learning_rate": 4.832268370607029e-05,
"loss": 0.9532,
"step": 615
},
{
"ce_ib": 9.231523513793945,
"ce_orig": 0.9239461421966553,
"epoch": 0.0884255930984903,
"kl_loss": 0.11232542991638184,
"loss_ib": 0.010232033208012581,
"step": 615
},
{
"ce_ib": 10.984260559082031,
"ce_orig": 1.0705440044403076,
"epoch": 0.08856937455068296,
"kl_loss": 0.14025995135307312,
"loss_ib": 0.012505128048360348,
"step": 616
},
{
"ce_ib": 9.729615211486816,
"ce_orig": 0.7350847721099854,
"epoch": 0.08871315600287563,
"kl_loss": 0.1346513330936432,
"loss_ib": 0.011597374454140663,
"step": 617
},
{
"ce_ib": 10.69382381439209,
"ce_orig": 0.7845515608787537,
"epoch": 0.0888569374550683,
"kl_loss": 0.16862596571445465,
"loss_ib": 0.013778209686279297,
"step": 618
},
{
"ce_ib": 9.848870277404785,
"ce_orig": 0.9884905815124512,
"epoch": 0.08900071890726097,
"kl_loss": 0.13911129534244537,
"loss_ib": 0.011880000121891499,
"step": 619
},
{
"epoch": 0.08914450035945364,
"grad_norm": 0.12046127766370773,
"learning_rate": 4.872204472843451e-05,
"loss": 0.8906,
"step": 620
},
{
"ce_ib": 9.231974601745605,
"ce_orig": 0.6775012612342834,
"epoch": 0.08914450035945364,
"kl_loss": 0.17472121119499207,
"loss_ib": 0.013352048583328724,
"step": 620
},
{
"ce_ib": 11.804699897766113,
"ce_orig": 1.529233694076538,
"epoch": 0.0892882818116463,
"kl_loss": 0.12259548157453537,
"loss_ib": 0.01203212421387434,
"step": 621
},
{
"ce_ib": 7.200037002563477,
"ce_orig": 0.4418286383152008,
"epoch": 0.08943206326383897,
"kl_loss": 0.1409044861793518,
"loss_ib": 0.010645243339240551,
"step": 622
},
{
"ce_ib": 13.076546669006348,
"ce_orig": 1.375049114227295,
"epoch": 0.08957584471603164,
"kl_loss": 0.13703083992004395,
"loss_ib": 0.01338981557637453,
"step": 623
},
{
"ce_ib": 7.833815574645996,
"ce_orig": 0.5396220684051514,
"epoch": 0.08971962616822429,
"kl_loss": 0.11749826371669769,
"loss_ib": 0.009791821241378784,
"step": 624
},
{
"epoch": 0.08986340762041696,
"grad_norm": 0.1378038227558136,
"learning_rate": 4.912140575079873e-05,
"loss": 0.9042,
"step": 625
},
{
"ce_ib": 8.829384803771973,
"ce_orig": 0.9549583792686462,
"epoch": 0.08986340762041696,
"kl_loss": 0.08257201313972473,
"loss_ib": 0.008543292991816998,
"step": 625
},
{
"ce_ib": 10.215102195739746,
"ce_orig": 0.6707859039306641,
"epoch": 0.09000718907260963,
"kl_loss": 0.1320197582244873,
"loss_ib": 0.011708538979291916,
"step": 626
},
{
"ce_ib": 12.020796775817871,
"ce_orig": 1.0823876857757568,
"epoch": 0.0901509705248023,
"kl_loss": 0.12088725715875626,
"loss_ib": 0.012054760940372944,
"step": 627
},
{
"ce_ib": 6.340538501739502,
"ce_orig": 0.5447245836257935,
"epoch": 0.09029475197699496,
"kl_loss": 0.08977651596069336,
"loss_ib": 0.0076590958051383495,
"step": 628
},
{
"ce_ib": 10.423829078674316,
"ce_orig": 1.2043941020965576,
"epoch": 0.09043853342918763,
"kl_loss": 0.08985006809234619,
"loss_ib": 0.009704417549073696,
"step": 629
},
{
"epoch": 0.0905823148813803,
"grad_norm": 0.13364273309707642,
"learning_rate": 4.952076677316294e-05,
"loss": 0.9167,
"step": 630
},
{
"ce_ib": 10.545193672180176,
"ce_orig": 1.0608717203140259,
"epoch": 0.0905823148813803,
"kl_loss": 0.10969355702400208,
"loss_ib": 0.01075727492570877,
"step": 630
},
{
"ce_ib": 6.636918544769287,
"ce_orig": 0.5482816696166992,
"epoch": 0.09072609633357297,
"kl_loss": 0.14597171545028687,
"loss_ib": 0.010617044754326344,
"step": 631
},
{
"ce_ib": 7.407113075256348,
"ce_orig": 0.6304860711097717,
"epoch": 0.09086987778576563,
"kl_loss": 0.1305876523256302,
"loss_ib": 0.010232939384877682,
"step": 632
},
{
"ce_ib": 9.818191528320312,
"ce_orig": 0.7274070978164673,
"epoch": 0.0910136592379583,
"kl_loss": 0.10508064925670624,
"loss_ib": 0.01016312837600708,
"step": 633
},
{
"ce_ib": 7.220940589904785,
"ce_orig": 0.6228238940238953,
"epoch": 0.09115744069015097,
"kl_loss": 0.11822935938835144,
"loss_ib": 0.009521937929093838,
"step": 634
},
{
"epoch": 0.09130122214234364,
"grad_norm": 0.1587258279323578,
"learning_rate": 4.992012779552716e-05,
"loss": 0.8734,
"step": 635
},
{
"ce_ib": 11.307782173156738,
"ce_orig": 0.93541419506073,
"epoch": 0.09130122214234364,
"kl_loss": 0.10773300379514694,
"loss_ib": 0.011040541343390942,
"step": 635
},
{
"ce_ib": 12.55269718170166,
"ce_orig": 1.1095880270004272,
"epoch": 0.0914450035945363,
"kl_loss": 0.12884891033172607,
"loss_ib": 0.01271879393607378,
"step": 636
},
{
"ce_ib": 10.303481101989746,
"ce_orig": 1.0926016569137573,
"epoch": 0.09158878504672897,
"kl_loss": 0.09748281538486481,
"loss_ib": 0.010025881230831146,
"step": 637
},
{
"ce_ib": 10.083890914916992,
"ce_orig": 1.1573371887207031,
"epoch": 0.09173256649892164,
"kl_loss": 0.1186913400888443,
"loss_ib": 0.010976512916386127,
"step": 638
},
{
"ce_ib": 8.328096389770508,
"ce_orig": 0.8586428165435791,
"epoch": 0.09187634795111431,
"kl_loss": 0.11891971528530121,
"loss_ib": 0.010110034607350826,
"step": 639
},
{
"epoch": 0.09202012940330698,
"grad_norm": 0.1216014102101326,
"learning_rate": 4.999999518105881e-05,
"loss": 0.9069,
"step": 640
},
{
"ce_ib": 12.23937702178955,
"ce_orig": 1.0276381969451904,
"epoch": 0.09202012940330698,
"kl_loss": 0.1557500809431076,
"loss_ib": 0.013907193206250668,
"step": 640
},
{
"ce_ib": 10.701261520385742,
"ce_orig": 1.2324309349060059,
"epoch": 0.09216391085549965,
"kl_loss": 0.12038681656122208,
"loss_ib": 0.011369972489774227,
"step": 641
},
{
"ce_ib": 10.260677337646484,
"ce_orig": 0.7305514216423035,
"epoch": 0.09230769230769231,
"kl_loss": 0.1621103733778,
"loss_ib": 0.013235858641564846,
"step": 642
},
{
"ce_ib": 8.343496322631836,
"ce_orig": 0.9428196549415588,
"epoch": 0.09245147375988498,
"kl_loss": 0.1055351197719574,
"loss_ib": 0.009448505006730556,
"step": 643
},
{
"ce_ib": 11.247652053833008,
"ce_orig": 1.0904582738876343,
"epoch": 0.09259525521207765,
"kl_loss": 0.13140398263931274,
"loss_ib": 0.012194025330245495,
"step": 644
},
{
"epoch": 0.0927390366642703,
"grad_norm": 0.11910561472177505,
"learning_rate": 4.9999975604113406e-05,
"loss": 0.9693,
"step": 645
},
{
"ce_ib": 6.632207870483398,
"ce_orig": 0.6067308783531189,
"epoch": 0.0927390366642703,
"kl_loss": 0.09612976759672165,
"loss_ib": 0.008122592233121395,
"step": 645
},
{
"ce_ib": 8.443811416625977,
"ce_orig": 0.5988494157791138,
"epoch": 0.09288281811646297,
"kl_loss": 0.09652799367904663,
"loss_ib": 0.009048305451869965,
"step": 646
},
{
"ce_ib": 4.723007678985596,
"ce_orig": 0.2960648238658905,
"epoch": 0.09302659956865564,
"kl_loss": 0.17113137245178223,
"loss_ib": 0.010918072424829006,
"step": 647
},
{
"ce_ib": 9.759404182434082,
"ce_orig": 1.0581693649291992,
"epoch": 0.0931703810208483,
"kl_loss": 0.1296691596508026,
"loss_ib": 0.011363159865140915,
"step": 648
},
{
"ce_ib": 10.742829322814941,
"ce_orig": 1.3071558475494385,
"epoch": 0.09331416247304097,
"kl_loss": 0.1319178342819214,
"loss_ib": 0.011967306025326252,
"step": 649
},
{
"epoch": 0.09345794392523364,
"grad_norm": 0.1334671825170517,
"learning_rate": 4.999994096799175e-05,
"loss": 0.8334,
"step": 650
},
{
"ce_ib": 8.407774925231934,
"ce_orig": 1.0564472675323486,
"epoch": 0.09345794392523364,
"kl_loss": 0.10574951022863388,
"loss_ib": 0.009491363540291786,
"step": 650
},
{
"ce_ib": 8.83208179473877,
"ce_orig": 1.1277897357940674,
"epoch": 0.09360172537742631,
"kl_loss": 0.11387504637241364,
"loss_ib": 0.010109792463481426,
"step": 651
},
{
"ce_ib": 9.621241569519043,
"ce_orig": 1.163621187210083,
"epoch": 0.09374550682961898,
"kl_loss": 0.15881387889385223,
"loss_ib": 0.012751313857734203,
"step": 652
},
{
"ce_ib": 8.543654441833496,
"ce_orig": 1.220476508140564,
"epoch": 0.09388928828181164,
"kl_loss": 0.13807114958763123,
"loss_ib": 0.011175385676324368,
"step": 653
},
{
"ce_ib": 7.747812271118164,
"ce_orig": 0.603781521320343,
"epoch": 0.09403306973400431,
"kl_loss": 0.14266037940979004,
"loss_ib": 0.011006924323737621,
"step": 654
},
{
"epoch": 0.09417685118619698,
"grad_norm": 0.10387395322322845,
"learning_rate": 4.99998912727147e-05,
"loss": 0.8407,
"step": 655
},
{
"ce_ib": 8.64326000213623,
"ce_orig": 0.6705234050750732,
"epoch": 0.09417685118619698,
"kl_loss": 0.08966037631034851,
"loss_ib": 0.008804649114608765,
"step": 655
},
{
"ce_ib": 7.918277263641357,
"ce_orig": 0.4241083264350891,
"epoch": 0.09432063263838965,
"kl_loss": 0.10727507621049881,
"loss_ib": 0.00932289194315672,
"step": 656
},
{
"ce_ib": 9.530045509338379,
"ce_orig": 1.0509976148605347,
"epoch": 0.09446441409058232,
"kl_loss": 0.11631051450967789,
"loss_ib": 0.010580549016594887,
"step": 657
},
{
"ce_ib": 9.818882942199707,
"ce_orig": 1.0639723539352417,
"epoch": 0.09460819554277498,
"kl_loss": 0.09178834408521652,
"loss_ib": 0.009498858824372292,
"step": 658
},
{
"ce_ib": 10.010890007019043,
"ce_orig": 0.93562251329422,
"epoch": 0.09475197699496765,
"kl_loss": 0.11310561001300812,
"loss_ib": 0.010660725645720959,
"step": 659
},
{
"epoch": 0.09489575844716032,
"grad_norm": 0.11378470063209534,
"learning_rate": 4.9999826518312206e-05,
"loss": 1.0042,
"step": 660
},
{
"ce_ib": 6.337740898132324,
"ce_orig": 0.6393367052078247,
"epoch": 0.09489575844716032,
"kl_loss": 0.20546941459178925,
"loss_ib": 0.013442340306937695,
"step": 660
},
{
"ce_ib": 9.358152389526367,
"ce_orig": 0.6469663977622986,
"epoch": 0.09503953989935299,
"kl_loss": 0.16981008648872375,
"loss_ib": 0.013169581070542336,
"step": 661
},
{
"ce_ib": 12.261252403259277,
"ce_orig": 1.3894939422607422,
"epoch": 0.09518332135154565,
"kl_loss": 0.12019184231758118,
"loss_ib": 0.012140218168497086,
"step": 662
},
{
"ce_ib": 7.736466407775879,
"ce_orig": 0.7755306363105774,
"epoch": 0.09532710280373832,
"kl_loss": 0.12816421687602997,
"loss_ib": 0.010276444256305695,
"step": 663
},
{
"ce_ib": 7.187182426452637,
"ce_orig": 0.5523344874382019,
"epoch": 0.09547088425593099,
"kl_loss": 0.11785402148962021,
"loss_ib": 0.009486292488873005,
"step": 664
},
{
"epoch": 0.09561466570812366,
"grad_norm": 0.1056251972913742,
"learning_rate": 4.999974670482325e-05,
"loss": 0.9002,
"step": 665
},
{
"ce_ib": 8.833344459533691,
"ce_orig": 0.6083950400352478,
"epoch": 0.09561466570812366,
"kl_loss": 0.13750138878822327,
"loss_ib": 0.011291741393506527,
"step": 665
},
{
"ce_ib": 6.192565441131592,
"ce_orig": 0.44607073068618774,
"epoch": 0.09575844716031631,
"kl_loss": 0.17101138830184937,
"loss_ib": 0.011646851897239685,
"step": 666
},
{
"ce_ib": 8.367379188537598,
"ce_orig": 0.6870203018188477,
"epoch": 0.09590222861250898,
"kl_loss": 0.13014401495456696,
"loss_ib": 0.01069089025259018,
"step": 667
},
{
"ce_ib": 9.423290252685547,
"ce_orig": 0.8573846817016602,
"epoch": 0.09604601006470165,
"kl_loss": 0.09922461211681366,
"loss_ib": 0.009672875516116619,
"step": 668
},
{
"ce_ib": 7.007211208343506,
"ce_orig": 0.5984476208686829,
"epoch": 0.09618979151689432,
"kl_loss": 0.07789325714111328,
"loss_ib": 0.007398268673568964,
"step": 669
},
{
"epoch": 0.09633357296908698,
"grad_norm": 0.12372449785470963,
"learning_rate": 4.999965183229593e-05,
"loss": 0.7418,
"step": 670
},
{
"ce_ib": 9.530304908752441,
"ce_orig": 1.1927051544189453,
"epoch": 0.09633357296908698,
"kl_loss": 0.1058434247970581,
"loss_ib": 0.01005732361227274,
"step": 670
},
{
"ce_ib": 7.50916862487793,
"ce_orig": 0.706110954284668,
"epoch": 0.09647735442127965,
"kl_loss": 0.0839371532201767,
"loss_ib": 0.00795144122093916,
"step": 671
},
{
"ce_ib": 9.66496467590332,
"ce_orig": 1.1682391166687012,
"epoch": 0.09662113587347232,
"kl_loss": 0.26062270998954773,
"loss_ib": 0.017863618209958076,
"step": 672
},
{
"ce_ib": 13.103985786437988,
"ce_orig": 1.5685844421386719,
"epoch": 0.09676491732566499,
"kl_loss": 0.14245596528053284,
"loss_ib": 0.013674790970981121,
"step": 673
},
{
"ce_ib": 8.4186429977417,
"ce_orig": 0.8082234859466553,
"epoch": 0.09690869877785765,
"kl_loss": 0.15234991908073425,
"loss_ib": 0.011826817877590656,
"step": 674
},
{
"epoch": 0.09705248023005032,
"grad_norm": 0.11889223009347916,
"learning_rate": 4.9999541900787386e-05,
"loss": 0.9837,
"step": 675
},
{
"ce_ib": 10.21523666381836,
"ce_orig": 0.7224376797676086,
"epoch": 0.09705248023005032,
"kl_loss": 0.11821460723876953,
"loss_ib": 0.01101834885776043,
"step": 675
},
{
"ce_ib": 11.221392631530762,
"ce_orig": 1.364400863647461,
"epoch": 0.09719626168224299,
"kl_loss": 0.12399769574403763,
"loss_ib": 0.01181058119982481,
"step": 676
},
{
"ce_ib": 8.069270133972168,
"ce_orig": 0.6916780471801758,
"epoch": 0.09734004313443566,
"kl_loss": 0.09095580875873566,
"loss_ib": 0.00858242530375719,
"step": 677
},
{
"ce_ib": 8.115494728088379,
"ce_orig": 0.5971089601516724,
"epoch": 0.09748382458662833,
"kl_loss": 0.16152238845825195,
"loss_ib": 0.012133866548538208,
"step": 678
},
{
"ce_ib": 9.717530250549316,
"ce_orig": 0.9812992215156555,
"epoch": 0.097627606038821,
"kl_loss": 0.1159181147813797,
"loss_ib": 0.010654671117663383,
"step": 679
},
{
"epoch": 0.09777138749101366,
"grad_norm": 0.1076693907380104,
"learning_rate": 4.999941691036383e-05,
"loss": 0.7919,
"step": 680
},
{
"ce_ib": 6.14840030670166,
"ce_orig": 0.4094475209712982,
"epoch": 0.09777138749101366,
"kl_loss": 0.21727138757705688,
"loss_ib": 0.013937770389020443,
"step": 680
},
{
"ce_ib": 5.300361156463623,
"ce_orig": 0.40322771668434143,
"epoch": 0.09791516894320633,
"kl_loss": 0.10324863344430923,
"loss_ib": 0.007812611758708954,
"step": 681
},
{
"ce_ib": 10.288033485412598,
"ce_orig": 0.9181535840034485,
"epoch": 0.098058950395399,
"kl_loss": 0.13683810830116272,
"loss_ib": 0.01198592223227024,
"step": 682
},
{
"ce_ib": 6.534595489501953,
"ce_orig": 0.588758647441864,
"epoch": 0.09820273184759166,
"kl_loss": 0.12447066605091095,
"loss_ib": 0.00949083175510168,
"step": 683
},
{
"ce_ib": 6.564389705657959,
"ce_orig": 0.6715453863143921,
"epoch": 0.09834651329978433,
"kl_loss": 0.09237797558307648,
"loss_ib": 0.007901093922555447,
"step": 684
},
{
"epoch": 0.098490294751977,
"grad_norm": 0.13819435238838196,
"learning_rate": 4.999927686110056e-05,
"loss": 0.8023,
"step": 685
},
{
"ce_ib": 7.38820219039917,
"ce_orig": 1.00199294090271,
"epoch": 0.098490294751977,
"kl_loss": 0.09955717623233795,
"loss_ib": 0.008671960793435574,
"step": 685
},
{
"ce_ib": 8.343016624450684,
"ce_orig": 0.9671911001205444,
"epoch": 0.09863407620416967,
"kl_loss": 0.15917587280273438,
"loss_ib": 0.012130302377045155,
"step": 686
},
{
"ce_ib": 7.283801078796387,
"ce_orig": 0.8381296992301941,
"epoch": 0.09877785765636234,
"kl_loss": 0.08582788705825806,
"loss_ib": 0.007933295331895351,
"step": 687
},
{
"ce_ib": 7.987790107727051,
"ce_orig": 0.6048274636268616,
"epoch": 0.09892163910855499,
"kl_loss": 0.16408729553222656,
"loss_ib": 0.012198260053992271,
"step": 688
},
{
"ce_ib": 13.21379280090332,
"ce_orig": 1.2273222208023071,
"epoch": 0.09906542056074766,
"kl_loss": 0.13596788048744202,
"loss_ib": 0.013405290432274342,
"step": 689
},
{
"epoch": 0.09920920201294033,
"grad_norm": 0.10889366269111633,
"learning_rate": 4.999912175308195e-05,
"loss": 0.8662,
"step": 690
},
{
"ce_ib": 9.05671501159668,
"ce_orig": 1.0359764099121094,
"epoch": 0.09920920201294033,
"kl_loss": 0.09794212877750397,
"loss_ib": 0.00942546408623457,
"step": 690
},
{
"ce_ib": 6.449003219604492,
"ce_orig": 0.5917060971260071,
"epoch": 0.099352983465133,
"kl_loss": 0.1164952963590622,
"loss_ib": 0.009049266576766968,
"step": 691
},
{
"ce_ib": 8.728141784667969,
"ce_orig": 1.2859077453613281,
"epoch": 0.09949676491732566,
"kl_loss": 0.09028451889753342,
"loss_ib": 0.008878297172486782,
"step": 692
},
{
"ce_ib": 11.696812629699707,
"ce_orig": 1.4011276960372925,
"epoch": 0.09964054636951833,
"kl_loss": 0.0865880697965622,
"loss_ib": 0.010177809745073318,
"step": 693
},
{
"ce_ib": 9.487892150878906,
"ce_orig": 0.8238059282302856,
"epoch": 0.099784327821711,
"kl_loss": 0.09209860861301422,
"loss_ib": 0.009348876774311066,
"step": 694
},
{
"epoch": 0.09992810927390366,
"grad_norm": 0.1301174759864807,
"learning_rate": 4.999895158640141e-05,
"loss": 0.9068,
"step": 695
},
{
"ce_ib": 6.869251728057861,
"ce_orig": 0.7711926102638245,
"epoch": 0.09992810927390366,
"kl_loss": 0.17371296882629395,
"loss_ib": 0.012120273895561695,
"step": 695
},
{
"ce_ib": 9.267475128173828,
"ce_orig": 0.5941876173019409,
"epoch": 0.10007189072609633,
"kl_loss": 0.1114778146147728,
"loss_ib": 0.010207627899944782,
"step": 696
},
{
"ce_ib": 7.718648910522461,
"ce_orig": 0.9436888694763184,
"epoch": 0.100215672178289,
"kl_loss": 0.10313811898231506,
"loss_ib": 0.009016230702400208,
"step": 697
},
{
"ce_ib": 7.209829807281494,
"ce_orig": 0.6572908759117126,
"epoch": 0.10035945363048167,
"kl_loss": 0.11239567399024963,
"loss_ib": 0.00922469887882471,
"step": 698
},
{
"ce_ib": 8.946769714355469,
"ce_orig": 0.8208485841751099,
"epoch": 0.10050323508267434,
"kl_loss": 0.12747088074684143,
"loss_ib": 0.010846929624676704,
"step": 699
},
{
"epoch": 0.100647016534867,
"grad_norm": 0.11714436113834381,
"learning_rate": 4.999876636116145e-05,
"loss": 0.8387,
"step": 700
},
{
"ce_ib": 9.900582313537598,
"ce_orig": 0.894629180431366,
"epoch": 0.100647016534867,
"kl_loss": 0.13778752088546753,
"loss_ib": 0.011839667335152626,
"step": 700
},
{
"ce_ib": 8.739860534667969,
"ce_orig": 0.9482055902481079,
"epoch": 0.10079079798705967,
"kl_loss": 0.10394522547721863,
"loss_ib": 0.009567191824316978,
"step": 701
},
{
"ce_ib": 7.199070930480957,
"ce_orig": 0.8619567155838013,
"epoch": 0.10093457943925234,
"kl_loss": 0.13609068095684052,
"loss_ib": 0.010404069907963276,
"step": 702
},
{
"ce_ib": 7.6442036628723145,
"ce_orig": 0.8066157102584839,
"epoch": 0.10107836089144501,
"kl_loss": 0.09935668110847473,
"loss_ib": 0.00878993608057499,
"step": 703
},
{
"ce_ib": 7.810359001159668,
"ce_orig": 0.8702456951141357,
"epoch": 0.10122214234363767,
"kl_loss": 0.0937383696436882,
"loss_ib": 0.008592098020017147,
"step": 704
},
{
"epoch": 0.10136592379583034,
"grad_norm": 0.11369116604328156,
"learning_rate": 4.9998566077473645e-05,
"loss": 0.84,
"step": 705
},
{
"ce_ib": 8.6611909866333,
"ce_orig": 0.8015434145927429,
"epoch": 0.10136592379583034,
"kl_loss": 0.1313619613647461,
"loss_ib": 0.010898693464696407,
"step": 705
},
{
"ce_ib": 8.346274375915527,
"ce_orig": 0.8527460694313049,
"epoch": 0.10150970524802301,
"kl_loss": 0.09497249126434326,
"loss_ib": 0.008921761997044086,
"step": 706
},
{
"ce_ib": 11.167051315307617,
"ce_orig": 1.280219554901123,
"epoch": 0.10165348670021568,
"kl_loss": 0.13609477877616882,
"loss_ib": 0.012388264760375023,
"step": 707
},
{
"ce_ib": 9.243364334106445,
"ce_orig": 1.1910686492919922,
"epoch": 0.10179726815240835,
"kl_loss": 0.10804169625043869,
"loss_ib": 0.010023767128586769,
"step": 708
},
{
"ce_ib": 6.902951240539551,
"ce_orig": 0.6421184539794922,
"epoch": 0.101941049604601,
"kl_loss": 0.10724660754203796,
"loss_ib": 0.008813805878162384,
"step": 709
},
{
"epoch": 0.10208483105679367,
"grad_norm": 0.10316238552331924,
"learning_rate": 4.9998350735458646e-05,
"loss": 0.8424,
"step": 710
},
{
"ce_ib": 8.496487617492676,
"ce_orig": 0.8121188879013062,
"epoch": 0.10208483105679367,
"kl_loss": 0.1422605663537979,
"loss_ib": 0.01136127207428217,
"step": 710
},
{
"ce_ib": 7.606898784637451,
"ce_orig": 0.4156748056411743,
"epoch": 0.10222861250898634,
"kl_loss": 0.12221111357212067,
"loss_ib": 0.009914005175232887,
"step": 711
},
{
"ce_ib": 8.244635581970215,
"ce_orig": 0.9483067393302917,
"epoch": 0.102372393961179,
"kl_loss": 0.1223490834236145,
"loss_ib": 0.010239771567285061,
"step": 712
},
{
"ce_ib": 8.075944900512695,
"ce_orig": 0.9652693867683411,
"epoch": 0.10251617541337167,
"kl_loss": 0.11330495774745941,
"loss_ib": 0.009703220799565315,
"step": 713
},
{
"ce_ib": 7.7777581214904785,
"ce_orig": 0.6340931057929993,
"epoch": 0.10265995686556434,
"kl_loss": 0.11384841799736023,
"loss_ib": 0.009581300429999828,
"step": 714
},
{
"epoch": 0.102803738317757,
"grad_norm": 0.11466086655855179,
"learning_rate": 4.999812033524616e-05,
"loss": 0.8816,
"step": 715
},
{
"ce_ib": 7.194620609283447,
"ce_orig": 0.5225186944007874,
"epoch": 0.102803738317757,
"kl_loss": 0.16458478569984436,
"loss_ib": 0.011826549656689167,
"step": 715
},
{
"ce_ib": 7.576101303100586,
"ce_orig": 0.7184677124023438,
"epoch": 0.10294751976994967,
"kl_loss": 0.17702654004096985,
"loss_ib": 0.012639378197491169,
"step": 716
},
{
"ce_ib": 10.157630920410156,
"ce_orig": 0.9195736646652222,
"epoch": 0.10309130122214234,
"kl_loss": 0.11237962543964386,
"loss_ib": 0.010697796940803528,
"step": 717
},
{
"ce_ib": 6.4610395431518555,
"ce_orig": 0.2969205975532532,
"epoch": 0.10323508267433501,
"kl_loss": 0.3771587312221527,
"loss_ib": 0.02208845689892769,
"step": 718
},
{
"ce_ib": 10.73964786529541,
"ce_orig": 1.206357479095459,
"epoch": 0.10337886412652768,
"kl_loss": 0.11494225263595581,
"loss_ib": 0.011116936802864075,
"step": 719
},
{
"epoch": 0.10352264557872035,
"grad_norm": 0.13988007605075836,
"learning_rate": 4.9997874876974966e-05,
"loss": 0.9343,
"step": 720
},
{
"ce_ib": 8.02586841583252,
"ce_orig": 0.5530202984809875,
"epoch": 0.10352264557872035,
"kl_loss": 0.15824423730373383,
"loss_ib": 0.011925145983695984,
"step": 720
},
{
"ce_ib": 7.987276077270508,
"ce_orig": 0.8079979419708252,
"epoch": 0.10366642703091301,
"kl_loss": 0.09849292039871216,
"loss_ib": 0.008918284438550472,
"step": 721
},
{
"ce_ib": 7.2635416984558105,
"ce_orig": 1.058225154876709,
"epoch": 0.10381020848310568,
"kl_loss": 0.07580313086509705,
"loss_ib": 0.007421927060931921,
"step": 722
},
{
"ce_ib": 11.01719856262207,
"ce_orig": 1.5954816341400146,
"epoch": 0.10395398993529835,
"kl_loss": 0.1211317703127861,
"loss_ib": 0.011565187945961952,
"step": 723
},
{
"ce_ib": 7.661947727203369,
"ce_orig": 0.8771336078643799,
"epoch": 0.10409777138749102,
"kl_loss": 0.17039231956005096,
"loss_ib": 0.012350589968264103,
"step": 724
},
{
"epoch": 0.10424155283968368,
"grad_norm": 0.11809071898460388,
"learning_rate": 4.9997614360792934e-05,
"loss": 0.9575,
"step": 725
},
{
"ce_ib": 9.20604133605957,
"ce_orig": 0.8485362529754639,
"epoch": 0.10424155283968368,
"kl_loss": 0.10916964709758759,
"loss_ib": 0.010061502456665039,
"step": 725
},
{
"ce_ib": 7.615476608276367,
"ce_orig": 0.9565079808235168,
"epoch": 0.10438533429187635,
"kl_loss": 0.09149608016014099,
"loss_ib": 0.008382542990148067,
"step": 726
},
{
"ce_ib": 8.909895896911621,
"ce_orig": 1.0168468952178955,
"epoch": 0.10452911574406902,
"kl_loss": 0.10178525745868683,
"loss_ib": 0.009544211439788342,
"step": 727
},
{
"ce_ib": 9.962603569030762,
"ce_orig": 1.013784646987915,
"epoch": 0.10467289719626169,
"kl_loss": 0.0983659103512764,
"loss_ib": 0.009899596683681011,
"step": 728
},
{
"ce_ib": 7.675144195556641,
"ce_orig": 0.7497639060020447,
"epoch": 0.10481667864845436,
"kl_loss": 0.1465551257133484,
"loss_ib": 0.011165328323841095,
"step": 729
},
{
"epoch": 0.10496046010064701,
"grad_norm": 0.10833033919334412,
"learning_rate": 4.999733878685698e-05,
"loss": 0.8967,
"step": 730
},
{
"ce_ib": 9.022236824035645,
"ce_orig": 1.01668381690979,
"epoch": 0.10496046010064701,
"kl_loss": 0.12041808664798737,
"loss_ib": 0.010532023385167122,
"step": 730
},
{
"ce_ib": 8.386323928833008,
"ce_orig": 0.8513427376747131,
"epoch": 0.10510424155283968,
"kl_loss": 0.12366791814565659,
"loss_ib": 0.01037655770778656,
"step": 731
},
{
"ce_ib": 11.588960647583008,
"ce_orig": 1.5426373481750488,
"epoch": 0.10524802300503235,
"kl_loss": 0.13248568773269653,
"loss_ib": 0.012418764643371105,
"step": 732
},
{
"ce_ib": 8.538729667663574,
"ce_orig": 0.8282467126846313,
"epoch": 0.10539180445722501,
"kl_loss": 0.121379554271698,
"loss_ib": 0.010338342748582363,
"step": 733
},
{
"ce_ib": 8.161299705505371,
"ce_orig": 0.8536421060562134,
"epoch": 0.10553558590941768,
"kl_loss": 0.10607169568538666,
"loss_ib": 0.009384234435856342,
"step": 734
},
{
"epoch": 0.10567936736161035,
"grad_norm": 0.12110400199890137,
"learning_rate": 4.999704815533312e-05,
"loss": 0.8765,
"step": 735
},
{
"ce_ib": 8.390252113342285,
"ce_orig": 1.1886399984359741,
"epoch": 0.10567936736161035,
"kl_loss": 0.12097896635532379,
"loss_ib": 0.010244074277579784,
"step": 735
},
{
"ce_ib": 9.484691619873047,
"ce_orig": 1.0858343839645386,
"epoch": 0.10582314881380302,
"kl_loss": 0.11114107072353363,
"loss_ib": 0.010299399495124817,
"step": 736
},
{
"ce_ib": 7.962080478668213,
"ce_orig": 0.9100456237792969,
"epoch": 0.10596693026599568,
"kl_loss": 0.10763818770647049,
"loss_ib": 0.00936294998973608,
"step": 737
},
{
"ce_ib": 9.301447868347168,
"ce_orig": 1.0797451734542847,
"epoch": 0.10611071171818835,
"kl_loss": 0.08851733803749084,
"loss_ib": 0.009076590649783611,
"step": 738
},
{
"ce_ib": 5.509831428527832,
"ce_orig": 0.5435864925384521,
"epoch": 0.10625449317038102,
"kl_loss": 0.11892074346542358,
"loss_ib": 0.008700952865183353,
"step": 739
},
{
"epoch": 0.10639827462257369,
"grad_norm": 0.12525738775730133,
"learning_rate": 4.9996742466396395e-05,
"loss": 0.9647,
"step": 740
},
{
"ce_ib": 7.956106662750244,
"ce_orig": 0.5784890055656433,
"epoch": 0.10639827462257369,
"kl_loss": 0.0811728686094284,
"loss_ib": 0.008036697283387184,
"step": 740
},
{
"ce_ib": 4.005490779876709,
"ce_orig": 0.20808197557926178,
"epoch": 0.10654205607476636,
"kl_loss": 0.19454577565193176,
"loss_ib": 0.011730033904314041,
"step": 741
},
{
"ce_ib": 5.741650581359863,
"ce_orig": 0.5170265436172485,
"epoch": 0.10668583752695902,
"kl_loss": 0.15967172384262085,
"loss_ib": 0.01085441093891859,
"step": 742
},
{
"ce_ib": 4.918848514556885,
"ce_orig": 0.5805583596229553,
"epoch": 0.10682961897915169,
"kl_loss": 0.10687030851840973,
"loss_ib": 0.007802939508110285,
"step": 743
},
{
"ce_ib": 10.824296951293945,
"ce_orig": 1.5680608749389648,
"epoch": 0.10697340043134436,
"kl_loss": 0.0964423194527626,
"loss_ib": 0.01023426465690136,
"step": 744
},
{
"epoch": 0.10711718188353703,
"grad_norm": 0.12692318856716156,
"learning_rate": 4.9996421720230955e-05,
"loss": 0.8577,
"step": 745
},
{
"ce_ib": 11.27613639831543,
"ce_orig": 1.3031110763549805,
"epoch": 0.10711718188353703,
"kl_loss": 0.09824083745479584,
"loss_ib": 0.010550110600888729,
"step": 745
},
{
"ce_ib": 9.67626953125,
"ce_orig": 0.8898366689682007,
"epoch": 0.1072609633357297,
"kl_loss": 0.07953157275915146,
"loss_ib": 0.008814713917672634,
"step": 746
},
{
"ce_ib": 6.659996509552002,
"ce_orig": 0.6965008974075317,
"epoch": 0.10740474478792236,
"kl_loss": 0.15939068794250488,
"loss_ib": 0.01129953283816576,
"step": 747
},
{
"ce_ib": 9.864056587219238,
"ce_orig": 0.9512748122215271,
"epoch": 0.10754852624011503,
"kl_loss": 0.13909170031547546,
"loss_ib": 0.011886613443493843,
"step": 748
},
{
"ce_ib": 7.443923473358154,
"ce_orig": 0.8524958491325378,
"epoch": 0.1076923076923077,
"kl_loss": 0.11339938640594482,
"loss_ib": 0.009391930885612965,
"step": 749
},
{
"epoch": 0.10783608914450037,
"grad_norm": 0.12534423172473907,
"learning_rate": 4.999608591703001e-05,
"loss": 0.8728,
"step": 750
},
{
"ce_ib": 4.702718257904053,
"ce_orig": 0.37047120928764343,
"epoch": 0.10783608914450037,
"kl_loss": 0.14064157009124756,
"loss_ib": 0.009383438155055046,
"step": 750
},
{
"ce_ib": 6.858421802520752,
"ce_orig": 1.1404428482055664,
"epoch": 0.10797987059669302,
"kl_loss": 0.08100490272045135,
"loss_ib": 0.007479456253349781,
"step": 751
},
{
"ce_ib": 8.411442756652832,
"ce_orig": 0.8211296200752258,
"epoch": 0.10812365204888569,
"kl_loss": 0.11985696852207184,
"loss_ib": 0.010198569856584072,
"step": 752
},
{
"ce_ib": 6.272851943969727,
"ce_orig": 0.7544890642166138,
"epoch": 0.10826743350107836,
"kl_loss": 0.08774818480014801,
"loss_ib": 0.007523834705352783,
"step": 753
},
{
"ce_ib": 8.928715705871582,
"ce_orig": 0.9599756002426147,
"epoch": 0.10841121495327102,
"kl_loss": 0.11729443073272705,
"loss_ib": 0.010329079814255238,
"step": 754
},
{
"epoch": 0.10855499640546369,
"grad_norm": 0.11862904578447342,
"learning_rate": 4.9995735056995826e-05,
"loss": 0.8637,
"step": 755
},
{
"ce_ib": 10.275476455688477,
"ce_orig": 1.2619425058364868,
"epoch": 0.10855499640546369,
"kl_loss": 0.13122868537902832,
"loss_ib": 0.011699172668159008,
"step": 755
},
{
"ce_ib": 8.515475273132324,
"ce_orig": 0.8744072318077087,
"epoch": 0.10869877785765636,
"kl_loss": 0.14314191043376923,
"loss_ib": 0.01141483336687088,
"step": 756
},
{
"ce_ib": 8.099058151245117,
"ce_orig": 1.0449519157409668,
"epoch": 0.10884255930984903,
"kl_loss": 0.08819152414798737,
"loss_ib": 0.008459105156362057,
"step": 757
},
{
"ce_ib": 7.1664228439331055,
"ce_orig": 0.856492280960083,
"epoch": 0.1089863407620417,
"kl_loss": 0.07748554646968842,
"loss_ib": 0.0074574886821210384,
"step": 758
},
{
"ce_ib": 8.10939884185791,
"ce_orig": 0.7625494003295898,
"epoch": 0.10913012221423436,
"kl_loss": 0.12002036720514297,
"loss_ib": 0.010055718012154102,
"step": 759
},
{
"epoch": 0.10927390366642703,
"grad_norm": 0.12130390107631683,
"learning_rate": 4.999536914033977e-05,
"loss": 0.864,
"step": 760
},
{
"ce_ib": 6.7783203125,
"ce_orig": 0.6925073266029358,
"epoch": 0.10927390366642703,
"kl_loss": 0.14075952768325806,
"loss_ib": 0.010427136905491352,
"step": 760
},
{
"ce_ib": 6.035461902618408,
"ce_orig": 0.5443659424781799,
"epoch": 0.1094176851186197,
"kl_loss": 0.11947020888328552,
"loss_ib": 0.008991241455078125,
"step": 761
},
{
"ce_ib": 10.941012382507324,
"ce_orig": 1.4812273979187012,
"epoch": 0.10956146657081237,
"kl_loss": 0.09991130232810974,
"loss_ib": 0.010466071777045727,
"step": 762
},
{
"ce_ib": 10.077688217163086,
"ce_orig": 1.1687238216400146,
"epoch": 0.10970524802300503,
"kl_loss": 0.10050664842128754,
"loss_ib": 0.01006417628377676,
"step": 763
},
{
"ce_ib": 7.7856125831604,
"ce_orig": 0.8300952911376953,
"epoch": 0.1098490294751977,
"kl_loss": 0.11259394139051437,
"loss_ib": 0.009522504173219204,
"step": 764
},
{
"epoch": 0.10999281092739037,
"grad_norm": 0.09898856282234192,
"learning_rate": 4.999498816728223e-05,
"loss": 0.8381,
"step": 765
},
{
"ce_ib": 6.172881126403809,
"ce_orig": 0.695868968963623,
"epoch": 0.10999281092739037,
"kl_loss": 0.10089915245771408,
"loss_ib": 0.008131398819386959,
"step": 765
},
{
"ce_ib": 7.305203437805176,
"ce_orig": 0.7463586330413818,
"epoch": 0.11013659237958304,
"kl_loss": 0.09400838613510132,
"loss_ib": 0.008353020995855331,
"step": 766
},
{
"ce_ib": 4.747533798217773,
"ce_orig": 0.5122455954551697,
"epoch": 0.1102803738317757,
"kl_loss": 0.074123814702034,
"loss_ib": 0.0060799578204751015,
"step": 767
},
{
"ce_ib": 6.877668857574463,
"ce_orig": 0.5530845522880554,
"epoch": 0.11042415528396837,
"kl_loss": 0.11388804018497467,
"loss_ib": 0.009133236482739449,
"step": 768
},
{
"ce_ib": 11.079581260681152,
"ce_orig": 1.1505775451660156,
"epoch": 0.11056793673616104,
"kl_loss": 0.2215118706226349,
"loss_ib": 0.016615385189652443,
"step": 769
},
{
"epoch": 0.11071171818835371,
"grad_norm": 0.13919848203659058,
"learning_rate": 4.999459213805272e-05,
"loss": 0.8708,
"step": 770
},
{
"ce_ib": 8.570969581604004,
"ce_orig": 1.2589601278305054,
"epoch": 0.11071171818835371,
"kl_loss": 0.08060797303915024,
"loss_ib": 0.008315883576869965,
"step": 770
},
{
"ce_ib": 7.168095588684082,
"ce_orig": 0.8928415775299072,
"epoch": 0.11085549964054638,
"kl_loss": 0.10986876487731934,
"loss_ib": 0.009077486582100391,
"step": 771
},
{
"ce_ib": 8.855610847473145,
"ce_orig": 0.733345091342926,
"epoch": 0.11099928109273903,
"kl_loss": 0.12182736396789551,
"loss_ib": 0.010519173927605152,
"step": 772
},
{
"ce_ib": 3.5407509803771973,
"ce_orig": 0.3610730469226837,
"epoch": 0.1111430625449317,
"kl_loss": 0.14145609736442566,
"loss_ib": 0.00884318072348833,
"step": 773
},
{
"ce_ib": 8.146163940429688,
"ce_orig": 0.6188514828681946,
"epoch": 0.11128684399712437,
"kl_loss": 0.10034702718257904,
"loss_ib": 0.009090433828532696,
"step": 774
},
{
"epoch": 0.11143062544931703,
"grad_norm": 0.11237223446369171,
"learning_rate": 4.999418105288978e-05,
"loss": 0.8281,
"step": 775
},
{
"ce_ib": 6.921514987945557,
"ce_orig": 0.7439333200454712,
"epoch": 0.11143062544931703,
"kl_loss": 0.08917941153049469,
"loss_ib": 0.007919727824628353,
"step": 775
},
{
"ce_ib": 6.831997871398926,
"ce_orig": 0.7869361042976379,
"epoch": 0.1115744069015097,
"kl_loss": 0.10595273226499557,
"loss_ib": 0.008713635616004467,
"step": 776
},
{
"ce_ib": 7.639923095703125,
"ce_orig": 0.9338688254356384,
"epoch": 0.11171818835370237,
"kl_loss": 0.10657122731208801,
"loss_ib": 0.009148523211479187,
"step": 777
},
{
"ce_ib": 6.877979755401611,
"ce_orig": 0.7685055136680603,
"epoch": 0.11186196980589504,
"kl_loss": 0.09290573745965958,
"loss_ib": 0.00808427669107914,
"step": 778
},
{
"ce_ib": 10.934708595275879,
"ce_orig": 1.0183820724487305,
"epoch": 0.1120057512580877,
"kl_loss": 0.13018369674682617,
"loss_ib": 0.01197653915733099,
"step": 779
},
{
"epoch": 0.11214953271028037,
"grad_norm": 0.1425534039735794,
"learning_rate": 4.999375491204102e-05,
"loss": 0.9258,
"step": 780
},
{
"ce_ib": 8.499329566955566,
"ce_orig": 0.6566687822341919,
"epoch": 0.11214953271028037,
"kl_loss": 0.09959501028060913,
"loss_ib": 0.009229416027665138,
"step": 780
},
{
"ce_ib": 8.128525733947754,
"ce_orig": 0.7684027552604675,
"epoch": 0.11229331416247304,
"kl_loss": 0.0928315594792366,
"loss_ib": 0.00870584137737751,
"step": 781
},
{
"ce_ib": 6.759765148162842,
"ce_orig": 0.7490063309669495,
"epoch": 0.11243709561466571,
"kl_loss": 0.05619842931628227,
"loss_ib": 0.006189804058521986,
"step": 782
},
{
"ce_ib": 7.713232517242432,
"ce_orig": 0.9557498693466187,
"epoch": 0.11258087706685838,
"kl_loss": 0.10368062555789948,
"loss_ib": 0.009040648117661476,
"step": 783
},
{
"ce_ib": 7.60745906829834,
"ce_orig": 0.7716460227966309,
"epoch": 0.11272465851905104,
"kl_loss": 0.08124081790447235,
"loss_ib": 0.00786577071994543,
"step": 784
},
{
"epoch": 0.11286843997124371,
"grad_norm": 0.13931146264076233,
"learning_rate": 4.9993313715763166e-05,
"loss": 0.8111,
"step": 785
},
{
"ce_ib": 9.313846588134766,
"ce_orig": 1.1762357950210571,
"epoch": 0.11286843997124371,
"kl_loss": 0.12043385207653046,
"loss_ib": 0.010678616352379322,
"step": 785
},
{
"ce_ib": 9.520804405212402,
"ce_orig": 1.0862674713134766,
"epoch": 0.11301222142343638,
"kl_loss": 0.13194264471530914,
"loss_ib": 0.011357533745467663,
"step": 786
},
{
"ce_ib": 5.819194793701172,
"ce_orig": 0.5117652416229248,
"epoch": 0.11315600287562905,
"kl_loss": 0.08215292543172836,
"loss_ib": 0.007017243653535843,
"step": 787
},
{
"ce_ib": 8.07127857208252,
"ce_orig": 1.1504753828048706,
"epoch": 0.11329978432782171,
"kl_loss": 0.09762965887784958,
"loss_ib": 0.008917122147977352,
"step": 788
},
{
"ce_ib": 7.960126876831055,
"ce_orig": 0.7059429883956909,
"epoch": 0.11344356578001438,
"kl_loss": 0.14231815934181213,
"loss_ib": 0.011095970869064331,
"step": 789
},
{
"epoch": 0.11358734723220705,
"grad_norm": 0.12224866449832916,
"learning_rate": 4.9992857464321963e-05,
"loss": 0.862,
"step": 790
},
{
"ce_ib": 7.631105899810791,
"ce_orig": 0.8110905289649963,
"epoch": 0.11358734723220705,
"kl_loss": 0.08087232708930969,
"loss_ib": 0.007859169505536556,
"step": 790
},
{
"ce_ib": 7.410162448883057,
"ce_orig": 0.6299762725830078,
"epoch": 0.11373112868439972,
"kl_loss": 0.12838461995124817,
"loss_ib": 0.010124312713742256,
"step": 791
},
{
"ce_ib": 8.221912384033203,
"ce_orig": 0.9979586005210876,
"epoch": 0.11387491013659239,
"kl_loss": 0.1258949488401413,
"loss_ib": 0.010405703447759151,
"step": 792
},
{
"ce_ib": 6.832244873046875,
"ce_orig": 0.7152706384658813,
"epoch": 0.11401869158878504,
"kl_loss": 0.08785121142864227,
"loss_ib": 0.0078086829744279385,
"step": 793
},
{
"ce_ib": 7.743945598602295,
"ce_orig": 0.9861294627189636,
"epoch": 0.11416247304097771,
"kl_loss": 0.09959419816732407,
"loss_ib": 0.00885168369859457,
"step": 794
},
{
"epoch": 0.11430625449317038,
"grad_norm": 0.12471532076597214,
"learning_rate": 4.9992386157992246e-05,
"loss": 0.8318,
"step": 795
},
{
"ce_ib": 7.021468162536621,
"ce_orig": 0.7713713049888611,
"epoch": 0.11430625449317038,
"kl_loss": 0.07618668675422668,
"loss_ib": 0.007320068776607513,
"step": 795
},
{
"ce_ib": 7.368815898895264,
"ce_orig": 1.1165920495986938,
"epoch": 0.11445003594536304,
"kl_loss": 0.07658970355987549,
"loss_ib": 0.007513892836868763,
"step": 796
},
{
"ce_ib": 6.703334808349609,
"ce_orig": 0.5481064915657043,
"epoch": 0.11459381739755571,
"kl_loss": 0.09191010147333145,
"loss_ib": 0.007947172038257122,
"step": 797
},
{
"ce_ib": 9.726116180419922,
"ce_orig": 1.2227307558059692,
"epoch": 0.11473759884974838,
"kl_loss": 0.09769535809755325,
"loss_ib": 0.009747825562953949,
"step": 798
},
{
"ce_ib": 8.778748512268066,
"ce_orig": 0.9550623893737793,
"epoch": 0.11488138030194105,
"kl_loss": 0.08937544375658035,
"loss_ib": 0.008858147077262402,
"step": 799
},
{
"epoch": 0.11502516175413371,
"grad_norm": 0.12755703926086426,
"learning_rate": 4.9991899797057904e-05,
"loss": 0.9259,
"step": 800
},
{
"ce_ib": 10.388070106506348,
"ce_orig": 1.069626808166504,
"epoch": 0.11502516175413371,
"kl_loss": 0.12364666163921356,
"loss_ib": 0.011376367881894112,
"step": 800
},
{
"ce_ib": 6.369958400726318,
"ce_orig": 0.8895975947380066,
"epoch": 0.11516894320632638,
"kl_loss": 0.06259946525096893,
"loss_ib": 0.0063149528577923775,
"step": 801
},
{
"ce_ib": 9.586531639099121,
"ce_orig": 1.2012348175048828,
"epoch": 0.11531272465851905,
"kl_loss": 0.09361310303211212,
"loss_ib": 0.009473921731114388,
"step": 802
},
{
"ce_ib": 7.590814113616943,
"ce_orig": 1.0715209245681763,
"epoch": 0.11545650611071172,
"kl_loss": 0.10730178654193878,
"loss_ib": 0.009160496294498444,
"step": 803
},
{
"ce_ib": 8.784406661987305,
"ce_orig": 1.1157152652740479,
"epoch": 0.11560028756290439,
"kl_loss": 0.10003470629453659,
"loss_ib": 0.009393938817083836,
"step": 804
},
{
"epoch": 0.11574406901509705,
"grad_norm": 0.13564252853393555,
"learning_rate": 4.9991398381811924e-05,
"loss": 0.892,
"step": 805
},
{
"ce_ib": 6.886279106140137,
"ce_orig": 0.6827164888381958,
"epoch": 0.11574406901509705,
"kl_loss": 0.08341995626688004,
"loss_ib": 0.007614137139171362,
"step": 805
},
{
"ce_ib": 7.802131175994873,
"ce_orig": 1.107555627822876,
"epoch": 0.11588785046728972,
"kl_loss": 0.08331786096096039,
"loss_ib": 0.008066958747804165,
"step": 806
},
{
"ce_ib": 7.465454578399658,
"ce_orig": 0.8235023617744446,
"epoch": 0.11603163191948239,
"kl_loss": 0.0998779833316803,
"loss_ib": 0.008726626634597778,
"step": 807
},
{
"ce_ib": 5.956389427185059,
"ce_orig": 0.83247971534729,
"epoch": 0.11617541337167506,
"kl_loss": 0.07732568681240082,
"loss_ib": 0.006844479124993086,
"step": 808
},
{
"ce_ib": 5.556079864501953,
"ce_orig": 0.6450206637382507,
"epoch": 0.11631919482386772,
"kl_loss": 0.08965234458446503,
"loss_ib": 0.0072606573812663555,
"step": 809
},
{
"epoch": 0.11646297627606039,
"grad_norm": 0.11268489807844162,
"learning_rate": 4.999088191255632e-05,
"loss": 0.8579,
"step": 810
},
{
"ce_ib": 8.282671928405762,
"ce_orig": 0.7875933051109314,
"epoch": 0.11646297627606039,
"kl_loss": 0.11104097217321396,
"loss_ib": 0.009693384170532227,
"step": 810
},
{
"ce_ib": 7.059372901916504,
"ce_orig": 0.8186449408531189,
"epoch": 0.11660675772825306,
"kl_loss": 0.1103602796792984,
"loss_ib": 0.00904770102351904,
"step": 811
},
{
"ce_ib": 8.647476196289062,
"ce_orig": 1.0445913076400757,
"epoch": 0.11675053918044573,
"kl_loss": 0.10142830014228821,
"loss_ib": 0.009395153261721134,
"step": 812
},
{
"ce_ib": 7.812716007232666,
"ce_orig": 0.9563208222389221,
"epoch": 0.1168943206326384,
"kl_loss": 0.07602293789386749,
"loss_ib": 0.0077075050212442875,
"step": 813
},
{
"ce_ib": 7.100307941436768,
"ce_orig": 0.6869685649871826,
"epoch": 0.11703810208483105,
"kl_loss": 0.13484741747379303,
"loss_ib": 0.01029252540320158,
"step": 814
},
{
"epoch": 0.11718188353702372,
"grad_norm": 0.145658478140831,
"learning_rate": 4.9990350389602214e-05,
"loss": 0.8295,
"step": 815
},
{
"ce_ib": 7.6031060218811035,
"ce_orig": 0.7811675667762756,
"epoch": 0.11718188353702372,
"kl_loss": 0.14399005472660065,
"loss_ib": 0.01100105606019497,
"step": 815
},
{
"ce_ib": 7.256862640380859,
"ce_orig": 0.7083752751350403,
"epoch": 0.11732566498921639,
"kl_loss": 0.14461134374141693,
"loss_ib": 0.010858998633921146,
"step": 816
},
{
"ce_ib": 6.885143280029297,
"ce_orig": 0.8050568699836731,
"epoch": 0.11746944644140905,
"kl_loss": 0.11550889909267426,
"loss_ib": 0.009218016639351845,
"step": 817
},
{
"ce_ib": 6.232802391052246,
"ce_orig": 0.5001097321510315,
"epoch": 0.11761322789360172,
"kl_loss": 0.1038711816072464,
"loss_ib": 0.00830996036529541,
"step": 818
},
{
"ce_ib": 5.1833367347717285,
"ce_orig": 0.37858033180236816,
"epoch": 0.11775700934579439,
"kl_loss": 0.11628204584121704,
"loss_ib": 0.008405770175158978,
"step": 819
},
{
"epoch": 0.11790079079798706,
"grad_norm": 0.11669674515724182,
"learning_rate": 4.9989803813269775e-05,
"loss": 0.7666,
"step": 820
},
{
"ce_ib": 7.362185478210449,
"ce_orig": 0.9856128692626953,
"epoch": 0.11790079079798706,
"kl_loss": 0.09990614652633667,
"loss_ib": 0.008676400408148766,
"step": 820
},
{
"ce_ib": 6.022683620452881,
"ce_orig": 0.6719481348991394,
"epoch": 0.11804457225017972,
"kl_loss": 0.09421147406101227,
"loss_ib": 0.0077219158411026,
"step": 821
},
{
"ce_ib": 7.54342794418335,
"ce_orig": 0.9077051877975464,
"epoch": 0.11818835370237239,
"kl_loss": 0.09480626881122589,
"loss_ib": 0.008512027561664581,
"step": 822
},
{
"ce_ib": 7.5844597816467285,
"ce_orig": 0.8468344807624817,
"epoch": 0.11833213515456506,
"kl_loss": 0.09350645542144775,
"loss_ib": 0.008467552252113819,
"step": 823
},
{
"ce_ib": 9.546996116638184,
"ce_orig": 0.9645229578018188,
"epoch": 0.11847591660675773,
"kl_loss": 0.10462982207536697,
"loss_ib": 0.010004988871514797,
"step": 824
},
{
"epoch": 0.1186196980589504,
"grad_norm": 0.1411919742822647,
"learning_rate": 4.998924218388824e-05,
"loss": 0.8377,
"step": 825
},
{
"ce_ib": 6.855606555938721,
"ce_orig": 0.6925601363182068,
"epoch": 0.1186196980589504,
"kl_loss": 0.11507317423820496,
"loss_ib": 0.009181462228298187,
"step": 825
},
{
"ce_ib": 7.184199333190918,
"ce_orig": 0.7112440466880798,
"epoch": 0.11876347951114306,
"kl_loss": 0.11154375225305557,
"loss_ib": 0.009169287048280239,
"step": 826
},
{
"ce_ib": 6.221945762634277,
"ce_orig": 0.7550656199455261,
"epoch": 0.11890726096333573,
"kl_loss": 0.08264736086130142,
"loss_ib": 0.00724334130063653,
"step": 827
},
{
"ce_ib": 6.1294331550598145,
"ce_orig": 0.8655829429626465,
"epoch": 0.1190510424155284,
"kl_loss": 0.2529202103614807,
"loss_ib": 0.015710726380348206,
"step": 828
},
{
"ce_ib": 7.718718528747559,
"ce_orig": 0.9449477195739746,
"epoch": 0.11919482386772107,
"kl_loss": 0.1413031816482544,
"loss_ib": 0.01092451810836792,
"step": 829
},
{
"epoch": 0.11933860531991373,
"grad_norm": 0.13230349123477936,
"learning_rate": 4.998866550179591e-05,
"loss": 0.8497,
"step": 830
},
{
"ce_ib": 8.910257339477539,
"ce_orig": 1.0684385299682617,
"epoch": 0.11933860531991373,
"kl_loss": 0.08060315996408463,
"loss_ib": 0.008485286496579647,
"step": 830
},
{
"ce_ib": 6.161970138549805,
"ce_orig": 0.7749412059783936,
"epoch": 0.1194823867721064,
"kl_loss": 0.09046103060245514,
"loss_ib": 0.00760403648018837,
"step": 831
},
{
"ce_ib": 8.188762664794922,
"ce_orig": 1.1623848676681519,
"epoch": 0.11962616822429907,
"kl_loss": 0.1053881123661995,
"loss_ib": 0.00936378724873066,
"step": 832
},
{
"ce_ib": 8.579145431518555,
"ce_orig": 1.0423914194107056,
"epoch": 0.11976994967649174,
"kl_loss": 0.0925225242972374,
"loss_ib": 0.00891569908708334,
"step": 833
},
{
"ce_ib": 6.8061676025390625,
"ce_orig": 0.7699450850486755,
"epoch": 0.1199137311286844,
"kl_loss": 0.1753084808588028,
"loss_ib": 0.01216850709170103,
"step": 834
},
{
"epoch": 0.12005751258087707,
"grad_norm": 0.17299415171146393,
"learning_rate": 4.9988073767340174e-05,
"loss": 0.8972,
"step": 835
},
{
"ce_ib": 11.318516731262207,
"ce_orig": 1.7190030813217163,
"epoch": 0.12005751258087707,
"kl_loss": 0.0838870257139206,
"loss_ib": 0.009853609837591648,
"step": 835
},
{
"ce_ib": 6.024441242218018,
"ce_orig": 0.5168812274932861,
"epoch": 0.12020129403306973,
"kl_loss": 0.08710253238677979,
"loss_ib": 0.007367347367107868,
"step": 836
},
{
"ce_ib": 9.387094497680664,
"ce_orig": 0.5673519968986511,
"epoch": 0.1203450754852624,
"kl_loss": 0.08531000465154648,
"loss_ib": 0.008959047496318817,
"step": 837
},
{
"ce_ib": 3.3088152408599854,
"ce_orig": 0.40009650588035583,
"epoch": 0.12048885693745506,
"kl_loss": 0.14413371682167053,
"loss_ib": 0.008861093781888485,
"step": 838
},
{
"ce_ib": 5.766994953155518,
"ce_orig": 0.5750284790992737,
"epoch": 0.12063263838964773,
"kl_loss": 0.0825929120182991,
"loss_ib": 0.007013143040239811,
"step": 839
},
{
"epoch": 0.1207764198418404,
"grad_norm": 0.15587636828422546,
"learning_rate": 4.998746698087745e-05,
"loss": 0.9364,
"step": 840
},
{
"ce_ib": 8.854097366333008,
"ce_orig": 1.37924063205719,
"epoch": 0.1207764198418404,
"kl_loss": 0.12135547399520874,
"loss_ib": 0.010494822636246681,
"step": 840
},
{
"ce_ib": 5.431858062744141,
"ce_orig": 0.7637268304824829,
"epoch": 0.12092020129403307,
"kl_loss": 0.07992113381624222,
"loss_ib": 0.006711985915899277,
"step": 841
},
{
"ce_ib": 8.615471839904785,
"ce_orig": 1.1368523836135864,
"epoch": 0.12106398274622573,
"kl_loss": 0.10841356217861176,
"loss_ib": 0.009728414006531239,
"step": 842
},
{
"ce_ib": 6.431497573852539,
"ce_orig": 0.6517418622970581,
"epoch": 0.1212077641984184,
"kl_loss": 0.12224148958921432,
"loss_ib": 0.009327823296189308,
"step": 843
},
{
"ce_ib": 6.216892719268799,
"ce_orig": 0.9238556623458862,
"epoch": 0.12135154565061107,
"kl_loss": 0.07222751528024673,
"loss_ib": 0.006719821598380804,
"step": 844
},
{
"epoch": 0.12149532710280374,
"grad_norm": 0.11728943139314651,
"learning_rate": 4.9986845142773275e-05,
"loss": 0.97,
"step": 845
},
{
"ce_ib": 7.411965847015381,
"ce_orig": 0.9339240193367004,
"epoch": 0.12149532710280374,
"kl_loss": 0.1781131625175476,
"loss_ib": 0.01261164154857397,
"step": 845
},
{
"ce_ib": 6.76970100402832,
"ce_orig": 0.8812845945358276,
"epoch": 0.1216391085549964,
"kl_loss": 0.1000911146402359,
"loss_ib": 0.008389405906200409,
"step": 846
},
{
"ce_ib": 4.417732238769531,
"ce_orig": 0.5549830198287964,
"epoch": 0.12178289000718907,
"kl_loss": 0.06278645992279053,
"loss_ib": 0.005348189268261194,
"step": 847
},
{
"ce_ib": 7.800521373748779,
"ce_orig": 0.6102645993232727,
"epoch": 0.12192667145938174,
"kl_loss": 0.12678933143615723,
"loss_ib": 0.01023972686380148,
"step": 848
},
{
"ce_ib": 9.005143165588379,
"ce_orig": 0.9449028968811035,
"epoch": 0.12207045291157441,
"kl_loss": 0.10566647350788116,
"loss_ib": 0.009785895235836506,
"step": 849
},
{
"epoch": 0.12221423436376708,
"grad_norm": 0.12012098729610443,
"learning_rate": 4.998620825340221e-05,
"loss": 0.7973,
"step": 850
},
{
"ce_ib": 5.912694454193115,
"ce_orig": 0.6100848317146301,
"epoch": 0.12221423436376708,
"kl_loss": 0.12130458652973175,
"loss_ib": 0.0090215764939785,
"step": 850
},
{
"ce_ib": 6.602824687957764,
"ce_orig": 0.6862348318099976,
"epoch": 0.12235801581595974,
"kl_loss": 0.10095177590847015,
"loss_ib": 0.008349001407623291,
"step": 851
},
{
"ce_ib": 8.233159065246582,
"ce_orig": 0.880978524684906,
"epoch": 0.12250179726815241,
"kl_loss": 0.074901282787323,
"loss_ib": 0.00786164402961731,
"step": 852
},
{
"ce_ib": 7.939795017242432,
"ce_orig": 0.9793850779533386,
"epoch": 0.12264557872034508,
"kl_loss": 0.10613537579774857,
"loss_ib": 0.00927666574716568,
"step": 853
},
{
"ce_ib": 6.5190935134887695,
"ce_orig": 0.9610397815704346,
"epoch": 0.12278936017253775,
"kl_loss": 0.16304026544094086,
"loss_ib": 0.011411559768021107,
"step": 854
},
{
"epoch": 0.12293314162473042,
"grad_norm": 0.11487976461648941,
"learning_rate": 4.9985556313147895e-05,
"loss": 0.9548,
"step": 855
},
{
"ce_ib": 7.369608402252197,
"ce_orig": 0.6912807822227478,
"epoch": 0.12293314162473042,
"kl_loss": 0.12601375579833984,
"loss_ib": 0.009985491633415222,
"step": 855
},
{
"ce_ib": 8.943307876586914,
"ce_orig": 1.071799397468567,
"epoch": 0.12307692307692308,
"kl_loss": 0.1637842357158661,
"loss_ib": 0.012660865671932697,
"step": 856
},
{
"ce_ib": 7.475729465484619,
"ce_orig": 0.9372177124023438,
"epoch": 0.12322070452911574,
"kl_loss": 0.08353350311517715,
"loss_ib": 0.007914540357887745,
"step": 857
},
{
"ce_ib": 4.192291736602783,
"ce_orig": 0.4509224593639374,
"epoch": 0.1233644859813084,
"kl_loss": 0.08783835917711258,
"loss_ib": 0.0064880638383328915,
"step": 858
},
{
"ce_ib": 6.574756145477295,
"ce_orig": 0.7229125499725342,
"epoch": 0.12350826743350107,
"kl_loss": 0.12410986423492432,
"loss_ib": 0.009492871351540089,
"step": 859
},
{
"epoch": 0.12365204888569374,
"grad_norm": 0.10187579691410065,
"learning_rate": 4.998488932240305e-05,
"loss": 0.896,
"step": 860
},
{
"ce_ib": 6.708831310272217,
"ce_orig": 0.624555766582489,
"epoch": 0.12365204888569374,
"kl_loss": 0.10983145236968994,
"loss_ib": 0.008845987729728222,
"step": 860
},
{
"ce_ib": 11.562445640563965,
"ce_orig": 1.5727458000183105,
"epoch": 0.12379583033788641,
"kl_loss": 0.11283396929502487,
"loss_ib": 0.011422920972108841,
"step": 861
},
{
"ce_ib": 6.905985355377197,
"ce_orig": 0.5196636319160461,
"epoch": 0.12393961179007908,
"kl_loss": 0.08311201632022858,
"loss_ib": 0.007608593441545963,
"step": 862
},
{
"ce_ib": 6.673101425170898,
"ce_orig": 0.7954714298248291,
"epoch": 0.12408339324227174,
"kl_loss": 0.12156697362661362,
"loss_ib": 0.009414899162948132,
"step": 863
},
{
"ce_ib": 9.510821342468262,
"ce_orig": 1.0977836847305298,
"epoch": 0.12422717469446441,
"kl_loss": 0.11292783170938492,
"loss_ib": 0.010401802137494087,
"step": 864
},
{
"epoch": 0.12437095614665708,
"grad_norm": 0.10984697192907333,
"learning_rate": 4.9984207281569426e-05,
"loss": 0.8947,
"step": 865
},
{
"ce_ib": 6.371394634246826,
"ce_orig": 0.8775607347488403,
"epoch": 0.12437095614665708,
"kl_loss": 0.10108557343482971,
"loss_ib": 0.00823997613042593,
"step": 865
},
{
"ce_ib": 8.300228118896484,
"ce_orig": 0.8505659103393555,
"epoch": 0.12451473759884975,
"kl_loss": 0.13992467522621155,
"loss_ib": 0.011146347038447857,
"step": 866
},
{
"ce_ib": 7.113746166229248,
"ce_orig": 0.6279032826423645,
"epoch": 0.12465851905104242,
"kl_loss": 0.07766470313072205,
"loss_ib": 0.007440108340233564,
"step": 867
},
{
"ce_ib": 9.46139907836914,
"ce_orig": 0.982869029045105,
"epoch": 0.12480230050323508,
"kl_loss": 0.13695910573005676,
"loss_ib": 0.011578655801713467,
"step": 868
},
{
"ce_ib": 6.4365973472595215,
"ce_orig": 0.7115389108657837,
"epoch": 0.12494608195542775,
"kl_loss": 0.08151569217443466,
"loss_ib": 0.00729408347979188,
"step": 869
},
{
"epoch": 0.1250898634076204,
"grad_norm": 0.14856663346290588,
"learning_rate": 4.998351019105789e-05,
"loss": 0.8044,
"step": 870
},
{
"ce_ib": 7.635079383850098,
"ce_orig": 0.9380092620849609,
"epoch": 0.1250898634076204,
"kl_loss": 0.08349855244159698,
"loss_ib": 0.00799246784299612,
"step": 870
},
{
"ce_ib": 6.534551620483398,
"ce_orig": 0.5008100867271423,
"epoch": 0.1252336448598131,
"kl_loss": 0.14466875791549683,
"loss_ib": 0.010500714182853699,
"step": 871
},
{
"ce_ib": 7.542534828186035,
"ce_orig": 1.1307094097137451,
"epoch": 0.12537742631200574,
"kl_loss": 0.10606865584850311,
"loss_ib": 0.009074700064957142,
"step": 872
},
{
"ce_ib": 5.353913307189941,
"ce_orig": 0.5838197469711304,
"epoch": 0.12552120776419842,
"kl_loss": 0.21842791140079498,
"loss_ib": 0.013598352670669556,
"step": 873
},
{
"ce_ib": 6.852460861206055,
"ce_orig": 0.898078978061676,
"epoch": 0.12566498921639108,
"kl_loss": 0.19777730107307434,
"loss_ib": 0.013315095566213131,
"step": 874
},
{
"epoch": 0.12580877066858376,
"grad_norm": 0.16305744647979736,
"learning_rate": 4.9982798051288326e-05,
"loss": 0.8503,
"step": 875
},
{
"ce_ib": 5.873923301696777,
"ce_orig": 0.4752177894115448,
"epoch": 0.12580877066858376,
"kl_loss": 0.06173687055706978,
"loss_ib": 0.006023805122822523,
"step": 875
},
{
"ce_ib": 6.247898101806641,
"ce_orig": 0.8174537420272827,
"epoch": 0.1259525521207764,
"kl_loss": 0.0691542774438858,
"loss_ib": 0.006581662688404322,
"step": 876
},
{
"ce_ib": 3.595930814743042,
"ce_orig": 0.4554266929626465,
"epoch": 0.1260963335729691,
"kl_loss": 0.1204795241355896,
"loss_ib": 0.007821941748261452,
"step": 877
},
{
"ce_ib": 5.832032203674316,
"ce_orig": 0.7023007273674011,
"epoch": 0.12624011502516175,
"kl_loss": 0.12059217691421509,
"loss_ib": 0.008945624344050884,
"step": 878
},
{
"ce_ib": 5.493231773376465,
"ce_orig": 0.74711012840271,
"epoch": 0.12638389647735443,
"kl_loss": 0.13393926620483398,
"loss_ib": 0.009443579241633415,
"step": 879
},
{
"epoch": 0.12652767792954708,
"grad_norm": 0.11151118576526642,
"learning_rate": 4.998207086268971e-05,
"loss": 0.7946,
"step": 880
},
{
"ce_ib": 7.743311882019043,
"ce_orig": 1.0406618118286133,
"epoch": 0.12652767792954708,
"kl_loss": 0.07510361075401306,
"loss_ib": 0.007626836188137531,
"step": 880
},
{
"ce_ib": 6.917181015014648,
"ce_orig": 0.9735674262046814,
"epoch": 0.12667145938173976,
"kl_loss": 0.0983152836561203,
"loss_ib": 0.008374354802072048,
"step": 881
},
{
"ce_ib": 8.613320350646973,
"ce_orig": 1.0136125087738037,
"epoch": 0.12681524083393242,
"kl_loss": 0.12226949632167816,
"loss_ib": 0.010420135222375393,
"step": 882
},
{
"ce_ib": 10.181398391723633,
"ce_orig": 1.487206220626831,
"epoch": 0.1269590222861251,
"kl_loss": 0.08668573200702667,
"loss_ib": 0.009424986317753792,
"step": 883
},
{
"ce_ib": 6.982693672180176,
"ce_orig": 0.819129228591919,
"epoch": 0.12710280373831775,
"kl_loss": 0.11231732368469238,
"loss_ib": 0.009107212536036968,
"step": 884
},
{
"epoch": 0.12724658519051044,
"grad_norm": 0.12548822164535522,
"learning_rate": 4.998132862570007e-05,
"loss": 0.9055,
"step": 885
},
{
"ce_ib": 10.360311508178711,
"ce_orig": 1.447431206703186,
"epoch": 0.12724658519051044,
"kl_loss": 0.10747027397155762,
"loss_ib": 0.010553669184446335,
"step": 885
},
{
"ce_ib": 7.586730003356934,
"ce_orig": 0.9365792870521545,
"epoch": 0.1273903666427031,
"kl_loss": 0.10849650204181671,
"loss_ib": 0.009218189865350723,
"step": 886
},
{
"ce_ib": 8.302633285522461,
"ce_orig": 0.9514833688735962,
"epoch": 0.12753414809489577,
"kl_loss": 0.07761082053184509,
"loss_ib": 0.008031858131289482,
"step": 887
},
{
"ce_ib": 6.314914703369141,
"ce_orig": 0.6305254101753235,
"epoch": 0.12767792954708843,
"kl_loss": 0.11676283180713654,
"loss_ib": 0.00899559911340475,
"step": 888
},
{
"ce_ib": 8.043998718261719,
"ce_orig": 0.9012079238891602,
"epoch": 0.12782171099928108,
"kl_loss": 0.07340681552886963,
"loss_ib": 0.007692340295761824,
"step": 889
},
{
"epoch": 0.12796549245147376,
"grad_norm": 0.10787968337535858,
"learning_rate": 4.9980571340766526e-05,
"loss": 0.9241,
"step": 890
},
{
"ce_ib": 7.90130090713501,
"ce_orig": 1.0600311756134033,
"epoch": 0.12796549245147376,
"kl_loss": 0.12333562225103378,
"loss_ib": 0.010117431171238422,
"step": 890
},
{
"ce_ib": 6.691804885864258,
"ce_orig": 0.9607767462730408,
"epoch": 0.12810927390366642,
"kl_loss": 0.07735402882099152,
"loss_ib": 0.00721360370516777,
"step": 891
},
{
"ce_ib": 7.60068416595459,
"ce_orig": 0.7663901448249817,
"epoch": 0.1282530553558591,
"kl_loss": 0.08263403922319412,
"loss_ib": 0.007932043634355068,
"step": 892
},
{
"ce_ib": 7.247678756713867,
"ce_orig": 1.0724464654922485,
"epoch": 0.12839683680805175,
"kl_loss": 0.11332201212644577,
"loss_ib": 0.009289939887821674,
"step": 893
},
{
"ce_ib": 5.542264938354492,
"ce_orig": 0.664827823638916,
"epoch": 0.12854061826024443,
"kl_loss": 0.08512848615646362,
"loss_ib": 0.007027556654065847,
"step": 894
},
{
"epoch": 0.1286843997124371,
"grad_norm": 0.12153299897909164,
"learning_rate": 4.9979799008345215e-05,
"loss": 0.9543,
"step": 895
},
{
"ce_ib": 5.504077911376953,
"ce_orig": 0.6800283193588257,
"epoch": 0.1286843997124371,
"kl_loss": 0.07652309536933899,
"loss_ib": 0.006578193511813879,
"step": 895
},
{
"ce_ib": 4.984541416168213,
"ce_orig": 0.6408447623252869,
"epoch": 0.12882818116462977,
"kl_loss": 0.0679374560713768,
"loss_ib": 0.005889143329113722,
"step": 896
},
{
"ce_ib": 7.2878217697143555,
"ce_orig": 0.8095806837081909,
"epoch": 0.12897196261682242,
"kl_loss": 0.08664879202842712,
"loss_ib": 0.007976350374519825,
"step": 897
},
{
"ce_ib": 4.948668003082275,
"ce_orig": 0.4147416055202484,
"epoch": 0.1291157440690151,
"kl_loss": 0.17131486535072327,
"loss_ib": 0.011040077544748783,
"step": 898
},
{
"ce_ib": 6.665126323699951,
"ce_orig": 0.6478716731071472,
"epoch": 0.12925952552120776,
"kl_loss": 0.07731156051158905,
"loss_ib": 0.007198141422122717,
"step": 899
},
{
"epoch": 0.12940330697340044,
"grad_norm": 0.12399930506944656,
"learning_rate": 4.997901162890139e-05,
"loss": 0.762,
"step": 900
},
{
"ce_ib": 7.272383689880371,
"ce_orig": 0.6885168552398682,
"epoch": 0.12940330697340044,
"kl_loss": 0.08325660228729248,
"loss_ib": 0.007799021899700165,
"step": 900
},
{
"ce_ib": 5.927865028381348,
"ce_orig": 0.7656009793281555,
"epoch": 0.1295470884255931,
"kl_loss": 0.07506805658340454,
"loss_ib": 0.006717335432767868,
"step": 901
},
{
"ce_ib": 5.351132392883301,
"ce_orig": 0.6215174794197083,
"epoch": 0.12969086987778577,
"kl_loss": 0.08371435105800629,
"loss_ib": 0.0068612839095294476,
"step": 902
},
{
"ce_ib": 7.883289337158203,
"ce_orig": 0.797773540019989,
"epoch": 0.12983465132997843,
"kl_loss": 0.10347917675971985,
"loss_ib": 0.009115603752434254,
"step": 903
},
{
"ce_ib": 5.427731513977051,
"ce_orig": 0.5187583565711975,
"epoch": 0.1299784327821711,
"kl_loss": 0.09261107444763184,
"loss_ib": 0.007344419602304697,
"step": 904
},
{
"epoch": 0.13012221423436376,
"grad_norm": 0.1253208965063095,
"learning_rate": 4.997820920290933e-05,
"loss": 0.8364,
"step": 905
},
{
"ce_ib": 4.651042938232422,
"ce_orig": 0.5111949443817139,
"epoch": 0.13012221423436376,
"kl_loss": 0.06503793597221375,
"loss_ib": 0.005577418487519026,
"step": 905
},
{
"ce_ib": 6.314610481262207,
"ce_orig": 0.7297623753547668,
"epoch": 0.13026599568655645,
"kl_loss": 0.058927081525325775,
"loss_ib": 0.006103659514337778,
"step": 906
},
{
"ce_ib": 6.989099502563477,
"ce_orig": 0.9666232466697693,
"epoch": 0.1304097771387491,
"kl_loss": 0.05747806280851364,
"loss_ib": 0.006368452217429876,
"step": 907
},
{
"ce_ib": 6.891256332397461,
"ce_orig": 0.6751134395599365,
"epoch": 0.13055355859094178,
"kl_loss": 0.09852255135774612,
"loss_ib": 0.008371755480766296,
"step": 908
},
{
"ce_ib": 6.981194496154785,
"ce_orig": 0.9139746427536011,
"epoch": 0.13069734004313444,
"kl_loss": 0.08173255622386932,
"loss_ib": 0.007577225100249052,
"step": 909
},
{
"epoch": 0.1308411214953271,
"grad_norm": 0.12957040965557098,
"learning_rate": 4.9977391730852386e-05,
"loss": 0.8999,
"step": 910
},
{
"ce_ib": 5.080508232116699,
"ce_orig": 0.7197664380073547,
"epoch": 0.1308411214953271,
"kl_loss": 0.06574570387601852,
"loss_ib": 0.005827539600431919,
"step": 910
},
{
"ce_ib": 5.3453497886657715,
"ce_orig": 0.5241774320602417,
"epoch": 0.13098490294751977,
"kl_loss": 0.08800401538610458,
"loss_ib": 0.007072875741869211,
"step": 911
},
{
"ce_ib": 6.245478630065918,
"ce_orig": 0.6996411085128784,
"epoch": 0.13112868439971243,
"kl_loss": 0.09634403884410858,
"loss_ib": 0.007939941249787807,
"step": 912
},
{
"ce_ib": 7.945882797241211,
"ce_orig": 1.143088698387146,
"epoch": 0.1312724658519051,
"kl_loss": 0.08361925929784775,
"loss_ib": 0.008153904229402542,
"step": 913
},
{
"ce_ib": 8.524740219116211,
"ce_orig": 1.103076696395874,
"epoch": 0.13141624730409776,
"kl_loss": 0.07524412125349045,
"loss_ib": 0.008024576120078564,
"step": 914
},
{
"epoch": 0.13156002875629044,
"grad_norm": 0.11736583709716797,
"learning_rate": 4.997655921322299e-05,
"loss": 0.7881,
"step": 915
},
{
"ce_ib": 7.337964057922363,
"ce_orig": 0.7371410131454468,
"epoch": 0.13156002875629044,
"kl_loss": 0.08646431565284729,
"loss_ib": 0.007992197759449482,
"step": 915
},
{
"ce_ib": 8.713349342346191,
"ce_orig": 0.9880151152610779,
"epoch": 0.1317038102084831,
"kl_loss": 0.07716777920722961,
"loss_ib": 0.00821506418287754,
"step": 916
},
{
"ce_ib": 6.380884647369385,
"ce_orig": 0.6128709316253662,
"epoch": 0.13184759166067578,
"kl_loss": 0.09613852202892303,
"loss_ib": 0.007997368462383747,
"step": 917
},
{
"ce_ib": 5.234755992889404,
"ce_orig": 0.6296612024307251,
"epoch": 0.13199137311286843,
"kl_loss": 0.10441483557224274,
"loss_ib": 0.007838119752705097,
"step": 918
},
{
"ce_ib": 5.749408721923828,
"ce_orig": 0.6389755606651306,
"epoch": 0.1321351545650611,
"kl_loss": 0.09272737056016922,
"loss_ib": 0.007511072792112827,
"step": 919
},
{
"epoch": 0.13227893601725377,
"grad_norm": 0.10284168273210526,
"learning_rate": 4.997571165052262e-05,
"loss": 0.8021,
"step": 920
},
{
"ce_ib": 7.687596797943115,
"ce_orig": 0.9395468235015869,
"epoch": 0.13227893601725377,
"kl_loss": 0.10860046744346619,
"loss_ib": 0.009273822419345379,
"step": 920
},
{
"ce_ib": 5.143918991088867,
"ce_orig": 0.6956912279129028,
"epoch": 0.13242271746944645,
"kl_loss": 0.05788666009902954,
"loss_ib": 0.00546629261225462,
"step": 921
},
{
"ce_ib": 4.9963812828063965,
"ce_orig": 0.641473114490509,
"epoch": 0.1325664989216391,
"kl_loss": 0.076176717877388,
"loss_ib": 0.0063070268370211124,
"step": 922
},
{
"ce_ib": 5.956387996673584,
"ce_orig": 0.6581974625587463,
"epoch": 0.13271028037383178,
"kl_loss": 0.08382155001163483,
"loss_ib": 0.007169271353632212,
"step": 923
},
{
"ce_ib": 6.518821716308594,
"ce_orig": 0.8877462148666382,
"epoch": 0.13285406182602444,
"kl_loss": 0.07050419598817825,
"loss_ib": 0.0067846211604774,
"step": 924
},
{
"epoch": 0.13299784327821712,
"grad_norm": 0.1254836916923523,
"learning_rate": 4.99748490432618e-05,
"loss": 0.7649,
"step": 925
},
{
"ce_ib": 9.03060531616211,
"ce_orig": 1.0601074695587158,
"epoch": 0.13299784327821712,
"kl_loss": 0.1262633204460144,
"loss_ib": 0.010828468017280102,
"step": 925
},
{
"ce_ib": 5.390571594238281,
"ce_orig": 0.6605896353721619,
"epoch": 0.13314162473040977,
"kl_loss": 0.08747322857379913,
"loss_ib": 0.007068946957588196,
"step": 926
},
{
"ce_ib": 9.145002365112305,
"ce_orig": 0.7847701907157898,
"epoch": 0.13328540618260246,
"kl_loss": 0.08090417087078094,
"loss_ib": 0.008617709390819073,
"step": 927
},
{
"ce_ib": 5.643058776855469,
"ce_orig": 0.7314335107803345,
"epoch": 0.1334291876347951,
"kl_loss": 0.05843006446957588,
"loss_ib": 0.0057430327869951725,
"step": 928
},
{
"ce_ib": 6.753859996795654,
"ce_orig": 1.0451164245605469,
"epoch": 0.1335729690869878,
"kl_loss": 0.07054602354764938,
"loss_ib": 0.006904230918735266,
"step": 929
},
{
"epoch": 0.13371675053918045,
"grad_norm": 0.1414111852645874,
"learning_rate": 4.9973971391960167e-05,
"loss": 0.8961,
"step": 930
},
{
"ce_ib": 8.576930046081543,
"ce_orig": 1.1311094760894775,
"epoch": 0.13371675053918045,
"kl_loss": 0.09142087399959564,
"loss_ib": 0.00885950867086649,
"step": 930
},
{
"ce_ib": 7.471253871917725,
"ce_orig": 1.0194929838180542,
"epoch": 0.1338605319913731,
"kl_loss": 0.11889418959617615,
"loss_ib": 0.009680337272584438,
"step": 931
},
{
"ce_ib": 6.724924564361572,
"ce_orig": 0.864520788192749,
"epoch": 0.13400431344356578,
"kl_loss": 0.09439219534397125,
"loss_ib": 0.008082072250545025,
"step": 932
},
{
"ce_ib": 7.768470287322998,
"ce_orig": 1.0322949886322021,
"epoch": 0.13414809489575844,
"kl_loss": 0.16739840805530548,
"loss_ib": 0.012254155240952969,
"step": 933
},
{
"ce_ib": 8.237820625305176,
"ce_orig": 1.0064351558685303,
"epoch": 0.13429187634795112,
"kl_loss": 0.10145040601491928,
"loss_ib": 0.009191430173814297,
"step": 934
},
{
"epoch": 0.13443565780014377,
"grad_norm": 0.1539893001317978,
"learning_rate": 4.997307869714637e-05,
"loss": 0.8736,
"step": 935
},
{
"ce_ib": 6.874648571014404,
"ce_orig": 1.1211917400360107,
"epoch": 0.13443565780014377,
"kl_loss": 0.10167817026376724,
"loss_ib": 0.00852123275399208,
"step": 935
},
{
"ce_ib": 5.751309871673584,
"ce_orig": 0.8845812082290649,
"epoch": 0.13457943925233645,
"kl_loss": 0.09366403520107269,
"loss_ib": 0.007558857090771198,
"step": 936
},
{
"ce_ib": 8.573022842407227,
"ce_orig": 1.0504612922668457,
"epoch": 0.1347232207045291,
"kl_loss": 0.07060796767473221,
"loss_ib": 0.007816909812390804,
"step": 937
},
{
"ce_ib": 8.377857208251953,
"ce_orig": 0.9093595147132874,
"epoch": 0.1348670021567218,
"kl_loss": 0.11797395348548889,
"loss_ib": 0.010087626054883003,
"step": 938
},
{
"ce_ib": 7.620189189910889,
"ce_orig": 0.9195590019226074,
"epoch": 0.13501078360891444,
"kl_loss": 0.11536361277103424,
"loss_ib": 0.009578275494277477,
"step": 939
},
{
"epoch": 0.13515456506110712,
"grad_norm": 0.12349986284971237,
"learning_rate": 4.9972170959358156e-05,
"loss": 0.8263,
"step": 940
},
{
"ce_ib": 4.433289527893066,
"ce_orig": 0.5269767642021179,
"epoch": 0.13515456506110712,
"kl_loss": 0.06309656798839569,
"loss_ib": 0.005371473263949156,
"step": 940
},
{
"ce_ib": 6.865793228149414,
"ce_orig": 0.8600917458534241,
"epoch": 0.13529834651329978,
"kl_loss": 0.268341064453125,
"loss_ib": 0.016849949955940247,
"step": 941
},
{
"ce_ib": 5.769837856292725,
"ce_orig": 0.9609159827232361,
"epoch": 0.13544212796549246,
"kl_loss": 0.073099285364151,
"loss_ib": 0.006539882626384497,
"step": 942
},
{
"ce_ib": 6.068971157073975,
"ce_orig": 0.8975405097007751,
"epoch": 0.1355859094176851,
"kl_loss": 0.07797665894031525,
"loss_ib": 0.006933317985385656,
"step": 943
},
{
"ce_ib": 4.143486976623535,
"ce_orig": 0.5329647660255432,
"epoch": 0.1357296908698778,
"kl_loss": 0.1322351098060608,
"loss_ib": 0.008683498948812485,
"step": 944
},
{
"epoch": 0.13587347232207045,
"grad_norm": 0.12345600128173828,
"learning_rate": 4.9971248179142296e-05,
"loss": 0.8698,
"step": 945
},
{
"ce_ib": 8.5775146484375,
"ce_orig": 1.4092092514038086,
"epoch": 0.13587347232207045,
"kl_loss": 0.07750563323497772,
"loss_ib": 0.008164039812982082,
"step": 945
},
{
"ce_ib": 6.79196310043335,
"ce_orig": 1.1046801805496216,
"epoch": 0.13601725377426313,
"kl_loss": 0.08599655330181122,
"loss_ib": 0.007695809006690979,
"step": 946
},
{
"ce_ib": 6.365146636962891,
"ce_orig": 0.7971789836883545,
"epoch": 0.13616103522645578,
"kl_loss": 0.08945336192846298,
"loss_ib": 0.007655241526663303,
"step": 947
},
{
"ce_ib": 6.125009059906006,
"ce_orig": 0.9030478000640869,
"epoch": 0.13630481667864847,
"kl_loss": 0.08069920539855957,
"loss_ib": 0.007097464986145496,
"step": 948
},
{
"ce_ib": 7.449437618255615,
"ce_orig": 0.9372292757034302,
"epoch": 0.13644859813084112,
"kl_loss": 0.07520076632499695,
"loss_ib": 0.007484757341444492,
"step": 949
},
{
"epoch": 0.1365923795830338,
"grad_norm": 0.1214538961648941,
"learning_rate": 4.997031035705466e-05,
"loss": 0.8814,
"step": 950
},
{
"ce_ib": 6.2350687980651855,
"ce_orig": 0.8483865857124329,
"epoch": 0.1365923795830338,
"kl_loss": 0.08866537362337112,
"loss_ib": 0.007550803013145924,
"step": 950
},
{
"ce_ib": 6.066655158996582,
"ce_orig": 0.6078613996505737,
"epoch": 0.13673616103522646,
"kl_loss": 0.06268885731697083,
"loss_ib": 0.006167770363390446,
"step": 951
},
{
"ce_ib": 7.969608783721924,
"ce_orig": 1.1507118940353394,
"epoch": 0.1368799424874191,
"kl_loss": 0.07303150743246078,
"loss_ib": 0.007636380381882191,
"step": 952
},
{
"ce_ib": 6.006031036376953,
"ce_orig": 0.7712607383728027,
"epoch": 0.1370237239396118,
"kl_loss": 0.1005963534116745,
"loss_ib": 0.008032833226025105,
"step": 953
},
{
"ce_ib": 6.047878265380859,
"ce_orig": 0.8621774315834045,
"epoch": 0.13716750539180445,
"kl_loss": 0.06825101375579834,
"loss_ib": 0.006436489522457123,
"step": 954
},
{
"epoch": 0.13731128684399713,
"grad_norm": 0.1393401324748993,
"learning_rate": 4.996935749366015e-05,
"loss": 0.9195,
"step": 955
},
{
"ce_ib": 5.52618408203125,
"ce_orig": 0.7416598200798035,
"epoch": 0.13731128684399713,
"kl_loss": 0.07697509229183197,
"loss_ib": 0.006611846387386322,
"step": 955
},
{
"ce_ib": 5.450565338134766,
"ce_orig": 0.7644197940826416,
"epoch": 0.13745506829618978,
"kl_loss": 0.06023329496383667,
"loss_ib": 0.005736947525292635,
"step": 956
},
{
"ce_ib": 7.305458068847656,
"ce_orig": 1.0978727340698242,
"epoch": 0.13759884974838246,
"kl_loss": 0.11561448872089386,
"loss_ib": 0.009433453902602196,
"step": 957
},
{
"ce_ib": 6.976890563964844,
"ce_orig": 0.7256874442100525,
"epoch": 0.13774263120057512,
"kl_loss": 0.09455625712871552,
"loss_ib": 0.008216258138418198,
"step": 958
},
{
"ce_ib": 6.9282097816467285,
"ce_orig": 0.7956810593605042,
"epoch": 0.1378864126527678,
"kl_loss": 0.087988942861557,
"loss_ib": 0.007863552309572697,
"step": 959
},
{
"epoch": 0.13803019410496045,
"grad_norm": 0.12086982280015945,
"learning_rate": 4.996838958953275e-05,
"loss": 0.8213,
"step": 960
},
{
"ce_ib": 6.342545032501221,
"ce_orig": 0.7129842638969421,
"epoch": 0.13803019410496045,
"kl_loss": 0.13139081001281738,
"loss_ib": 0.009740813635289669,
"step": 960
},
{
"ce_ib": 6.692712306976318,
"ce_orig": 0.8974847197532654,
"epoch": 0.13817397555715313,
"kl_loss": 0.08609960973262787,
"loss_ib": 0.007651336491107941,
"step": 961
},
{
"ce_ib": 8.433094024658203,
"ce_orig": 1.1379026174545288,
"epoch": 0.1383177570093458,
"kl_loss": 0.10287706553936005,
"loss_ib": 0.009360400028526783,
"step": 962
},
{
"ce_ib": 8.236790657043457,
"ce_orig": 1.2443400621414185,
"epoch": 0.13846153846153847,
"kl_loss": 0.08817592263221741,
"loss_ib": 0.00852719135582447,
"step": 963
},
{
"ce_ib": 5.165767669677734,
"ce_orig": 0.5761904716491699,
"epoch": 0.13860531991373112,
"kl_loss": 0.0794210359454155,
"loss_ib": 0.006553936284035444,
"step": 964
},
{
"epoch": 0.1387491013659238,
"grad_norm": 0.1046949103474617,
"learning_rate": 4.996740664525549e-05,
"loss": 0.9412,
"step": 965
},
{
"ce_ib": 7.795332908630371,
"ce_orig": 1.0906161069869995,
"epoch": 0.1387491013659238,
"kl_loss": 0.07911582291126251,
"loss_ib": 0.00785345770418644,
"step": 965
},
{
"ce_ib": 5.823874473571777,
"ce_orig": 0.6993345022201538,
"epoch": 0.13889288281811646,
"kl_loss": 0.08959372341632843,
"loss_ib": 0.0073916236869990826,
"step": 966
},
{
"ce_ib": 7.680074214935303,
"ce_orig": 1.1049128770828247,
"epoch": 0.13903666427030914,
"kl_loss": 0.08348916471004486,
"loss_ib": 0.008014495484530926,
"step": 967
},
{
"ce_ib": 6.938556671142578,
"ce_orig": 0.6378891468048096,
"epoch": 0.1391804457225018,
"kl_loss": 0.11031496524810791,
"loss_ib": 0.008985026739537716,
"step": 968
},
{
"ce_ib": 8.571457862854004,
"ce_orig": 1.1795148849487305,
"epoch": 0.13932422717469448,
"kl_loss": 0.08533471077680588,
"loss_ib": 0.00855246465653181,
"step": 969
},
{
"epoch": 0.13946800862688713,
"grad_norm": 0.1300869584083557,
"learning_rate": 4.996640866142046e-05,
"loss": 0.9504,
"step": 970
},
{
"ce_ib": 8.15340805053711,
"ce_orig": 1.0846816301345825,
"epoch": 0.13946800862688713,
"kl_loss": 0.08402827382087708,
"loss_ib": 0.008278118446469307,
"step": 970
},
{
"ce_ib": 4.409031391143799,
"ce_orig": 0.46880093216896057,
"epoch": 0.1396117900790798,
"kl_loss": 0.11544293165206909,
"loss_ib": 0.007976662367582321,
"step": 971
},
{
"ce_ib": 8.286463737487793,
"ce_orig": 0.7137627601623535,
"epoch": 0.13975557153127247,
"kl_loss": 0.07826922088861465,
"loss_ib": 0.008056692779064178,
"step": 972
},
{
"ce_ib": 7.249392986297607,
"ce_orig": 1.0557063817977905,
"epoch": 0.13989935298346512,
"kl_loss": 0.08211711049079895,
"loss_ib": 0.007730551995337009,
"step": 973
},
{
"ce_ib": 6.922652721405029,
"ce_orig": 0.7434417009353638,
"epoch": 0.1400431344356578,
"kl_loss": 0.09765265882015228,
"loss_ib": 0.008343959227204323,
"step": 974
},
{
"epoch": 0.14018691588785046,
"grad_norm": 0.10048684477806091,
"learning_rate": 4.996539563862881e-05,
"loss": 0.9914,
"step": 975
},
{
"ce_ib": 5.8394904136657715,
"ce_orig": 0.9871096611022949,
"epoch": 0.14018691588785046,
"kl_loss": 0.07624734193086624,
"loss_ib": 0.0067321122623980045,
"step": 975
},
{
"ce_ib": 4.4480671882629395,
"ce_orig": 0.5778042078018188,
"epoch": 0.14033069734004314,
"kl_loss": 0.12898705899715424,
"loss_ib": 0.00867338664829731,
"step": 976
},
{
"ce_ib": 4.4938836097717285,
"ce_orig": 0.5107190012931824,
"epoch": 0.1404744787922358,
"kl_loss": 0.060333944857120514,
"loss_ib": 0.005263639148324728,
"step": 977
},
{
"ce_ib": 5.551029682159424,
"ce_orig": 0.7901322841644287,
"epoch": 0.14061826024442847,
"kl_loss": 0.1715565174818039,
"loss_ib": 0.011353340931236744,
"step": 978
},
{
"ce_ib": 6.682663917541504,
"ce_orig": 0.8593956828117371,
"epoch": 0.14076204169662113,
"kl_loss": 0.08176585286855698,
"loss_ib": 0.007429624442011118,
"step": 979
},
{
"epoch": 0.1409058231488138,
"grad_norm": 0.1225329264998436,
"learning_rate": 4.996436757749077e-05,
"loss": 0.8339,
"step": 980
},
{
"ce_ib": 5.105162620544434,
"ce_orig": 0.6780040860176086,
"epoch": 0.1409058231488138,
"kl_loss": 0.06629600375890732,
"loss_ib": 0.005867381580173969,
"step": 980
},
{
"ce_ib": 6.971573352813721,
"ce_orig": 1.1731876134872437,
"epoch": 0.14104960460100646,
"kl_loss": 0.09537965059280396,
"loss_ib": 0.008254769258201122,
"step": 981
},
{
"ce_ib": 7.419947624206543,
"ce_orig": 1.0379914045333862,
"epoch": 0.14119338605319914,
"kl_loss": 0.09539618343114853,
"loss_ib": 0.008479783311486244,
"step": 982
},
{
"ce_ib": 8.027047157287598,
"ce_orig": 1.0013635158538818,
"epoch": 0.1413371675053918,
"kl_loss": 0.11693254113197327,
"loss_ib": 0.009860150516033173,
"step": 983
},
{
"ce_ib": 5.866478443145752,
"ce_orig": 0.8373990058898926,
"epoch": 0.14148094895758448,
"kl_loss": 0.056306250393390656,
"loss_ib": 0.005748551804572344,
"step": 984
},
{
"epoch": 0.14162473040977713,
"grad_norm": 0.12583786249160767,
"learning_rate": 4.99633244786256e-05,
"loss": 0.9591,
"step": 985
},
{
"ce_ib": 9.714642524719238,
"ce_orig": 1.50913405418396,
"epoch": 0.14162473040977713,
"kl_loss": 0.11189399659633636,
"loss_ib": 0.010452020913362503,
"step": 985
},
{
"ce_ib": 5.668364524841309,
"ce_orig": 0.510465681552887,
"epoch": 0.14176851186196981,
"kl_loss": 0.10138311982154846,
"loss_ib": 0.00790333840996027,
"step": 986
},
{
"ce_ib": 6.4766764640808105,
"ce_orig": 0.9878252744674683,
"epoch": 0.14191229331416247,
"kl_loss": 0.05886990204453468,
"loss_ib": 0.0061818333342671394,
"step": 987
},
{
"ce_ib": 4.52413272857666,
"ce_orig": 0.6275699734687805,
"epoch": 0.14205607476635515,
"kl_loss": 0.07127900421619415,
"loss_ib": 0.0058260164223611355,
"step": 988
},
{
"ce_ib": 7.104588031768799,
"ce_orig": 1.0290782451629639,
"epoch": 0.1421998562185478,
"kl_loss": 0.09330768138170242,
"loss_ib": 0.008217677474021912,
"step": 989
},
{
"epoch": 0.14234363767074049,
"grad_norm": 0.15013280510902405,
"learning_rate": 4.9962266342661624e-05,
"loss": 0.8154,
"step": 990
},
{
"ce_ib": 6.073655128479004,
"ce_orig": 0.7287778258323669,
"epoch": 0.14234363767074049,
"kl_loss": 0.08072468638420105,
"loss_ib": 0.007073062006384134,
"step": 990
},
{
"ce_ib": 7.8407883644104,
"ce_orig": 0.9141483902931213,
"epoch": 0.14248741912293314,
"kl_loss": 0.07278452068567276,
"loss_ib": 0.007559619843959808,
"step": 991
},
{
"ce_ib": 7.545557975769043,
"ce_orig": 1.0704678297042847,
"epoch": 0.14263120057512582,
"kl_loss": 0.10130893439054489,
"loss_ib": 0.008838226087391376,
"step": 992
},
{
"ce_ib": 9.583335876464844,
"ce_orig": 1.1344443559646606,
"epoch": 0.14277498202731848,
"kl_loss": 0.0773579329252243,
"loss_ib": 0.008659563958644867,
"step": 993
},
{
"ce_ib": 6.852370738983154,
"ce_orig": 1.0198527574539185,
"epoch": 0.14291876347951113,
"kl_loss": 0.09604521840810776,
"loss_ib": 0.00822844635695219,
"step": 994
},
{
"epoch": 0.1430625449317038,
"grad_norm": 0.140967458486557,
"learning_rate": 4.9961193170236234e-05,
"loss": 0.9539,
"step": 995
},
{
"ce_ib": 7.311800956726074,
"ce_orig": 0.8755852580070496,
"epoch": 0.1430625449317038,
"kl_loss": 0.1115192174911499,
"loss_ib": 0.009231860749423504,
"step": 995
},
{
"ce_ib": 7.3969621658325195,
"ce_orig": 0.7122746706008911,
"epoch": 0.14320632638389647,
"kl_loss": 0.16122053563594818,
"loss_ib": 0.011759507469832897,
"step": 996
},
{
"ce_ib": 6.429070949554443,
"ce_orig": 0.7772039175033569,
"epoch": 0.14335010783608915,
"kl_loss": 0.06662163138389587,
"loss_ib": 0.006545616779476404,
"step": 997
},
{
"ce_ib": 6.746508598327637,
"ce_orig": 1.0281816720962524,
"epoch": 0.1434938892882818,
"kl_loss": 0.11056394129991531,
"loss_ib": 0.008901451714336872,
"step": 998
},
{
"ce_ib": 5.576045989990234,
"ce_orig": 0.8467837572097778,
"epoch": 0.14363767074047448,
"kl_loss": 0.08954350650310516,
"loss_ib": 0.007265198510140181,
"step": 999
},
{
"epoch": 0.14378145219266714,
"grad_norm": 0.1209883913397789,
"learning_rate": 4.996010496199587e-05,
"loss": 0.8696,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 20865,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}