{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 11.444444444444445, "eval_steps": 500, "global_step": 4200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.136332651670075, "grad_norm": 11.269988059997559, "learning_rate": 0.00010652173913043477, "loss": 39.8198193359375, "step": 50 }, { "epoch": 0.27266530334015, "grad_norm": 14.962249755859375, "learning_rate": 0.0002152173913043478, "loss": 26.48206298828125, "step": 100 }, { "epoch": 0.40899795501022496, "grad_norm": 16.134239196777344, "learning_rate": 0.0003239130434782608, "loss": 19.36090087890625, "step": 150 }, { "epoch": 0.5453306066803, "grad_norm": 12.964629173278809, "learning_rate": 0.00043260869565217385, "loss": 14.5181689453125, "step": 200 }, { "epoch": 0.6816632583503749, "grad_norm": 10.061964988708496, "learning_rate": 0.0005413043478260869, "loss": 12.766522216796876, "step": 250 }, { "epoch": 0.8179959100204499, "grad_norm": 6.430169105529785, "learning_rate": 0.0005999713580566041, "loss": 11.881512451171876, "step": 300 }, { "epoch": 0.9543285616905249, "grad_norm": 5.836061954498291, "learning_rate": 0.0005997115106245061, "loss": 11.075633544921875, "step": 350 }, { "epoch": 1.0899795501022496, "grad_norm": 4.700469017028809, "learning_rate": 0.000599181221756225, "loss": 10.015964965820313, "step": 400 }, { "epoch": 1.2263122017723245, "grad_norm": 4.282314777374268, "learning_rate": 0.0005983809699521793, "loss": 9.592711181640626, "step": 450 }, { "epoch": 1.3626448534423994, "grad_norm": 3.662461280822754, "learning_rate": 0.0005973114773109183, "loss": 9.211348266601563, "step": 500 }, { "epoch": 1.4989775051124745, "grad_norm": 3.783862590789795, "learning_rate": 0.0005959737088775463, "loss": 8.782565307617187, "step": 550 }, { "epoch": 1.6353101567825494, "grad_norm": 3.429069757461548, "learning_rate": 0.0005943688717729229, "loss": 8.386593627929688, "step": 600 }, { "epoch": 1.7716428084526243, "grad_norm": 2.529482841491699, "learning_rate": 0.0005924984141044315, "loss": 8.06916259765625, "step": 650 }, { "epoch": 1.9079754601226995, "grad_norm": 2.4761486053466797, "learning_rate": 0.0005903640236592949, "loss": 7.7736474609375, "step": 700 }, { "epoch": 2.043626448534424, "grad_norm": 2.3675243854522705, "learning_rate": 0.0005879676263816192, "loss": 7.4699859619140625, "step": 750 }, { "epoch": 2.179959100204499, "grad_norm": 2.305575132369995, "learning_rate": 0.0005853113846345384, "loss": 7.305302734375, "step": 800 }, { "epoch": 2.316291751874574, "grad_norm": 2.3954548835754395, "learning_rate": 0.0005823976952490298, "loss": 7.102890625, "step": 850 }, { "epoch": 2.452624403544649, "grad_norm": 2.2483468055725098, "learning_rate": 0.0005792291873611596, "loss": 6.9435498046875, "step": 900 }, { "epoch": 2.588957055214724, "grad_norm": 2.0154542922973633, "learning_rate": 0.00057580872003971, "loss": 6.761861572265625, "step": 950 }, { "epoch": 2.7252897068847988, "grad_norm": 2.155539035797119, "learning_rate": 0.00057213937970633, "loss": 6.58298583984375, "step": 1000 }, { "epoch": 2.861622358554874, "grad_norm": 1.7670893669128418, "learning_rate": 0.0005682244773505363, "loss": 6.419741821289063, "step": 1050 }, { "epoch": 2.997955010224949, "grad_norm": 2.3611533641815186, "learning_rate": 0.0005640675455420765, "loss": 6.288121948242187, "step": 1100 }, { "epoch": 3.1336059986366736, "grad_norm": 1.8783221244812012, "learning_rate": 0.0005596723352433551, "loss": 6.090737915039062, "step": 1150 }, { "epoch": 3.2699386503067487, "grad_norm": 2.2795541286468506, "learning_rate": 0.0005550428124247912, "loss": 5.979439086914063, "step": 1200 }, { "epoch": 3.4062713019768234, "grad_norm": 1.3074142932891846, "learning_rate": 0.0005501831544861696, "loss": 5.897046508789063, "step": 1250 }, { "epoch": 3.5426039536468985, "grad_norm": 1.259432077407837, "learning_rate": 0.0005450977464872081, "loss": 5.734913940429688, "step": 1300 }, { "epoch": 3.6789366053169736, "grad_norm": 1.4511066675186157, "learning_rate": 0.0005397911771907473, "loss": 5.604786987304688, "step": 1350 }, { "epoch": 3.8152692569870483, "grad_norm": 1.428958535194397, "learning_rate": 0.0005342682349221297, "loss": 5.445667114257812, "step": 1400 }, { "epoch": 3.9516019086571235, "grad_norm": 1.2921603918075562, "learning_rate": 0.000528533903248506, "loss": 5.391282958984375, "step": 1450 }, { "epoch": 4.087252897068848, "grad_norm": 1.3616167306900024, "learning_rate": 0.0005225933564819676, "loss": 5.183615112304688, "step": 1500 }, { "epoch": 4.223585548738923, "grad_norm": 1.2340154647827148, "learning_rate": 0.0005164519550105623, "loss": 5.060681457519531, "step": 1550 }, { "epoch": 4.359918200408998, "grad_norm": 1.5526384115219116, "learning_rate": 0.0005101152404614052, "loss": 4.902400817871094, "step": 1600 }, { "epoch": 4.496250852079073, "grad_norm": 1.4958291053771973, "learning_rate": 0.0005035889307002529, "loss": 4.787099304199219, "step": 1650 }, { "epoch": 4.632583503749148, "grad_norm": 1.4236118793487549, "learning_rate": 0.0004968789146720478, "loss": 4.660638427734375, "step": 1700 }, { "epoch": 4.768916155419223, "grad_norm": 1.3592256307601929, "learning_rate": 0.0004899912470870939, "loss": 4.454691162109375, "step": 1750 }, { "epoch": 4.905248807089298, "grad_norm": 1.502578616142273, "learning_rate": 0.00048293214295765303, "loss": 4.297479553222656, "step": 1800 }, { "epoch": 5.040899795501023, "grad_norm": 1.7106261253356934, "learning_rate": 0.0004757079719898968, "loss": 4.13409423828125, "step": 1850 }, { "epoch": 5.1772324471710975, "grad_norm": 1.2559808492660522, "learning_rate": 0.00046832525283627114, "loss": 3.96047607421875, "step": 1900 }, { "epoch": 5.313565098841172, "grad_norm": 1.2694923877716064, "learning_rate": 0.0004607906472134603, "loss": 3.8196981811523436, "step": 1950 }, { "epoch": 5.449897750511248, "grad_norm": 1.6992137432098389, "learning_rate": 0.0004531109538912596, "loss": 3.6628662109375, "step": 2000 }, { "epoch": 5.5862304021813225, "grad_norm": 1.453190803527832, "learning_rate": 0.00044529310255777855, "loss": 3.52033935546875, "step": 2050 }, { "epoch": 5.722563053851397, "grad_norm": 1.504873514175415, "learning_rate": 0.0004373441475665124, "loss": 3.3988775634765624, "step": 2100 }, { "epoch": 5.858895705521473, "grad_norm": 1.4465556144714355, "learning_rate": 0.00042927126157092204, "loss": 3.2702841186523437, "step": 2150 }, { "epoch": 5.9952283571915475, "grad_norm": 1.4014344215393066, "learning_rate": 0.0004210817290522684, "loss": 3.1291094970703126, "step": 2200 }, { "epoch": 6.130879345603272, "grad_norm": 1.645821213722229, "learning_rate": 0.00041278293974653904, "loss": 2.936179504394531, "step": 2250 }, { "epoch": 6.267211997273347, "grad_norm": 1.6427346467971802, "learning_rate": 0.00040438238197640066, "loss": 2.857735900878906, "step": 2300 }, { "epoch": 6.403544648943422, "grad_norm": 1.66471529006958, "learning_rate": 0.00039588763589419156, "loss": 2.748570556640625, "step": 2350 }, { "epoch": 6.539877300613497, "grad_norm": 1.5286723375320435, "learning_rate": 0.0003873063666420535, "loss": 2.6635064697265625, "step": 2400 }, { "epoch": 6.676209952283572, "grad_norm": 1.5245234966278076, "learning_rate": 0.00037864631743537395, "loss": 2.556291046142578, "step": 2450 }, { "epoch": 6.812542603953647, "grad_norm": 1.4000400304794312, "learning_rate": 0.000369915302575779, "loss": 2.4817964172363283, "step": 2500 }, { "epoch": 6.948875255623722, "grad_norm": 1.3968268632888794, "learning_rate": 0.00036112120039998323, "loss": 2.362508087158203, "step": 2550 }, { "epoch": 7.084526244035446, "grad_norm": 1.6692546606063843, "learning_rate": 0.0003522719461708582, "loss": 2.273824005126953, "step": 2600 }, { "epoch": 7.220858895705521, "grad_norm": 1.4489309787750244, "learning_rate": 0.00034337552491713324, "loss": 2.1658897399902344, "step": 2650 }, { "epoch": 7.357191547375597, "grad_norm": 1.5687353610992432, "learning_rate": 0.00033443996422819145, "loss": 2.108182220458984, "step": 2700 }, { "epoch": 7.493524199045671, "grad_norm": 1.6350905895233154, "learning_rate": 0.00032547332701046195, "loss": 1.99987060546875, "step": 2750 }, { "epoch": 7.629856850715746, "grad_norm": 1.5019129514694214, "learning_rate": 0.0003164837042119428, "loss": 1.9454510498046875, "step": 2800 }, { "epoch": 7.766189502385822, "grad_norm": 1.4423465728759766, "learning_rate": 0.00030747920752142186, "loss": 1.9158531188964845, "step": 2850 }, { "epoch": 7.902522154055896, "grad_norm": 1.5868362188339233, "learning_rate": 0.0002984679620489827, "loss": 1.8568917846679687, "step": 2900 }, { "epoch": 8.03817314246762, "grad_norm": 1.7355551719665527, "learning_rate": 0.0002894580989943989, "loss": 1.7664053344726562, "step": 2950 }, { "epoch": 8.174505794137696, "grad_norm": 1.4344327449798584, "learning_rate": 0.0002804577483100344, "loss": 1.6748054504394532, "step": 3000 }, { "epoch": 8.310838445807772, "grad_norm": 1.6083476543426514, "learning_rate": 0.00027147503136486895, "loss": 1.6389869689941405, "step": 3050 }, { "epoch": 8.447171097477845, "grad_norm": 1.412381649017334, "learning_rate": 0.0002625180536162685, "loss": 1.6107588195800782, "step": 3100 }, { "epoch": 8.583503749147921, "grad_norm": 1.4404499530792236, "learning_rate": 0.00025359489729611366, "loss": 1.558354034423828, "step": 3150 }, { "epoch": 8.719836400817996, "grad_norm": 1.394539713859558, "learning_rate": 0.0002447136141178857, "loss": 1.5231396484375, "step": 3200 }, { "epoch": 8.85616905248807, "grad_norm": 1.4844084978103638, "learning_rate": 0.00023588221801128917, "loss": 1.4771731567382813, "step": 3250 }, { "epoch": 8.992501704158146, "grad_norm": 1.3957374095916748, "learning_rate": 0.0002271086778909701, "loss": 1.4401710510253907, "step": 3300 }, { "epoch": 9.12815269256987, "grad_norm": 1.4386154413223267, "learning_rate": 0.00021840091046585182, "loss": 1.3497396850585937, "step": 3350 }, { "epoch": 9.264485344239946, "grad_norm": 1.4959100484848022, "learning_rate": 0.000209766773095578, "loss": 1.3368931579589844, "step": 3400 }, { "epoch": 9.400817995910021, "grad_norm": 1.3249437808990479, "learning_rate": 0.00020121405670051008, "loss": 1.297091064453125, "step": 3450 }, { "epoch": 9.537150647580095, "grad_norm": 1.3749561309814453, "learning_rate": 0.00019275047873167374, "loss": 1.260106658935547, "step": 3500 }, { "epoch": 9.67348329925017, "grad_norm": 1.4010766744613647, "learning_rate": 0.0001843836762070014, "loss": 1.239128646850586, "step": 3550 }, { "epoch": 9.809815950920246, "grad_norm": 1.5308102369308472, "learning_rate": 0.00017612119882015126, "loss": 1.1977056121826173, "step": 3600 }, { "epoch": 9.94614860259032, "grad_norm": 1.3873751163482666, "learning_rate": 0.00016797050212812275, "loss": 1.1842040252685546, "step": 3650 }, { "epoch": 10.081799591002046, "grad_norm": 1.3666012287139893, "learning_rate": 0.00015993894082381616, "loss": 1.1095658111572266, "step": 3700 }, { "epoch": 10.21813224267212, "grad_norm": 1.3528972864151, "learning_rate": 0.00015203376209960474, "loss": 1.103120346069336, "step": 3750 }, { "epoch": 10.354464894342195, "grad_norm": 1.3081281185150146, "learning_rate": 0.00014426209910790887, "loss": 1.0691104125976563, "step": 3800 }, { "epoch": 10.49079754601227, "grad_norm": 1.3515572547912598, "learning_rate": 0.00013663096452467343, "loss": 1.0644143676757813, "step": 3850 }, { "epoch": 10.627130197682344, "grad_norm": 1.2935131788253784, "learning_rate": 0.00012914724422155598, "loss": 1.0334495544433593, "step": 3900 }, { "epoch": 10.76346284935242, "grad_norm": 1.3209459781646729, "learning_rate": 0.00012181769105253435, "loss": 1.0103805541992188, "step": 3950 }, { "epoch": 10.899795501022496, "grad_norm": 1.324385643005371, "learning_rate": 0.00011464891876054252, "loss": 0.990460433959961, "step": 4000 }, { "epoch": 11.03544648943422, "grad_norm": 1.374879002571106, "learning_rate": 0.00010764739600963116, "loss": 0.9643755340576172, "step": 4050 }, { "epoch": 11.171779141104295, "grad_norm": 1.275993824005127, "learning_rate": 0.00010081944054803842, "loss": 0.936119155883789, "step": 4100 }, { "epoch": 11.308111792774369, "grad_norm": 1.2590258121490479, "learning_rate": 9.417121350743844e-05, "loss": 0.9281440734863281, "step": 4150 }, { "epoch": 11.444444444444445, "grad_norm": 1.2088381052017212, "learning_rate": 8.770871384351085e-05, "loss": 0.9070972442626953, "step": 4200 } ], "logging_steps": 50, "max_steps": 5505, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.989600444122399e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }