| { |
| "best_global_step": 4000, |
| "best_metric": 6.2553181648254395, |
| "best_model_checkpoint": "gpt-small-c4/checkpoint-4000", |
| "epoch": 0.20065211938801103, |
| "eval_steps": 100, |
| "global_step": 4000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005016302984700276, |
| "grad_norm": 1.367209553718567, |
| "learning_rate": 4.975169300225734e-05, |
| "loss": 8.9927, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.005016302984700276, |
| "eval_loss": 8.038568496704102, |
| "eval_runtime": 106.068, |
| "eval_samples_per_second": 167.619, |
| "eval_steps_per_second": 20.958, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.010032605969400551, |
| "grad_norm": 1.294573426246643, |
| "learning_rate": 4.950087785302233e-05, |
| "loss": 7.6182, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.010032605969400551, |
| "eval_loss": 7.498734474182129, |
| "eval_runtime": 102.098, |
| "eval_samples_per_second": 174.137, |
| "eval_steps_per_second": 21.773, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.015048908954100828, |
| "grad_norm": 1.3138136863708496, |
| "learning_rate": 4.925006270378731e-05, |
| "loss": 7.4073, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.015048908954100828, |
| "eval_loss": 7.337331295013428, |
| "eval_runtime": 93.9798, |
| "eval_samples_per_second": 189.179, |
| "eval_steps_per_second": 23.654, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.020065211938801102, |
| "grad_norm": 1.247786283493042, |
| "learning_rate": 4.89992475545523e-05, |
| "loss": 7.2394, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.020065211938801102, |
| "eval_loss": 7.221430778503418, |
| "eval_runtime": 94.4305, |
| "eval_samples_per_second": 188.276, |
| "eval_steps_per_second": 23.541, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.02508151492350138, |
| "grad_norm": 1.5004276037216187, |
| "learning_rate": 4.874843240531728e-05, |
| "loss": 7.1744, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.02508151492350138, |
| "eval_loss": 7.13123893737793, |
| "eval_runtime": 93.9964, |
| "eval_samples_per_second": 189.146, |
| "eval_steps_per_second": 23.65, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.030097817908201655, |
| "grad_norm": 1.6555237770080566, |
| "learning_rate": 4.8497617256082266e-05, |
| "loss": 7.0923, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.030097817908201655, |
| "eval_loss": 7.061245918273926, |
| "eval_runtime": 94.3719, |
| "eval_samples_per_second": 188.393, |
| "eval_steps_per_second": 23.556, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.03511412089290193, |
| "grad_norm": 1.7525192499160767, |
| "learning_rate": 4.8246802106847256e-05, |
| "loss": 7.0225, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.03511412089290193, |
| "eval_loss": 6.9907097816467285, |
| "eval_runtime": 94.5022, |
| "eval_samples_per_second": 188.133, |
| "eval_steps_per_second": 23.523, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.040130423877602205, |
| "grad_norm": 1.6555861234664917, |
| "learning_rate": 4.7995986957612246e-05, |
| "loss": 6.9318, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.040130423877602205, |
| "eval_loss": 6.9372453689575195, |
| "eval_runtime": 94.1339, |
| "eval_samples_per_second": 188.869, |
| "eval_steps_per_second": 23.615, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.045146726862302484, |
| "grad_norm": 2.0145726203918457, |
| "learning_rate": 4.774517180837723e-05, |
| "loss": 6.8885, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.045146726862302484, |
| "eval_loss": 6.890288352966309, |
| "eval_runtime": 94.637, |
| "eval_samples_per_second": 187.865, |
| "eval_steps_per_second": 23.49, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.05016302984700276, |
| "grad_norm": 1.4140468835830688, |
| "learning_rate": 4.749435665914221e-05, |
| "loss": 6.8442, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05016302984700276, |
| "eval_loss": 6.846121788024902, |
| "eval_runtime": 98.5415, |
| "eval_samples_per_second": 180.421, |
| "eval_steps_per_second": 22.559, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05517933283170304, |
| "grad_norm": 2.1040637493133545, |
| "learning_rate": 4.72435415099072e-05, |
| "loss": 6.7646, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.05517933283170304, |
| "eval_loss": 6.808917999267578, |
| "eval_runtime": 95.1845, |
| "eval_samples_per_second": 186.785, |
| "eval_steps_per_second": 23.355, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.06019563581640331, |
| "grad_norm": 1.9216736555099487, |
| "learning_rate": 4.6992726360672185e-05, |
| "loss": 6.7815, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.06019563581640331, |
| "eval_loss": 6.7738037109375, |
| "eval_runtime": 94.6149, |
| "eval_samples_per_second": 187.909, |
| "eval_steps_per_second": 23.495, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.06521193880110358, |
| "grad_norm": 1.5636332035064697, |
| "learning_rate": 4.6741911211437175e-05, |
| "loss": 6.7469, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.06521193880110358, |
| "eval_loss": 6.739809036254883, |
| "eval_runtime": 94.7876, |
| "eval_samples_per_second": 187.567, |
| "eval_steps_per_second": 23.452, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.07022824178580386, |
| "grad_norm": 1.5623961687088013, |
| "learning_rate": 4.649109606220216e-05, |
| "loss": 6.7202, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.07022824178580386, |
| "eval_loss": 6.719963550567627, |
| "eval_runtime": 94.3193, |
| "eval_samples_per_second": 188.498, |
| "eval_steps_per_second": 23.569, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.07524454477050414, |
| "grad_norm": 1.5444605350494385, |
| "learning_rate": 4.624028091296714e-05, |
| "loss": 6.689, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.07524454477050414, |
| "eval_loss": 6.685614109039307, |
| "eval_runtime": 94.1801, |
| "eval_samples_per_second": 188.777, |
| "eval_steps_per_second": 23.604, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.08026084775520441, |
| "grad_norm": 2.0542852878570557, |
| "learning_rate": 4.598946576373213e-05, |
| "loss": 6.6448, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.08026084775520441, |
| "eval_loss": 6.665727615356445, |
| "eval_runtime": 94.8119, |
| "eval_samples_per_second": 187.519, |
| "eval_steps_per_second": 23.446, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.08527715073990469, |
| "grad_norm": 1.7234691381454468, |
| "learning_rate": 4.573865061449712e-05, |
| "loss": 6.6167, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.08527715073990469, |
| "eval_loss": 6.640410423278809, |
| "eval_runtime": 94.192, |
| "eval_samples_per_second": 188.753, |
| "eval_steps_per_second": 23.601, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.09029345372460497, |
| "grad_norm": 1.8299592733383179, |
| "learning_rate": 4.5487835465262104e-05, |
| "loss": 6.6109, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.09029345372460497, |
| "eval_loss": 6.620120525360107, |
| "eval_runtime": 93.7995, |
| "eval_samples_per_second": 189.543, |
| "eval_steps_per_second": 23.699, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.09530975670930525, |
| "grad_norm": 3.1380774974823, |
| "learning_rate": 4.523702031602709e-05, |
| "loss": 6.6401, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.09530975670930525, |
| "eval_loss": 6.596529006958008, |
| "eval_runtime": 94.29, |
| "eval_samples_per_second": 188.557, |
| "eval_steps_per_second": 23.576, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.10032605969400551, |
| "grad_norm": 2.301722526550293, |
| "learning_rate": 4.4986205166792077e-05, |
| "loss": 6.5718, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10032605969400551, |
| "eval_loss": 6.580358982086182, |
| "eval_runtime": 94.8156, |
| "eval_samples_per_second": 187.511, |
| "eval_steps_per_second": 23.445, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1053423626787058, |
| "grad_norm": 2.1571052074432373, |
| "learning_rate": 4.473539001755706e-05, |
| "loss": 6.5723, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.1053423626787058, |
| "eval_loss": 6.559490203857422, |
| "eval_runtime": 93.9805, |
| "eval_samples_per_second": 189.177, |
| "eval_steps_per_second": 23.654, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.11035866566340607, |
| "grad_norm": 1.8901225328445435, |
| "learning_rate": 4.448457486832205e-05, |
| "loss": 6.5405, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.11035866566340607, |
| "eval_loss": 6.536637783050537, |
| "eval_runtime": 95.7913, |
| "eval_samples_per_second": 185.601, |
| "eval_steps_per_second": 23.207, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.11537496864810634, |
| "grad_norm": 1.9951658248901367, |
| "learning_rate": 4.423375971908704e-05, |
| "loss": 6.5143, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.11537496864810634, |
| "eval_loss": 6.518816947937012, |
| "eval_runtime": 94.7754, |
| "eval_samples_per_second": 187.591, |
| "eval_steps_per_second": 23.455, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.12039127163280662, |
| "grad_norm": 1.5648778676986694, |
| "learning_rate": 4.398294456985202e-05, |
| "loss": 6.5145, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.12039127163280662, |
| "eval_loss": 6.496397018432617, |
| "eval_runtime": 119.6751, |
| "eval_samples_per_second": 148.561, |
| "eval_steps_per_second": 18.575, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.1254075746175069, |
| "grad_norm": 1.6384185552597046, |
| "learning_rate": 4.3732129420617006e-05, |
| "loss": 6.4635, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1254075746175069, |
| "eval_loss": 6.48058557510376, |
| "eval_runtime": 116.6468, |
| "eval_samples_per_second": 152.417, |
| "eval_steps_per_second": 19.058, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.13042387760220717, |
| "grad_norm": 2.172386884689331, |
| "learning_rate": 4.3481314271381995e-05, |
| "loss": 6.4973, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.13042387760220717, |
| "eval_loss": 6.466433048248291, |
| "eval_runtime": 98.1503, |
| "eval_samples_per_second": 181.141, |
| "eval_steps_per_second": 22.649, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.13544018058690746, |
| "grad_norm": 1.9169673919677734, |
| "learning_rate": 4.323049912214698e-05, |
| "loss": 6.475, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.13544018058690746, |
| "eval_loss": 6.4444684982299805, |
| "eval_runtime": 94.3642, |
| "eval_samples_per_second": 188.408, |
| "eval_steps_per_second": 23.558, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.14045648357160773, |
| "grad_norm": 1.8222382068634033, |
| "learning_rate": 4.297968397291197e-05, |
| "loss": 6.4216, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.14045648357160773, |
| "eval_loss": 6.434403896331787, |
| "eval_runtime": 96.3492, |
| "eval_samples_per_second": 184.527, |
| "eval_steps_per_second": 23.072, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.145472786556308, |
| "grad_norm": 2.0480902194976807, |
| "learning_rate": 4.272886882367695e-05, |
| "loss": 6.4062, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.145472786556308, |
| "eval_loss": 6.415233135223389, |
| "eval_runtime": 94.0684, |
| "eval_samples_per_second": 189.001, |
| "eval_steps_per_second": 23.632, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.1504890895410083, |
| "grad_norm": 1.965072512626648, |
| "learning_rate": 4.2478053674441935e-05, |
| "loss": 6.3655, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1504890895410083, |
| "eval_loss": 6.401170253753662, |
| "eval_runtime": 94.0417, |
| "eval_samples_per_second": 189.054, |
| "eval_steps_per_second": 23.638, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15550539252570855, |
| "grad_norm": 2.16786789894104, |
| "learning_rate": 4.2227238525206924e-05, |
| "loss": 6.4038, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.15550539252570855, |
| "eval_loss": 6.388797283172607, |
| "eval_runtime": 93.9464, |
| "eval_samples_per_second": 189.246, |
| "eval_steps_per_second": 23.662, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.16052169551040882, |
| "grad_norm": 2.5082712173461914, |
| "learning_rate": 4.1976423375971914e-05, |
| "loss": 6.3553, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.16052169551040882, |
| "eval_loss": 6.368188858032227, |
| "eval_runtime": 94.1089, |
| "eval_samples_per_second": 188.919, |
| "eval_steps_per_second": 23.622, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.1655379984951091, |
| "grad_norm": 2.0116617679595947, |
| "learning_rate": 4.17256082267369e-05, |
| "loss": 6.3573, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.1655379984951091, |
| "eval_loss": 6.35645866394043, |
| "eval_runtime": 94.2829, |
| "eval_samples_per_second": 188.571, |
| "eval_steps_per_second": 23.578, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.17055430147980938, |
| "grad_norm": 2.309736728668213, |
| "learning_rate": 4.147479307750188e-05, |
| "loss": 6.3458, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.17055430147980938, |
| "eval_loss": 6.339991092681885, |
| "eval_runtime": 94.4661, |
| "eval_samples_per_second": 188.205, |
| "eval_steps_per_second": 23.532, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.17557060446450964, |
| "grad_norm": 1.963045597076416, |
| "learning_rate": 4.122397792826687e-05, |
| "loss": 6.3157, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.17557060446450964, |
| "eval_loss": 6.325737476348877, |
| "eval_runtime": 94.2601, |
| "eval_samples_per_second": 188.616, |
| "eval_steps_per_second": 23.584, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.18058690744920994, |
| "grad_norm": 2.3348584175109863, |
| "learning_rate": 4.0973162779031853e-05, |
| "loss": 6.3232, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.18058690744920994, |
| "eval_loss": 6.314403533935547, |
| "eval_runtime": 94.6269, |
| "eval_samples_per_second": 187.885, |
| "eval_steps_per_second": 23.492, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.1856032104339102, |
| "grad_norm": 1.7809332609176636, |
| "learning_rate": 4.072234762979684e-05, |
| "loss": 6.2809, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.1856032104339102, |
| "eval_loss": 6.2983903884887695, |
| "eval_runtime": 94.2727, |
| "eval_samples_per_second": 188.591, |
| "eval_steps_per_second": 23.581, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.1906195134186105, |
| "grad_norm": 2.0216691493988037, |
| "learning_rate": 4.047153248056183e-05, |
| "loss": 6.2558, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.1906195134186105, |
| "eval_loss": 6.28033971786499, |
| "eval_runtime": 94.2314, |
| "eval_samples_per_second": 188.674, |
| "eval_steps_per_second": 23.591, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.19563581640331076, |
| "grad_norm": 2.2930386066436768, |
| "learning_rate": 4.022071733132681e-05, |
| "loss": 6.2869, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.19563581640331076, |
| "eval_loss": 6.2675676345825195, |
| "eval_runtime": 94.5425, |
| "eval_samples_per_second": 188.053, |
| "eval_steps_per_second": 23.513, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.20065211938801103, |
| "grad_norm": 2.321624755859375, |
| "learning_rate": 3.99699021820918e-05, |
| "loss": 6.2825, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20065211938801103, |
| "eval_loss": 6.2553181648254395, |
| "eval_runtime": 94.2273, |
| "eval_samples_per_second": 188.682, |
| "eval_steps_per_second": 23.592, |
| "step": 4000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 19935, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 929726201856000.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|