| { |
| "best_metric": 1.9750508069992065, |
| "best_model_checkpoint": "./gemma-python/checkpoint-40", |
| "epoch": 5.0, |
| "eval_steps": 2, |
| "global_step": 40, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.12, |
| "grad_norm": 40.636978402335416, |
| "learning_rate": 0.0001, |
| "loss": 19.0016, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_loss": 18.6992130279541, |
| "eval_runtime": 2.881, |
| "eval_samples_per_second": 7.289, |
| "eval_steps_per_second": 1.041, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 41.61053527062362, |
| "learning_rate": 0.0002, |
| "loss": 19.4686, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.25, |
| "eval_loss": 16.257802963256836, |
| "eval_runtime": 2.9111, |
| "eval_samples_per_second": 7.214, |
| "eval_steps_per_second": 1.031, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 28.704819713850974, |
| "learning_rate": 0.00019991889981715698, |
| "loss": 13.2303, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 26.40444243073739, |
| "learning_rate": 0.00019967573081342103, |
| "loss": 11.468, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 8.28911018371582, |
| "eval_runtime": 2.9257, |
| "eval_samples_per_second": 7.178, |
| "eval_steps_per_second": 1.025, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 12.912981323843146, |
| "learning_rate": 0.0001992708874098054, |
| "loss": 9.3107, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 7.943058500648636, |
| "learning_rate": 0.00019870502626379127, |
| "loss": 7.5305, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.75, |
| "eval_loss": 5.884701728820801, |
| "eval_runtime": 2.9479, |
| "eval_samples_per_second": 7.124, |
| "eval_steps_per_second": 1.018, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 6.267657551985817, |
| "learning_rate": 0.00019797906520422677, |
| "loss": 6.6492, |
| "step": 7 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 5.0825555341832365, |
| "learning_rate": 0.0001970941817426052, |
| "loss": 5.7572, |
| "step": 8 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 4.363473892211914, |
| "eval_runtime": 2.9653, |
| "eval_samples_per_second": 7.082, |
| "eval_steps_per_second": 1.012, |
| "step": 8 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 4.88565620317727, |
| "learning_rate": 0.00019605181116313724, |
| "loss": 4.5414, |
| "step": 9 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 5.0847008955317605, |
| "learning_rate": 0.00019485364419471454, |
| "loss": 4.3903, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.25, |
| "eval_loss": 3.284867763519287, |
| "eval_runtime": 2.9746, |
| "eval_samples_per_second": 7.06, |
| "eval_steps_per_second": 1.009, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 3.424587898800574, |
| "learning_rate": 0.0001935016242685415, |
| "loss": 3.79, |
| "step": 11 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 2.7255824385278506, |
| "learning_rate": 0.00019199794436588243, |
| "loss": 2.9497, |
| "step": 12 |
| }, |
| { |
| "epoch": 1.5, |
| "eval_loss": 2.853942394256592, |
| "eval_runtime": 2.9866, |
| "eval_samples_per_second": 7.031, |
| "eval_steps_per_second": 1.004, |
| "step": 12 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 2.1001906898750624, |
| "learning_rate": 0.00019034504346103823, |
| "loss": 2.7728, |
| "step": 13 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 1.9200021565941778, |
| "learning_rate": 0.000188545602565321, |
| "loss": 2.8738, |
| "step": 14 |
| }, |
| { |
| "epoch": 1.75, |
| "eval_loss": 2.62028431892395, |
| "eval_runtime": 2.9982, |
| "eval_samples_per_second": 7.004, |
| "eval_steps_per_second": 1.001, |
| "step": 14 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 1.8837224890225774, |
| "learning_rate": 0.00018660254037844388, |
| "loss": 3.0787, |
| "step": 15 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.8929687978608318, |
| "learning_rate": 0.0001845190085543795, |
| "loss": 2.7298, |
| "step": 16 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 2.453444242477417, |
| "eval_runtime": 2.9964, |
| "eval_samples_per_second": 7.008, |
| "eval_steps_per_second": 1.001, |
| "step": 16 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 1.3652069569291694, |
| "learning_rate": 0.00018229838658936564, |
| "loss": 2.5967, |
| "step": 17 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 2.4263600812149417, |
| "learning_rate": 0.00017994427634035015, |
| "loss": 2.4284, |
| "step": 18 |
| }, |
| { |
| "epoch": 2.25, |
| "eval_loss": 2.307706832885742, |
| "eval_runtime": 2.9963, |
| "eval_samples_per_second": 7.009, |
| "eval_steps_per_second": 1.001, |
| "step": 18 |
| }, |
| { |
| "epoch": 2.38, |
| "grad_norm": 2.5673391658400053, |
| "learning_rate": 0.00017746049618276545, |
| "loss": 2.6721, |
| "step": 19 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 2.2252437500899656, |
| "learning_rate": 0.00017485107481711012, |
| "loss": 2.394, |
| "step": 20 |
| }, |
| { |
| "epoch": 2.5, |
| "eval_loss": 2.187636137008667, |
| "eval_runtime": 2.9975, |
| "eval_samples_per_second": 7.006, |
| "eval_steps_per_second": 1.001, |
| "step": 20 |
| }, |
| { |
| "epoch": 2.62, |
| "grad_norm": 2.345233295279928, |
| "learning_rate": 0.00017212024473438147, |
| "loss": 2.3972, |
| "step": 21 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 1.1122620317353238, |
| "learning_rate": 0.00016927243535095997, |
| "loss": 2.069, |
| "step": 22 |
| }, |
| { |
| "epoch": 2.75, |
| "eval_loss": 2.1294100284576416, |
| "eval_runtime": 2.993, |
| "eval_samples_per_second": 7.016, |
| "eval_steps_per_second": 1.002, |
| "step": 22 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 2.8270209249093803, |
| "learning_rate": 0.00016631226582407952, |
| "loss": 2.211, |
| "step": 23 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 7.323169716541166, |
| "learning_rate": 0.00016324453755953773, |
| "loss": 1.9355, |
| "step": 24 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 2.1047682762145996, |
| "eval_runtime": 2.9871, |
| "eval_samples_per_second": 7.03, |
| "eval_steps_per_second": 1.004, |
| "step": 24 |
| }, |
| { |
| "epoch": 3.12, |
| "grad_norm": 1.9938311808450486, |
| "learning_rate": 0.0001600742264237979, |
| "loss": 2.1962, |
| "step": 25 |
| }, |
| { |
| "epoch": 3.25, |
| "grad_norm": 3.330986691029466, |
| "learning_rate": 0.00015680647467311557, |
| "loss": 1.9635, |
| "step": 26 |
| }, |
| { |
| "epoch": 3.25, |
| "eval_loss": 2.0707101821899414, |
| "eval_runtime": 2.9895, |
| "eval_samples_per_second": 7.025, |
| "eval_steps_per_second": 1.004, |
| "step": 26 |
| }, |
| { |
| "epoch": 3.38, |
| "grad_norm": 2.0371854480792178, |
| "learning_rate": 0.0001534465826127801, |
| "loss": 2.2319, |
| "step": 27 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 3.2163831286077653, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 2.092, |
| "step": 28 |
| }, |
| { |
| "epoch": 3.5, |
| "eval_loss": 2.059619426727295, |
| "eval_runtime": 2.9996, |
| "eval_samples_per_second": 7.001, |
| "eval_steps_per_second": 1.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 3.62, |
| "grad_norm": 2.853987323853131, |
| "learning_rate": 0.00014647231720437686, |
| "loss": 1.9182, |
| "step": 29 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 2.2997509863024352, |
| "learning_rate": 0.00014286925614030542, |
| "loss": 1.9675, |
| "step": 30 |
| }, |
| { |
| "epoch": 3.75, |
| "eval_loss": 2.0287458896636963, |
| "eval_runtime": 2.9966, |
| "eval_samples_per_second": 7.008, |
| "eval_steps_per_second": 1.001, |
| "step": 30 |
| }, |
| { |
| "epoch": 3.88, |
| "grad_norm": 2.2770679758385244, |
| "learning_rate": 0.00013919666098600753, |
| "loss": 1.9815, |
| "step": 31 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.8553765652252152, |
| "learning_rate": 0.00013546048870425356, |
| "loss": 1.9693, |
| "step": 32 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 2.022012710571289, |
| "eval_runtime": 2.9895, |
| "eval_samples_per_second": 7.025, |
| "eval_steps_per_second": 1.004, |
| "step": 32 |
| }, |
| { |
| "epoch": 4.12, |
| "grad_norm": 3.8094922067262336, |
| "learning_rate": 0.00013166679938014726, |
| "loss": 1.6479, |
| "step": 33 |
| }, |
| { |
| "epoch": 4.25, |
| "grad_norm": 3.5435911597121277, |
| "learning_rate": 0.0001278217463916453, |
| "loss": 2.0198, |
| "step": 34 |
| }, |
| { |
| "epoch": 4.25, |
| "eval_loss": 2.012432336807251, |
| "eval_runtime": 2.9987, |
| "eval_samples_per_second": 7.003, |
| "eval_steps_per_second": 1.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 4.38, |
| "grad_norm": 1.4676241516417539, |
| "learning_rate": 0.0001239315664287558, |
| "loss": 1.7496, |
| "step": 35 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 1.4772602834377506, |
| "learning_rate": 0.00012000256937760445, |
| "loss": 1.9357, |
| "step": 36 |
| }, |
| { |
| "epoch": 4.5, |
| "eval_loss": 1.9945744276046753, |
| "eval_runtime": 3.0019, |
| "eval_samples_per_second": 6.995, |
| "eval_steps_per_second": 0.999, |
| "step": 36 |
| }, |
| { |
| "epoch": 4.62, |
| "grad_norm": 0.8198622785029981, |
| "learning_rate": 0.00011604112808577603, |
| "loss": 1.8365, |
| "step": 37 |
| }, |
| { |
| "epoch": 4.75, |
| "grad_norm": 2.5267989029749556, |
| "learning_rate": 0.0001120536680255323, |
| "loss": 1.8147, |
| "step": 38 |
| }, |
| { |
| "epoch": 4.75, |
| "eval_loss": 1.9979486465454102, |
| "eval_runtime": 2.9865, |
| "eval_samples_per_second": 7.032, |
| "eval_steps_per_second": 1.005, |
| "step": 38 |
| }, |
| { |
| "epoch": 4.88, |
| "grad_norm": 1.2889515222114942, |
| "learning_rate": 0.00010804665687167262, |
| "loss": 1.6703, |
| "step": 39 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.3474067788797102, |
| "learning_rate": 0.00010402659401094152, |
| "loss": 1.9084, |
| "step": 40 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 1.9750508069992065, |
| "eval_runtime": 2.9945, |
| "eval_samples_per_second": 7.013, |
| "eval_steps_per_second": 1.002, |
| "step": 40 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 80, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 8, |
| "total_flos": 1.8523438033403904e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|