Buckets:
| { | |
| "best_global_step": 2000, | |
| "best_metric": 0.5165990591049194, | |
| "best_model_checkpoint": "cache/tiny/checkpoints_v2/checkpoint-2000", | |
| "epoch": 10.309278350515465, | |
| "eval_steps": 200, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.25773195876288657, | |
| "grad_norm": 2.0141708850860596, | |
| "learning_rate": 9.800000000000001e-06, | |
| "loss": 3.7401187133789064, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5154639175257731, | |
| "grad_norm": 2.9495837688446045, | |
| "learning_rate": 1.9800000000000004e-05, | |
| "loss": 3.4125726318359373, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.7731958762886598, | |
| "grad_norm": 4.327129364013672, | |
| "learning_rate": 2.98e-05, | |
| "loss": 3.0153933715820314, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.0309278350515463, | |
| "grad_norm": 3.4784326553344727, | |
| "learning_rate": 3.9800000000000005e-05, | |
| "loss": 2.6601983642578126, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0309278350515463, | |
| "eval_loss": 3.318650245666504, | |
| "eval_runtime": 1.0667, | |
| "eval_samples_per_second": 959.963, | |
| "eval_steps_per_second": 0.937, | |
| "eval_teacher_cosine_loss": 0.8297808170318604, | |
| "eval_teacher_cosine_mean": 0.17021918296813965, | |
| "eval_teacher_cosine_p10": 0.09951353073120117, | |
| "eval_teacher_cosine_p50": 0.17211058735847473, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.2886597938144329, | |
| "grad_norm": 5.863973617553711, | |
| "learning_rate": 4.9800000000000004e-05, | |
| "loss": 2.333711853027344, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.5463917525773194, | |
| "grad_norm": 7.006694793701172, | |
| "learning_rate": 5.9800000000000003e-05, | |
| "loss": 2.122469024658203, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.8041237113402062, | |
| "grad_norm": 5.193179607391357, | |
| "learning_rate": 6.98e-05, | |
| "loss": 1.9349069213867187, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.0618556701030926, | |
| "grad_norm": 3.973529577255249, | |
| "learning_rate": 7.98e-05, | |
| "loss": 1.7354617309570313, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.0618556701030926, | |
| "eval_loss": 2.6581077575683594, | |
| "eval_runtime": 0.7736, | |
| "eval_samples_per_second": 1323.646, | |
| "eval_steps_per_second": 1.293, | |
| "eval_teacher_cosine_loss": 0.7238675355911255, | |
| "eval_teacher_cosine_mean": 0.2761324942111969, | |
| "eval_teacher_cosine_p10": 0.18719647824764252, | |
| "eval_teacher_cosine_p50": 0.27989375591278076, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.319587628865979, | |
| "grad_norm": 4.733378887176514, | |
| "learning_rate": 8.98e-05, | |
| "loss": 1.4927114868164062, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.5773195876288657, | |
| "grad_norm": 4.523077964782715, | |
| "learning_rate": 9.98e-05, | |
| "loss": 1.404100341796875, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.8350515463917527, | |
| "grad_norm": 3.3091869354248047, | |
| "learning_rate": 9.731947483588622e-05, | |
| "loss": 1.3087448120117187, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.0927835051546393, | |
| "grad_norm": 3.69515323638916, | |
| "learning_rate": 9.458424507658644e-05, | |
| "loss": 1.1459870910644532, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.0927835051546393, | |
| "eval_loss": 2.1926748752593994, | |
| "eval_runtime": 0.7824, | |
| "eval_samples_per_second": 1308.767, | |
| "eval_steps_per_second": 1.278, | |
| "eval_teacher_cosine_loss": 0.6389099359512329, | |
| "eval_teacher_cosine_mean": 0.3610900640487671, | |
| "eval_teacher_cosine_p10": 0.2641688585281372, | |
| "eval_teacher_cosine_p50": 0.3655257821083069, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.350515463917526, | |
| "grad_norm": 3.519090175628662, | |
| "learning_rate": 9.184901531728666e-05, | |
| "loss": 0.9953593444824219, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.6082474226804124, | |
| "grad_norm": 3.035456657409668, | |
| "learning_rate": 8.911378555798688e-05, | |
| "loss": 0.9687478637695313, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.865979381443299, | |
| "grad_norm": 2.8461368083953857, | |
| "learning_rate": 8.63785557986871e-05, | |
| "loss": 0.9490547180175781, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.123711340206185, | |
| "grad_norm": 2.469802141189575, | |
| "learning_rate": 8.364332603938731e-05, | |
| "loss": 0.8209384155273437, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.123711340206185, | |
| "eval_loss": 1.9687132835388184, | |
| "eval_runtime": 0.7809, | |
| "eval_samples_per_second": 1311.356, | |
| "eval_steps_per_second": 1.281, | |
| "eval_teacher_cosine_loss": 0.5924775004386902, | |
| "eval_teacher_cosine_mean": 0.4075224995613098, | |
| "eval_teacher_cosine_p10": 0.3042549788951874, | |
| "eval_teacher_cosine_p50": 0.41277939081192017, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.381443298969073, | |
| "grad_norm": 2.479365587234497, | |
| "learning_rate": 8.090809628008753e-05, | |
| "loss": 0.7298025512695312, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.639175257731958, | |
| "grad_norm": 3.0340750217437744, | |
| "learning_rate": 7.817286652078775e-05, | |
| "loss": 0.7384292602539062, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.896907216494846, | |
| "grad_norm": 2.370948076248169, | |
| "learning_rate": 7.543763676148797e-05, | |
| "loss": 0.7220243072509765, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 5.154639175257732, | |
| "grad_norm": 2.102903366088867, | |
| "learning_rate": 7.270240700218819e-05, | |
| "loss": 0.6203727722167969, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.154639175257732, | |
| "eval_loss": 1.8500784635543823, | |
| "eval_runtime": 0.7743, | |
| "eval_samples_per_second": 1322.405, | |
| "eval_steps_per_second": 1.291, | |
| "eval_teacher_cosine_loss": 0.558039128780365, | |
| "eval_teacher_cosine_mean": 0.441960871219635, | |
| "eval_teacher_cosine_p10": 0.33395877480506897, | |
| "eval_teacher_cosine_p50": 0.4483190178871155, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.412371134020619, | |
| "grad_norm": 2.429229974746704, | |
| "learning_rate": 6.99671772428884e-05, | |
| "loss": 0.5804964828491211, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.670103092783505, | |
| "grad_norm": 2.1712899208068848, | |
| "learning_rate": 6.723194748358862e-05, | |
| "loss": 0.5780911254882812, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.927835051546392, | |
| "grad_norm": 2.242732286453247, | |
| "learning_rate": 6.449671772428884e-05, | |
| "loss": 0.5856694030761719, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 6.185567010309279, | |
| "grad_norm": 1.761767864227295, | |
| "learning_rate": 6.176148796498906e-05, | |
| "loss": 0.49521896362304685, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.185567010309279, | |
| "eval_loss": 1.7896480560302734, | |
| "eval_runtime": 0.7721, | |
| "eval_samples_per_second": 1326.333, | |
| "eval_steps_per_second": 1.295, | |
| "eval_teacher_cosine_loss": 0.5402054786682129, | |
| "eval_teacher_cosine_mean": 0.4597945213317871, | |
| "eval_teacher_cosine_p10": 0.35114195942878723, | |
| "eval_teacher_cosine_p50": 0.46680837869644165, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.443298969072165, | |
| "grad_norm": 1.6649208068847656, | |
| "learning_rate": 5.902625820568928e-05, | |
| "loss": 0.47256912231445314, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 6.701030927835052, | |
| "grad_norm": 2.0650441646575928, | |
| "learning_rate": 5.6291028446389504e-05, | |
| "loss": 0.48461719512939455, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.958762886597938, | |
| "grad_norm": 1.870469331741333, | |
| "learning_rate": 5.355579868708972e-05, | |
| "loss": 0.4868436050415039, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 7.216494845360825, | |
| "grad_norm": 1.441678524017334, | |
| "learning_rate": 5.0820568927789934e-05, | |
| "loss": 0.4157650756835938, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.216494845360825, | |
| "eval_loss": 1.7518922090530396, | |
| "eval_runtime": 0.7752, | |
| "eval_samples_per_second": 1321.009, | |
| "eval_steps_per_second": 1.29, | |
| "eval_teacher_cosine_loss": 0.52263343334198, | |
| "eval_teacher_cosine_mean": 0.47736653685569763, | |
| "eval_teacher_cosine_p10": 0.36584946513175964, | |
| "eval_teacher_cosine_p50": 0.4856162667274475, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.474226804123711, | |
| "grad_norm": 1.5953190326690674, | |
| "learning_rate": 4.808533916849016e-05, | |
| "loss": 0.4144211959838867, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 7.731958762886598, | |
| "grad_norm": 1.5992157459259033, | |
| "learning_rate": 4.535010940919037e-05, | |
| "loss": 0.41783378601074217, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.989690721649485, | |
| "grad_norm": 1.3128303289413452, | |
| "learning_rate": 4.2614879649890596e-05, | |
| "loss": 0.4216469192504883, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 8.24742268041237, | |
| "grad_norm": 1.2639213800430298, | |
| "learning_rate": 3.987964989059081e-05, | |
| "loss": 0.3638432312011719, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.24742268041237, | |
| "eval_loss": 1.6979527473449707, | |
| "eval_runtime": 0.7815, | |
| "eval_samples_per_second": 1310.248, | |
| "eval_steps_per_second": 1.28, | |
| "eval_teacher_cosine_loss": 0.5083520412445068, | |
| "eval_teacher_cosine_mean": 0.4916479289531708, | |
| "eval_teacher_cosine_p10": 0.37511730194091797, | |
| "eval_teacher_cosine_p50": 0.5001251697540283, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.505154639175258, | |
| "grad_norm": 1.3072861433029175, | |
| "learning_rate": 3.714442013129103e-05, | |
| "loss": 0.3709882736206055, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 8.762886597938145, | |
| "grad_norm": 1.325158953666687, | |
| "learning_rate": 3.440919037199125e-05, | |
| "loss": 0.37377830505371096, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 9.02061855670103, | |
| "grad_norm": 1.1044729948043823, | |
| "learning_rate": 3.167396061269147e-05, | |
| "loss": 0.37382389068603517, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 9.278350515463918, | |
| "grad_norm": 1.2414360046386719, | |
| "learning_rate": 2.8938730853391688e-05, | |
| "loss": 0.33761756896972656, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 9.278350515463918, | |
| "eval_loss": 1.6755480766296387, | |
| "eval_runtime": 0.7738, | |
| "eval_samples_per_second": 1323.261, | |
| "eval_steps_per_second": 1.292, | |
| "eval_teacher_cosine_loss": 0.495100736618042, | |
| "eval_teacher_cosine_mean": 0.504899263381958, | |
| "eval_teacher_cosine_p10": 0.3864120841026306, | |
| "eval_teacher_cosine_p50": 0.5146739482879639, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 9.536082474226804, | |
| "grad_norm": 0.997848629951477, | |
| "learning_rate": 2.6203501094091903e-05, | |
| "loss": 0.3416647720336914, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 9.793814432989691, | |
| "grad_norm": 1.1428130865097046, | |
| "learning_rate": 2.3468271334792125e-05, | |
| "loss": 0.34493419647216794, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 10.051546391752577, | |
| "grad_norm": 0.9203411340713501, | |
| "learning_rate": 2.0733041575492343e-05, | |
| "loss": 0.3403450012207031, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 10.309278350515465, | |
| "grad_norm": 0.9985235929489136, | |
| "learning_rate": 1.799781181619256e-05, | |
| "loss": 0.3205509948730469, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 10.309278350515465, | |
| "eval_loss": 1.6462994813919067, | |
| "eval_runtime": 0.7834, | |
| "eval_samples_per_second": 1307.169, | |
| "eval_steps_per_second": 1.277, | |
| "eval_teacher_cosine_loss": 0.48340094089508057, | |
| "eval_teacher_cosine_mean": 0.5165990591049194, | |
| "eval_teacher_cosine_p10": 0.4014687240123749, | |
| "eval_teacher_cosine_p50": 0.5252096652984619, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 2328, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 12, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 512, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
Xet Storage Details
- Size:
- 12.5 kB
- Xet hash:
- 8dff1e5fc2880047c06c8611f554f48c4d36bd9bb8bd9019087ab4b64fb1baac
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.